diff --git a/sys/conf/files b/sys/conf/files
index f5217db..a9ead10 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2009,6 +2009,7 @@ kern/kern_poll.c	optional device_polling
 kern/kern_priv.c	standard
 kern/kern_proc.c	standard
 kern/kern_prot.c	standard
+kern/kern_rangelock.c	standard
 kern/kern_resource.c	standard
 kern/kern_rmlock.c	standard
 kern/kern_rwlock.c	standard
@@ -2696,6 +2697,7 @@ vm/vm_page.c	standard
 vm/vm_pageout.c	standard
 vm/vm_pager.c	standard
 vm/vm_phys.c	standard
+vm/vm_readwrite.c	standard
 vm/vm_reserv.c	standard
 vm/vm_unix.c	standard
 vm/vm_zeroidle.c	standard
diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c
new file mode 100644
index 0000000..00e5f4a
--- /dev/null
+++ b/sys/kern/kern_rangelock.c
@@ -0,0 +1,166 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov
+ * All rights reserved.
+ *
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+
+uma_zone_t rl_entry_zone;
+
+static void
+rangelock_sys_init(void)
+{
+
+	rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, rangelock_sys_init, NULL);
+
+void
+rangelock_init(struct rangelock *lock)
+{
+
+	TAILQ_INIT(&lock->rl_waiters);
+	lock->rl_currdep = NULL;
+}
+
+void
+rangelock_destroy(struct rangelock *lock)
+{
+
+	KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+}
+
+static int
+rangelock_incompatible(const struct rl_q_entry *e1,
+    const struct rl_q_entry *e2)
+{
+
+	if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ &&
+	    (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ)
+		return (0);
+#define	IN_RANGE(a, e) (a >= e->rl_q_start && a < e->rl_q_end)
+	if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) ||
+	    IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1))
+		return (1);
+#undef	IN_RANGE
+	return (0);
+}
+
+static void
+rangelock_calc_block(struct rangelock *lock)
+{
+	struct rl_q_entry *entry, *entry1, *whead;
+
+	if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) &&
+	    lock->rl_currdep != NULL)
+		lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link);
+	for (entry = lock->rl_currdep; entry;
+	    entry = TAILQ_NEXT(entry, rl_q_link)) {
+		TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) {
+			if (rangelock_incompatible(entry, entry1))
+				goto out;
+			if (entry1 == entry)
+				break;
+		}
+	}
+out:
+	lock->rl_currdep = entry;
+	TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) {
+		if (whead == lock->rl_currdep)
+			break;
+		if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) {
+			whead->rl_q_flags |= RL_LOCK_GRANTED;
+			wakeup(whead);
+		}
+	}
+}
+
+static void
+rangelock_unlock_vp_locked(struct vnode *vp, struct rl_q_entry *entry)
+{
+
+	ASSERT_VI_LOCKED(vp, "rangelock");
+	KASSERT(entry != vp->v_rl.rl_currdep, ("stuck currdep"));
+	TAILQ_REMOVE(&vp->v_rl.rl_waiters, entry, rl_q_link);
+	rangelock_calc_block(&vp->v_rl);
+	VI_UNLOCK(vp);
+	uma_zfree(rl_entry_zone, entry);
+}
+
+void
+rangelock_unlock(struct vnode *vp, void *cookie)
+{
+	struct rl_q_entry *entry;
+
+	entry = cookie;
+	VI_LOCK(vp);
+	rangelock_unlock_vp_locked(vp, entry);
+}
+
+void *
+rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, size_t len)
+{
+	struct rl_q_entry *entry;
+
+	entry = cookie;
+	VI_LOCK(vp);
+	KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("XXX"));
+	KASSERT(entry->rl_q_start == base, ("XXX"));
+	KASSERT(entry->rl_q_end >= base + len, ("XXX"));
+	if (entry->rl_q_end == base + len) {
+		rangelock_unlock_vp_locked(vp, cookie);
+		return (NULL);
+	}
+	entry->rl_q_end = base + len;
+	rangelock_calc_block(&vp->v_rl);
+	VI_UNLOCK(vp);
+	return (cookie);
+}
+
+static void *
+rangelock_enqueue(struct vnode *vp, struct rl_q_entry *entry)
+{
+
+	VI_LOCK(vp);
+	TAILQ_INSERT_TAIL(&vp->v_rl.rl_waiters, entry, rl_q_link);
+	if (vp->v_rl.rl_currdep == NULL)
+		vp->v_rl.rl_currdep = entry;
+	rangelock_calc_block(&vp->v_rl);
+	while (!(entry->rl_q_flags & RL_LOCK_GRANTED))
+		msleep(entry, &vp->v_interlock, 0, "range", 0);
+	VI_UNLOCK(vp);
+	return (entry);
+}
+
+void *
+rangelock_rlock(struct vnode *vp, off_t base, size_t len)
+{
+	struct rl_q_entry *entry;
+
+	entry = uma_zalloc(rl_entry_zone, M_WAITOK);
+	entry->rl_q_flags = RL_LOCK_READ;
+	entry->rl_q_start = base;
+	entry->rl_q_end = base + len;
+	return (rangelock_enqueue(vp, entry));
+}
+
+void *
+rangelock_wlock(struct vnode *vp, off_t base, size_t len)
+{
+	struct rl_q_entry *entry;
+
+	entry = uma_zalloc(rl_entry_zone, M_WAITOK);
+	entry->rl_q_flags = RL_LOCK_WRITE;
+	entry->rl_q_start = base;
+	entry->rl_q_end = base + len;
+	return (rangelock_enqueue(vp, entry));
+}
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index d92555f..8aa145f 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -853,6 +853,7 @@ vdestroy(struct vnode *vp)
 	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
+	rangelock_destroy(&vp->v_rl);
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	mtx_destroy(BO_MTX(bo));
@@ -1007,6 +1008,7 @@ alloc:
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
+	rangelock_init(&vp->v_rl);
 	*vpp = vp;
 	return (0);
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 03e8d93..6618bc8 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -62,6 +62,9 @@ __FBSDID("$FreeBSD$");
 #include
+#include
+#include
+
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
 static fo_truncate_t	vn_truncate;
@@ -351,74 +354,69 @@ sequential_heuristic(struct uio *uio, struct file *fp)
 /*
  * Package up an I/O request on a vnode into a uio and do it.
*/ int -vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, - aresid, td) - enum uio_rw rw; - struct vnode *vp; - void *base; - int len; - off_t offset; - enum uio_seg segflg; - int ioflg; - struct ucred *active_cred; - struct ucred *file_cred; - int *aresid; - struct thread *td; +vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, + enum uio_seg segflg, int ioflg, struct ucred *active_cred, + struct ucred *file_cred, int *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; + void *rl_cookie; int error, lock_flags; VFS_ASSERT_GIANT(vp->v_mount); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + error = 0; + + if ((ioflg & IO_NODELOCKED) == 0) { + if (rw == UIO_READ) + rl_cookie = rangelock_rlock(vp, offset, len); + else + rl_cookie = rangelock_wlock(vp, offset, len); + } else + rl_cookie = NULL; + if ((ioflg & IO_NODELOCKED) == 0) { mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - return (error); + goto out; if (MNT_SHARED_WRITES(mp) || - ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) { + ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) lock_flags = LK_SHARED; - } else { + else lock_flags = LK_EXCLUSIVE; - } vn_lock(vp, lock_flags | LK_RETRY); - } else - vn_lock(vp, LK_SHARED | LK_RETRY); - + } } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - aiov.iov_base = base; - aiov.iov_len = len; - auio.uio_resid = len; - auio.uio_offset = offset; - auio.uio_segflg = segflg; - auio.uio_rw = rw; - auio.uio_td = td; - error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { - if (rw == UIO_READ) - error = mac_vnode_check_read(active_cred, file_cred, - vp); - else + if (rw == UIO_WRITE) error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { - if (file_cred) + if (file_cred != NULL) cred = file_cred; else cred = active_cred; if (rw == UIO_READ) - error = VOP_READ(vp, &auio, ioflg, cred); + error = vn_read_chunk(vp, &auio, active_cred, cred, + ioflg); else error = VOP_WRITE(vp, &auio, ioflg, cred); } @@ -428,10 +426,15 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { - if (rw == UIO_WRITE && vp->v_type != VCHR) - vn_finished_write(mp); - VOP_UNLOCK(vp, 0); + if (rw == UIO_WRITE) { + if (vp->v_type != VCHR) + vn_finished_write(mp); + VOP_UNLOCK(vp, 0); + } } + out: + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); return (error); } @@ -493,126 +496,149 @@ vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, return (error); } +static struct mtx * +vn_lock_foffset(struct file *fp) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + while (fp->f_vnread_flags & FOFFSET_LOCKED) { + fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vnread offlock", 0); + } + fp->f_vnread_flags |= FOFFSET_LOCKED; + mtx_unlock(mtxp); + return (mtxp); +} + +static void +vn_unlock_foffset(struct file *fp, struct mtx *mtxp) +{ + + mtx_lock(mtxp); + if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) + wakeup(&fp->f_vnread_flags); + fp->f_vnread_flags = 0; + mtx_unlock(mtxp); +} + +int 
+vn_read_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred, + struct ucred *fcred, int ioflag) +{ + int error, vfslocked; + + error = 0; + vfslocked = 0; /* gcc */ + + if ((ioflag & IO_NODELOCKED) == 0) { + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vn_lock(vp, LK_SHARED | LK_RETRY); + } + +#ifdef MAC + if ((ioflag & IO_NOMACCHECK) == 0) + error = mac_vnode_check_read(active_cred, fcred, vp); +#endif + if (error == 0) { + error = vnode_pager_read(vp, uio, ioflag); + if (error == EOPNOTSUPP) + error = VOP_READ(vp, uio, ioflag, fcred); + } + if ((ioflag & IO_NODELOCKED) == 0) { + VOP_UNLOCK(vp, 0); + VFS_UNLOCK_GIANT(vfslocked); + } + return (error); +} + /* * File table vnode read routine. */ static int -vn_read(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; - int error, ioflag; struct mtx *mtxp; - int vfslocked; + void *rl_cookie; + int ioflag; + int error; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); - mtxp = NULL; - vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vp = fp->f_vnode; + /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ if ((flags & FOF_OFFSET) == 0) { - mtxp = mtx_pool_find(mtxpool_sleep, fp); - mtx_lock(mtxp); - while(fp->f_vnread_flags & FOFFSET_LOCKED) { - fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags, mtxp, PUSER -1, - "vnread offlock", 0); - } - fp->f_vnread_flags |= FOFFSET_LOCKED; - mtx_unlock(mtxp); - vn_lock(vp, LK_SHARED | LK_RETRY); + mtxp = vn_lock_foffset(fp); uio->uio_offset = fp->f_offset; } else - vn_lock(vp, LK_SHARED | LK_RETRY); - + mtxp = NULL; /* gcc */ + if (vp->v_type == VREG) + rl_cookie = rangelock_rlock(vp, uio->uio_offset, + uio->uio_resid); + else + rl_cookie = NULL; ioflag |= sequential_heuristic(uio, fp); - -#ifdef MAC - error = mac_vnode_check_read(active_cred, fp->f_cred, vp); - if (error == 0) -#endif - error = VOP_READ(vp, uio, ioflag, fp->f_cred); + error = vn_read_chunk(vp, uio, active_cred, fp->f_cred, ioflag); + fp->f_nextoff = uio->uio_offset; + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); if ((flags & FOF_OFFSET) == 0) { fp->f_offset = uio->uio_offset; - mtx_lock(mtxp); - if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) - wakeup(&fp->f_vnread_flags); - fp->f_vnread_flags = 0; - mtx_unlock(mtxp); + vn_unlock_foffset(fp, mtxp); } - fp->f_nextoff = uio->uio_offset; - VOP_UNLOCK(vp, 0); - VFS_UNLOCK_GIANT(vfslocked); return (error); } -/* - * File table vnode write routine. 
- */ -static int -vn_write(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +static inline int +vn_write_chunk(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, int ioflag, struct thread *td) { - struct vnode *vp; struct mount *mp; - int error, ioflag, lock_flags; - int vfslocked; + struct vnode *vp; + int error, lock_flags, vfslocked; - KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", - uio->uio_td, td)); + mp = NULL; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type == VREG) bwillwrite(); - ioflag = IO_UNIT; - if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) - ioflag |= IO_APPEND; - if (fp->f_flag & FNONBLOCK) - ioflag |= IO_NDELAY; - if (fp->f_flag & O_DIRECT) - ioflag |= IO_DIRECT; - if ((fp->f_flag & O_FSYNC) || - (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) - ioflag |= IO_SYNC; - mp = NULL; if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto unlock; if ((MNT_SHARED_WRITES(mp) || ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) && - (flags & FOF_OFFSET) != 0) { + (flags & FOF_OFFSET) != 0) lock_flags = LK_SHARED; - } else { + else lock_flags = LK_EXCLUSIVE; - } - vn_lock(vp, lock_flags | LK_RETRY); if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; ioflag |= sequential_heuristic(uio, fp); #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); - if (error == 0) +#else + error = 0; #endif - error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); + if (error == 0) { + error = vnode_pager_write(vp, uio, ioflag); + if (error == EOPNOTSUPP) + error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); + } if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; @@ -625,6 +651,48 @@ unlock: } /* + * File table vnode write routine. + */ +static int +vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) +{ + struct vnode *vp; + void *rl_cookie; + int error, ioflag; + + KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", + uio->uio_td, td)); + vp = fp->f_vnode; + ioflag = IO_UNIT; + if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) + ioflag |= IO_APPEND; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + if ((fp->f_flag & O_FSYNC) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) + ioflag |= IO_SYNC; + if (vp->v_type == VREG) { + if ((ioflag & IO_APPEND) || !(flags & FOF_OFFSET)) + /* + * For appenders, punt and lock the whole + * range. It also protects f_offset. + */ + rl_cookie = rangelock_wlock(vp, 0, (size_t)-1); + else + rl_cookie = rangelock_wlock(vp, uio->uio_offset, + uio->uio_resid); + } else + rl_cookie = NULL; + error = vn_write_chunk(fp, uio, active_cred, flags, ioflag, td); + if (rl_cookie != NULL) + rangelock_unlock(vp, rl_cookie); + return (error); +} + +/* * File table truncate routine. */ static int diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h new file mode 100644 index 0000000..5ec6433 --- /dev/null +++ b/sys/sys/rangelock.h @@ -0,0 +1,48 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. 
+ * + * $FreeBSD$ + */ + +#ifndef _SYS_RANGELOCK_H +#define _SYS_RANGELOCK_H + +#include +#include +#include +#include +#include + +#ifdef _KERNEL + +struct vnode; + +struct rl_q_entry +{ + TAILQ_ENTRY(rl_q_entry) rl_q_link; + size_t rl_q_start, rl_q_end; + int rl_q_flags; +}; + +#define RL_LOCK_READ 0x0001 +#define RL_LOCK_WRITE 0x0002 +#define RL_LOCK_TYPE_MASK 0x0003 +#define RL_LOCK_GRANTED 0x0004 + +struct rangelock +{ + TAILQ_HEAD(, rl_q_entry) rl_waiters; + struct rl_q_entry *rl_currdep; +}; + +void rangelock_init(struct rangelock *lock); +void rangelock_destroy(struct rangelock *lock); +void rangelock_unlock(struct vnode *vp, void *cookie); +void *rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, + size_t len); +void *rangelock_rlock(struct vnode *vp, off_t base, size_t len); +void *rangelock_wlock(struct vnode *vp, off_t base, size_t len); +#endif + +#endif diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index b38c1d0..2e23522 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -168,7 +169,8 @@ struct vnode { */ struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ - struct lockf *v_lockf; /* Byte-level lock list */ + struct lockf *v_lockf; /* Byte-level adv lock list */ + struct rangelock v_rl; /* Byte-range lock */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ @@ -655,6 +657,8 @@ int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); +int vn_read_chunk(struct vnode *vp, struct uio *uio, + struct ucred *active_cred, struct ucred *f_cred, int ioflag); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 464a761..4c173fe 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -463,7 +463,7 @@ ffs_read(ap) return error; } #endif - + KASSERT(uio->uio_segflg != UIO_USERSPACE, ("ffs_read: UIO_USERSPACE")); seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 08b77ae..8f537b3 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1900,7 +1900,8 @@ ufs_readdir(ap) uio->uio_iov->iov_len = count; # if (BYTE_ORDER == LITTLE_ENDIAN) if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { - error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); + error = vn_read_chunk(ap->a_vp, uio, ap->a_cred, + ap->a_cred, IO_NODELOCKED); } else { struct dirent *dp, *edp; struct uio auio; @@ -1916,7 +1917,8 @@ ufs_readdir(ap) aiov.iov_len = count; dirbuf = malloc(count, M_TEMP, M_WAITOK); aiov.iov_base = dirbuf; - error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); + error = vn_read_chunk(ap->a_vp, &auio, ap->a_cred, + ap->a_cred, IO_NODELOCKED); if (error == 0) { readcnt = count - auio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; @@ -1938,7 +1940,8 @@ ufs_readdir(ap) free(dirbuf, M_TEMP); } # else - error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); + error = vn_read_chunk(ap->a_vp, uio, ap->a_cred, + ap->a_cred, IO_NODELOCKED); # endif if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h index ff48983..ccfd066 100644 --- 
a/sys/vm/vm_extern.h +++ b/sys/vm/vm_extern.h @@ -84,5 +84,8 @@ void vm_thread_dispose(struct thread *td); int vm_thread_new(struct thread *td, int pages); void vm_thread_swapin(struct thread *td); void vm_thread_swapout(struct thread *td); +int vnode_pager_read(struct vnode *vp, struct uio *uio, int ioflags); +int vnode_pager_write(struct vnode *vp, struct uio *uio, int ioflags); + #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ diff --git a/sys/vm/vm_readwrite.c b/sys/vm/vm_readwrite.c new file mode 100644 index 0000000..cfb8bef --- /dev/null +++ b/sys/vm/vm_readwrite.c @@ -0,0 +1,341 @@ +/*- + * Copyright (c) 2008 Jeffrey Roberson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* + * XXXKIB TODO + * + * 1. Backpressure for writes + * 2. VOP_REALLOCBLKS + * + */ + +/* + * Grab a page, waiting until we are woken up due to the page + * changing state. We keep on waiting, if the page continues + * to be in the object. If the page doesn't exist allocate it. + * + * This routine may block. + */ +static vm_page_t +vm_page_grab_next(vm_object_t object, vm_page_t prev, vm_pindex_t pindex) +{ + vm_page_t m; + + VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); + m = NULL; + if (prev) { + m = TAILQ_NEXT(prev, listq); + if (m && m->pindex != pindex) + m = NULL; + } + for (;;) { + if (m == NULL) + m = vm_page_lookup(object, pindex); + if (m != NULL) { + if (vm_page_sleep_if_busy(m, TRUE, "pgrnbwt") == 0) + break; + m = NULL; + continue; + } + m = vm_page_alloc(object, pindex, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); + if (m != NULL) + break; + VM_OBJECT_UNLOCK(object); + VM_WAIT; + VM_OBJECT_LOCK(object); + } + return (m); +} + +/* + * Read a cluster starting at 'm'. 
+ */ +static int +vnode_pager_read_cluster(struct vnode *vp, vm_page_t m, int maxrun) +{ + vm_page_t pa[MAXPHYS / PAGE_SIZE]; + vm_object_t obj; + vm_pindex_t idx; + daddr_t blkno; + int bsize; + int error; + int run; + int i; + + obj = vp->v_object; + idx = m->pindex; + bsize = vp->v_mount->mnt_stat.f_iosize; + VM_OBJECT_UNLOCK(obj); + error = VOP_BMAP(vp, IDX_TO_OFF(idx)/bsize, NULL, &blkno, NULL, &run); + VM_OBJECT_LOCK(obj); + run = MIN(run, maxrun); + if (error || run == 0 || blkno == -1) + return (vm_pager_get_pages(obj, &m, 1, 0)); + run = (run + 1) * bsize / PAGE_SIZE; + run = MIN(run, vp->v_mount->mnt_iosize_max / PAGE_SIZE); + pa[0] = m; + for (i = 1; i < run; i++) { + m = vm_page_grab_next(obj, m, idx + i); + if (m->valid) { + run = i; + break; + } + vm_page_busy(m); + pa[i] = m; + } + return (vm_pager_get_pages(obj, pa, run, 0)); +} + +int +vnode_pager_read(struct vnode *vp, struct uio *uio, int ioflags) +{ + vm_object_t obj; + vm_offset_t off; + vm_pindex_t idx; + vm_page_t m; + ssize_t size; + int error, vn_locked, obj_locked; + struct thread *td; + + if (ioflags & (IO_EXT|IO_DIRECT)) + return (EOPNOTSUPP); + ASSERT_VOP_LOCKED(vp, "vnode_pager_read"); + if (vp->v_iflag & VI_DOOMED) + return (EBADF); + obj = vp->v_object; + if (obj == NULL) + return (EOPNOTSUPP); + error = 0; + m = NULL; + vn_locked = VOP_ISLOCKED(vp); + obj_locked = 0; + td = uio->uio_td; + if (td == NULL) + td = curthread; + while (uio->uio_resid > 0) { + if (!obj_locked) { + VM_OBJECT_LOCK(obj); + obj_locked = 1; + } + size = obj->un_pager.vnp.vnp_size - uio->uio_offset; + if (size <= 0) + break; + idx = OFF_TO_IDX(uio->uio_offset); + off = uio->uio_offset - IDX_TO_OFF(idx); + size = MIN(MIN(PAGE_SIZE - off, uio->uio_resid), size); + m = vm_page_grab_next(obj, m, idx); + if (!vm_page_is_valid(m, off, size)) { + vm_page_busy(m); + error = vnode_pager_read_cluster(vp, m, + howmany((ioflags >> IO_SEQSHIFT), PAGE_SIZE)); + m = vm_page_lookup(obj, idx); + if (m == NULL) { + if (error == VM_PAGER_OK) + continue; + error = EIO; + break; + } + if (m->valid == 0 || error != VM_PAGER_OK) { + vm_page_lock_queues(); + vm_page_free(m); + vm_page_unlock_queues(); + error = EIO; + break; + } + vm_page_wakeup(m); + } + vm_object_pip_add(obj, 1); + vm_page_io_start(m); + VM_OBJECT_UNLOCK(obj); + VOP_UNLOCK(vp, 0); + error = uiomove_fromphys(&m, off, size, uio); + VM_OBJECT_LOCK(obj); + vm_page_io_finish(m); + vm_object_pip_wakeup(obj); + VM_OBJECT_UNLOCK(obj); + obj_locked = 0; + vn_lock(vp, vn_locked | LK_RETRY); + if (error != 0 || (vp->v_iflag & VI_DOOMED)) + break; + } + if (obj_locked) + VM_OBJECT_UNLOCK(obj); + if (error == 0) + vfs_mark_atime(vp, td->td_ucred); + + return (error); +} + +int +vnode_pager_write(struct vnode *vp, struct uio *uio, int ioflags) +{ + vm_object_t obj; + vm_offset_t off; + vm_pindex_t idx; + vm_page_t m; + struct vattr vattr; + ssize_t size, osize, osize1, resid, sresid; + int error, vn_locked, obj_locked, bits; + struct thread *td; + + if (ioflags & (IO_EXT|IO_INVAL|IO_DIRECT)) + return (EOPNOTSUPP); + ASSERT_VOP_LOCKED(vp, "vnode_pager_write"); + if (vp->v_iflag & VI_DOOMED) + return (EBADF); + obj = vp->v_object; + if (obj == NULL) + return (EOPNOTSUPP); + error = 0; + m = NULL; + vn_locked = VOP_ISLOCKED(vp); + obj_locked = 0; + if (ioflags & IO_APPEND) + uio->uio_offset = obj->un_pager.vnp.vnp_size; + td = uio->uio_td; + if (td == NULL) + td = curthread; + if (vp->v_type == VREG) { + PROC_LOCK(td->td_proc); + if (uio->uio_offset + uio->uio_resid > + lim_cur(td->td_proc, RLIMIT_FSIZE)) { 
+ psignal(td->td_proc, SIGXFSZ); + PROC_UNLOCK(td->td_proc); + return (EFBIG); + } + PROC_UNLOCK(td->td_proc); + } + osize = osize1 = obj->un_pager.vnp.vnp_size; + resid = uio->uio_resid; + VATTR_NULL(&vattr); + VI_LOCK(vp); + KASSERT(vp->v_writecount > 0, ("vnode_pager_write: writecount")); + vp->v_writecount++; + VI_UNLOCK(vp); + while (uio->uio_resid > 0) { + size = uio->uio_resid; + idx = OFF_TO_IDX(uio->uio_offset); + off = uio->uio_offset - IDX_TO_OFF(idx); + size = MIN(PAGE_SIZE - off, uio->uio_resid); + osize1 = obj->un_pager.vnp.vnp_size; + if (osize1 < uio->uio_offset + size) { + vattr.va_size = uio->uio_offset + size; + if (obj_locked) { + VM_OBJECT_UNLOCK(obj); + obj_locked = 0; + } + error = VOP_SETATTR(vp, &vattr, td->td_ucred); + if (error != 0) + break; + } + + if (!obj_locked) { + VM_OBJECT_LOCK(obj); + obj_locked = 1; + } + m = vm_page_grab_next(obj, m, idx); + bits = vm_page_bits(off, size); + if ((m->valid & ~bits) != ~bits) { + vm_page_busy(m); + error = vnode_pager_read_cluster(vp, m, 1); + m = vm_page_lookup(obj, idx); + if (m == NULL) { + if (error == VM_PAGER_OK) + continue; + error = EIO; + break; + } + if (m->valid == 0 || error != VM_PAGER_OK) { + vm_page_lock_queues(); + vm_page_free(m); + vm_page_unlock_queues(); + error = EIO; + break; + } + vm_page_wakeup(m); + } + vm_object_pip_add(obj, 1); + vm_page_io_start(m); + VM_OBJECT_UNLOCK(obj); + VOP_UNLOCK(vp, 0); + sresid = uio->uio_resid; + error = uiomove_fromphys(&m, off, size, uio); + VM_OBJECT_LOCK(obj); + if (error == 0) { + vm_page_lock_queues(); + m->valid |= vm_page_bits(off, sresid - uio->uio_resid); + vm_page_dirty(m); + vm_page_unlock_queues(); + vm_object_set_writeable_dirty(obj); + } + vm_page_io_finish(m); + vm_object_pip_wakeup(obj); + if (error == 0 && (ioflags & IO_SYNC)) + vm_object_page_clean(obj, idx, idx + 1, ioflags); + VM_OBJECT_UNLOCK(obj); + obj_locked = 0; + vn_lock(vp, vn_locked | LK_RETRY); + if (error != 0 || (vp->v_iflag & VI_DOOMED)) + break; + } + if (obj_locked) + VM_OBJECT_UNLOCK(obj); + if (error == 0 && (ioflags & IO_SYNC)) + error = VOP_FSYNC(vp, MNT_WAIT, td); + if (error != 0) { + vattr.va_size = (ioflags & IO_UNIT) ? osize : osize1; + VOP_SETATTR(vp, &vattr, td->td_ucred); + if (ioflags & IO_UNIT) { + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } + } + VI_LOCK(vp); + vp->v_writecount--; + VI_UNLOCK(vp); + + return (error); +} diff --git a/tools/regression/file/uio/uio.c b/tools/regression/file/uio/uio.c new file mode 100644 index 0000000..d857605 --- /dev/null +++ b/tools/regression/file/uio/uio.c @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int chunk_cnt = 1024; +int chunk_size = 1024; + +int +main(int argc, char *argv[]) +{ + struct iovec *wiov, *riov; + char **wdata, **rdata; + int fd, i; + ssize_t io_error; + + if (argc < 2) { + fprintf(stderr, "Usage: uio file [chunk count [chunk size]]\n"); + return (2); + } + fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd == -1) { + fprintf(stderr, "Failed to create %s: %s\n", + argv[1], strerror(errno)); + return (1); + } + + if (argc > 2) + chunk_cnt = atoi(argv[2]); + if (argc > 3) + chunk_size = atoi(argv[3]); + + wiov = calloc(chunk_cnt, sizeof(*wiov)); + wdata = calloc(chunk_cnt, sizeof(*wdata)); + + riov = calloc(chunk_cnt, sizeof(*riov)); + rdata = calloc(chunk_cnt, sizeof(*rdata)); + + for (i = 0; i < chunk_cnt; i++) { + rdata[i] = malloc(chunk_size); + riov[i].iov_base = rdata[i]; + riov[i].iov_len = chunk_size; + + wdata[i] = malloc(chunk_size); + memset(wdata[i], i, chunk_size); + wiov[i].iov_base = wdata[i]; + wiov[i].iov_len = chunk_size; + } + + io_error = writev(fd, wiov, chunk_cnt); + if (io_error == -1) { + fprintf(stderr, "write failed: %s\n", strerror(errno)); + return (1); + } else if (io_error != chunk_cnt * chunk_size) { + fprintf(stderr, "truncated write: %d %d\n", + io_error, chunk_cnt * chunk_size); + return (1); + } + + if (lseek(fd, 0, SEEK_SET) == -1) { + fprintf(stderr, "lseek failed: %s\n", strerror(errno)); + return (1); + } + + io_error = readv(fd, riov, chunk_cnt); + if (io_error == -1) { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + return (1); + } else if (io_error != chunk_cnt * chunk_size) { + fprintf(stderr, "truncated read: %d %d\n", + io_error, chunk_cnt * chunk_size); + return (1); + } + + for (i = 0; i < chunk_cnt; i++) { + if (memcmp(rdata[i], wdata[i], chunk_size) != 0) { + fprintf(stderr, "chunk %d differs\n", i); + return (1); + } + } + + return (0); +} diff --git a/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c new file mode 100644 index 0000000..1b0acbe --- /dev/null +++ b/tools/regression/ufs/ba_clrbuf/ba_clrbuf.c @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const int blks = 2; + +static void +flush_buffers(int fd) +{ + struct stat st; + char *addr; + int error; + + printf("Flushing buffers\n"); + error = fstat(fd, &st); + if (error == -1) + err(2, "stat"); + fsync(fd); + addr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (addr == (char *)-1) + err(2, "mmap"); + error = msync(addr, st.st_size, MS_SYNC | MS_INVALIDATE); + if (error == -1) + err(2, "msync"); + munmap(addr, st.st_size); +} + +int +main(int argc, char *argv[]) +{ + struct statfs fst; + char *data, *vrfy; + size_t sz; + int fd, i, error, ret; + + if (argc < 2) + errx(2, "Usage: ba_clrbuf file"); + + fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd == -1) + err(2, "Failed to create %s", argv[1]); + + if (fstatfs(fd, &fst) == -1) + err(2, "stat"); + + sz = fst.f_iosize * blks; + data = malloc(sz); + if (data == NULL) + err(2, "malloc"); + vrfy = malloc(sz); + if (vrfy == NULL) + err(2, "malloc"); + for (i = 0; i < (int)sz; i++) + data[i] = i; + error = write(fd, data, sz); + if (error == -1) + err(2, "write"); + else if (error != (int)sz) + errx(2, "Short write %d %d", error, sz); + + flush_buffers(fd); + + error = lseek(fd, 0, SEEK_SET); + if (error == -1) + err(2, "lseek 0"); + else if (error != 0) + errx(2, "lseek 0 returned %d", error); + error = write(fd, NULL, fst.f_iosize); + printf("faulty write, error %s\n", strerror(errno)); + + error = lseek(fd, 0, SEEK_SET); + if (error == -1) + err(2, "lseek 0/2"); + else if (error != 0) + errx(2, "lseek 0/2 returned %d", error); + error = read(fd, vrfy, sz); + if (error == -1) + err(2, "read"); + else if (error != (int)sz) + errx(2, "short read %d %d", error, sz); + + if (memcmp(data, vrfy, fst.f_iosize) != 0) { + printf("Zero block corrupted, byte at 0 is %x\n", + (unsigned char)vrfy[0]); + ret = 1; + } else { + printf("No corruption\n"); + ret = 0; + } + + return (ret); +}
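
Usage note: the pattern the patch establishes in vn_read() and vn_write() is to take a shared (read) or exclusive (write) byte-range lock on VREG vnodes before any vnode lock is acquired for the I/O, to lock the whole range for appenders (the final offset is not known up front), and to drop the range lock only after the uio has been processed. Below is a minimal sketch of a kernel consumer following that pattern; it assumes only the declarations added above in sys/rangelock.h and sys/vnode.h, and the helper name read_locked_range is illustrative, not part of the patch.

#include <sys/param.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/rangelock.h>

/*
 * Sketch only: range-lock the region covered by the uio, perform the
 * read via vn_read_chunk() (which takes the vnode lock itself when
 * IO_NODELOCKED is not set), then release the range lock.
 */
static int
read_locked_range(struct vnode *vp, struct uio *uio, struct ucred *cred,
    int ioflag)
{
	void *rl_cookie;
	int error;

	if (vp->v_type == VREG)
		rl_cookie = rangelock_rlock(vp, uio->uio_offset,
		    uio->uio_resid);
	else
		rl_cookie = NULL;	/* Only regular files are range-locked. */
	error = vn_read_chunk(vp, uio, cred, cred, ioflag);
	if (rl_cookie != NULL)
		rangelock_unlock(vp, rl_cookie);
	return (error);
}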