Byte-range serialization for vnode read/write.

Adds uiomove_overlap() and the vn_rw_block()/vn_rw_bunlock() pair.
vn_read() and vn_write() now take the vnode lock through vn_rw_block(),
which always locks the vnode shared (vn_write() previously locked it
exclusive).

NOTE(review): this copy of the patch had lost its line structure and the
header names of the two #include context lines in the sys/vnode.h hunk.
Whitespace in context lines and those two names (sys/acl.h, sys/ktr.h)
are reconstructed; confirm against the tree or apply with "patch -l".

--- /usr/src/sys/kern/kern_subr.c	2008-08-30 14:13:09.000000000 +0200
+++ sys/kern/kern_subr.c	2008-12-15 22:39:15.000000000 +0100
@@ -216,6 +216,28 @@
 	return (uiomove((char *)buf + offset, n, uio));
 }
 
+/*
+ * Report whether the byte ranges described by two uios overlap.  Each
+ * range is taken as [uio_offset, uio_offset + uio_resid).  The caller must
+ * already have established that both uios refer to the same object.
+ * Returns 1 on overlap and 0 otherwise; uios with equal offsets are always
+ * reported as overlapping, even when one of them is empty.
+ */
+int
+uiomove_overlap(struct uio *dst, struct uio *src)
+{
+
+	/* dst ends at or before the point where src starts. */
+	if (dst->uio_offset < src->uio_offset &&
+	    dst->uio_offset + dst->uio_resid <= src->uio_offset)
+		return (0);
+	/* src ends at or before the point where dst starts. */
+	if (src->uio_offset < dst->uio_offset &&
+	    src->uio_offset + src->uio_resid <= dst->uio_offset)
+		return (0);
+	return (1);
+}
+
 #ifdef ZERO_COPY_SOCKETS
 /*
  * Experimental support for zero-copy I/O
--- /usr/src/sys/kern/vfs_subr.c	2008-12-02 16:08:27.000000000 +0100
+++ sys/kern/vfs_subr.c	2008-12-15 22:44:59.000000000 +0100
@@ -93,6 +93,17 @@
 #define	WI_MPSAFEQ	0
 #define	WI_GIANTQ	1
 
+/*
+ * Descriptor for an in-flight byte-range operation on a vnode.  It is
+ * linked on v_vnbops by vn_rw_block() and unlinked by vn_rw_bunlock().
+ */
+struct vnbops {
+	SLIST_ENTRY(vnbops) vb_vniter;	/* linkage on the vnode's v_vnbops */
+	struct cv	vb_cv;		/* broadcast by vn_rw_bunlock() */
+	struct uio	*vb_uio;	/* uio describing the byte range */
+	u_int		vb_type;	/* VB_* operation type */
+};
+
 static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure");
 
 static void	delmntque(struct vnode *vp);
@@ -107,6 +118,8 @@
 static void	v_decr_useonly(struct vnode *);
 static void	v_upgrade_usecount(struct vnode *);
 static void	vfree(struct vnode *);
+static void	vnbops_fini(void *mem, int size);
+static int	vnbops_init(void *mem, int size, int flags);
 static void	vnlru_free(int);
 static void	vgonel(struct vnode *);
 static void	vfs_knllock(void *arg);
@@ -192,6 +205,7 @@
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
+static uma_zone_t vnode_bops;
 
 /* Set to 1 to print out reclaim of active vnodes */
 int	prtactive;
@@ -318,6 +332,8 @@
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	vnode_bops = uma_zcreate("VNBOPS", sizeof (struct vnbops),
+	    NULL, NULL, vnbops_init, vnbops_fini, UMA_ALIGN_PTR, 0);
 	/*
 	 * Initialize the filesystem syncer.
 	 */
@@ -332,6 +348,33 @@
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 
 
+/*
+ * UMA fini routine for the VNBOPS zone: destroy the item's condvar.
+ */
+static void
+vnbops_fini(void *mem, int size)
+{
+	struct vnbops *vb;
+
+	vb = mem;
+	cv_destroy(&vb->vb_cv);
+}
+
+/*
+ * UMA init routine for the VNBOPS zone: zero the item and set up its cv.
+ */
+static int
+vnbops_init(void *mem, int size, int flags)
+{
+	struct vnbops *vb;
+
+	vb = mem;
+	bzero(vb, sizeof (*vb));
+	cv_init(&vb->vb_cv, "vnode byte ranging");
+	vb->vb_type = VB_NONE;
+	return (0);
+}
+
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting. Eventually, mountlist_mtx is not released on failure.
@@ -848,6 +891,8 @@
 	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
+	VNASSERT(SLIST_EMPTY(&vp->v_vnbops), vp,
+	    ("vp has byte range operations"));
 	VI_UNLOCK(vp);
 #ifdef MAC
 	mac_vnode_destroy(vp);
@@ -967,6 +1012,7 @@
 	 * opt-in.
 	 */
 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
+	SLIST_INIT(&vp->v_vnbops);
 	/*
 	 * Initialize bufobj.
 	 */
@@ -2603,6 +2649,90 @@
 	return(count);
 }
 
+/*
+ * Register a byte-range operation of the given type (VB_READ, VB_WRITE or
+ * VB_APPEND) described by uio on vp, sleeping for as long as a conflicting
+ * operation is registered.  A read conflicts with overlapping operations
+ * that are not reads, a write with any overlapping operation, and an
+ * append with any overlapping operation and with every other append.
+ * On success the vnode is locked shared and the returned descriptor must
+ * be released with vn_rw_bunlock().  Returns NULL when no descriptor could
+ * be allocated.
+ */
+struct vnbops *
+vn_rw_block(struct vnode *vp, struct uio *uio, u_int type)
+{
+	struct vnbops *tvb, *vb;
+
+	MPASS(vp != NULL && uio != NULL);
+
+	vb = uma_zalloc(vnode_bops, M_NOWAIT);
+	if (vb == NULL)
+		return (NULL);
+	vb->vb_uio = uio;
+	vb->vb_type = type;
+repeat:
+	VI_LOCK(vp);
+	SLIST_FOREACH(tvb, &vp->v_vnbops, vb_vniter) {
+		/*
+		 * The interlock is dropped while sleeping in cv_wait(), so
+		 * the list is rescanned from the head after every wakeup.
+		 */
+		switch (type) {
+		case VB_READ:
+			if (uiomove_overlap(uio, tvb->vb_uio) != 0 &&
+			    tvb->vb_type != VB_READ) {
+				cv_wait(&tvb->vb_cv, VI_MTX(vp));
+				VI_UNLOCK(vp);
+				goto repeat;
+			}
+			break;
+		case VB_WRITE:
+			if (uiomove_overlap(uio, tvb->vb_uio) != 0) {
+				cv_wait(&tvb->vb_cv, VI_MTX(vp));
+				VI_UNLOCK(vp);
+				goto repeat;
+			}
+			break;
+		case VB_APPEND:
+			if (tvb->vb_type == VB_APPEND ||
+			    uiomove_overlap(uio, tvb->vb_uio)) {
+				cv_wait(&tvb->vb_cv, VI_MTX(vp));
+				VI_UNLOCK(vp);
+				goto repeat;
+			}
+			break;
+		default:
+			panic("%s: unknown vnode byte locking op", __func__);
+		}
+	}
+
+	SLIST_INSERT_HEAD(&vp->v_vnbops, vb, vb_vniter);
+	/* Drop the interlock taken at "repeat" before locking the vnode. */
+	VI_UNLOCK(vp);
+	vn_lock(vp, LK_SHARED | LK_RETRY);
+	return (vb);
+}
+
+/*
+ * Finish a byte-range operation started with vn_rw_block(): unlock the
+ * vnode, unregister the descriptor, wake up the threads sleeping on it
+ * and hand the descriptor back to its zone.
+ */
+void
+vn_rw_bunlock(struct vnode *vp, struct vnbops *vb)
+{
+
+	MPASS(vb != NULL);
+
+	VOP_UNLOCK(vp, 0);
+	VI_LOCK(vp);
+	SLIST_REMOVE(&vp->v_vnbops, vb, vnbops, vb_vniter);
+	cv_broadcast(&vb->vb_cv);
+	VI_UNLOCK(vp);
+	uma_zfree(vnode_bops, vb);
+}
+
 /*
  * Print out a description of a vnode.
  */
--- /usr/src/sys/kern/vfs_vnops.c	2008-12-01 00:26:27.000000000 +0100
+++ sys/kern/vfs_vnops.c	2008-12-15 22:53:42.000000000 +0100
@@ -498,6 +498,7 @@
 	int flags;
 {
 	struct vnode *vp;
+	struct vnbops *vb;
 	int error, ioflag;
 	struct mtx *mtxp;
 	int vfslocked;
@@ -526,18 +527,20 @@
 		}
 		fp->f_vnread_flags |= FOFFSET_LOCKED;
 		mtx_unlock(mtxp);
-		vn_lock(vp, LK_SHARED | LK_RETRY);
 		uio->uio_offset = fp->f_offset;
-	} else
-		vn_lock(vp, LK_SHARED | LK_RETRY);
+	}
 
 	ioflag |= sequential_heuristic(uio, fp);
-
+	vb = vn_rw_block(vp, uio, VB_READ);
+	if (vb != NULL) {
 #ifdef MAC
-	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
-	if (error == 0)
+		error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
+		if (error == 0)
 #endif
-		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
+			error = VOP_READ(vp, uio, ioflag, fp->f_cred);
+		vn_rw_bunlock(vp, vb);
+	} else
+		error = ENOMEM;
 	if ((flags & FOF_OFFSET) == 0) {
 		fp->f_offset = uio->uio_offset;
 		mtx_lock(mtxp);
@@ -547,7 +550,6 @@
 		mtx_unlock(mtxp);
 	}
 	fp->f_nextoff = uio->uio_offset;
-	VOP_UNLOCK(vp, 0);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
@@ -565,6 +567,7 @@
 {
 	struct vnode *vp;
 	struct mount *mp;
+	struct vnbops *vb;
 	int error, ioflag;
 	int vfslocked;
 
@@ -588,19 +591,22 @@
 	if (vp->v_type != VCHR &&
 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto unlock;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
 	ioflag |= sequential_heuristic(uio, fp);
+	vb = vn_rw_block(vp, uio, (ioflag & IO_APPEND) ? VB_APPEND : VB_WRITE);
+	if (vb != NULL) {
 #ifdef MAC
-	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
-	if (error == 0)
+		error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
+		if (error == 0)
 #endif
-		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
+			error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
+		vn_rw_bunlock(vp, vb);
+	} else
+		error = ENOMEM;
 	if ((flags & FOF_OFFSET) == 0)
 		fp->f_offset = uio->uio_offset;
 	fp->f_nextoff = uio->uio_offset;
-	VOP_UNLOCK(vp, 0);
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
 unlock:
--- /usr/src/sys/sys/uio.h	2008-08-28 20:19:30.000000000 +0200
+++ sys/sys/uio.h	2008-12-15 22:40:55.000000000 +0100
@@ -99,6 +99,7 @@
 int	uiomove_frombuf(void *buf, int buflen, struct uio *uio);
 int	uiomove_fromphys(struct vm_page *ma[], vm_offset_t offset, int n,
 	    struct uio *uio);
+int	uiomove_overlap(struct uio *dst, struct uio *src);
 int	uiomoveco(void *cp, int n, struct uio *uio, int disposable);
 
 #else /* !_KERNEL */
--- /usr/src/sys/sys/vnode.h	2008-12-13 20:49:13.000000000 +0100
+++ sys/sys/vnode.h	2008-12-15 18:41:17.000000000 +0100
@@ -43,6 +43,12 @@
 #include <sys/acl.h>
 #include <sys/ktr.h>
 
+/* Byte-range operation types, stored in the vb_type field of struct vnbops. */
+#define	VB_NONE		0x00
+#define	VB_READ		0x01
+#define	VB_WRITE	0x02
+#define	VB_APPEND	0x03
+
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
@@ -61,6 +67,7 @@
  */
 
 struct namecache;
+struct vnbops;
 
 struct vpollinfo {
 	struct mtx vpi_lock;		/* lock to protect below */
@@ -156,6 +163,7 @@
 	u_long	v_iflag;			/* i vnode flags (see below) */
 	u_long	v_vflag;			/* v vnode flags */
 	int	v_writecount;			/* v ref count of writers */
+	SLIST_HEAD(, vnbops) v_vnbops;		/* i list of byte range ops */
 
 	/*
 	 * The machinery of being a vnode
@@ -615,6 +623,8 @@
 	    size_t len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, size_t *aresid,
 	    struct thread *td);
+struct vnbops *vn_rw_block(struct vnode *vp, struct uio *uio, u_int type);
+void	vn_rw_bunlock(struct vnode *vp, struct vnbops *vb);
 int	vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
 	    struct ucred *file_cred, struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);