Index: sys/kern/vfs_mount.c
===================================================================
--- sys/kern/vfs_mount.c	(revision 205589)
+++ sys/kern/vfs_mount.c	(working copy)
@@ -505,6 +505,7 @@
 	mac_mount_create(cred, mp);
 #endif
 	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
+	vfs_syncer_init(mp);
 	return (mp);
 }
 
@@ -543,6 +544,7 @@
 	if (mp->mnt_lockref != 0)
 		panic("vfs_mount_destroy: nonzero lock refcount");
 	MNT_IUNLOCK(mp);
+	vfs_syncer_destroy(mp);
#ifdef MAC
 	mac_mount_destroy(mp);
 #endif
@@ -1035,12 +1037,14 @@
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 		MNT_IUNLOCK(mp);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+			vfs_syncer_attach(mp, vfsp);
 			if (mp->mnt_syncer == NULL)
 				error = vfs_allocate_syncvnode(mp);
 		} else {
 			if (mp->mnt_syncer != NULL)
 				vrele(mp->mnt_syncer);
 			mp->mnt_syncer = NULL;
+			vfs_syncer_detach(mp);
 		}
 		vfs_unbusy(mp);
 		VI_LOCK(vp);
@@ -1077,11 +1081,15 @@
 		VOP_UNLOCK(vp, 0);
 		mountcheckdirs(vp, newdp);
 		vrele(newdp);
-		if ((mp->mnt_flag & MNT_RDONLY) == 0)
+		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+			vfs_syncer_attach(mp, vfsp);
 			error = vfs_allocate_syncvnode(mp);
+		}
 		vfs_unbusy(mp);
-		if (error)
+		if (error) {
 			vrele(vp);
+			vfs_syncer_detach(mp);
+		}
 	} else {
 		vfs_unbusy(mp);
 		vfs_mount_destroy(mp);
@@ -1327,6 +1335,7 @@
 		VOP_UNLOCK(coveredvp, 0);
 		return (error);
 	}
+	vfs_syncer_detach(mp);
 	mtx_lock(&mountlist_mtx);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c	(revision 205589)
+++ sys/kern/vfs_subr.c	(working copy)
@@ -89,14 +89,12 @@
 #include <ddb/ddb.h>
 #endif
 
-#define	WI_MPSAFEQ	0
-#define	WI_GIANTQ	1
-
 static MALLOC_DEFINE(M_NETADDR, "subr_export_host",
     "Export host address structure");
 static void	delmntque(struct vnode *vp);
 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 		    int slpflag, int slptimeo);
+static void	sched_sync(void *arg);
 static void	syncer_shutdown(void *arg, int howto);
 static int	vtryrecycle(struct vnode *vp);
 static void	vbusy(struct vnode *vp);
@@ -216,25 +214,10 @@
  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  *
  */
-static int syncer_delayno;
-static long syncer_mask;
-LIST_HEAD(synclist, bufobj);
-static struct synclist *syncer_workitem_pending[2];
-/*
- * The sync_mtx protects:
- *	bo->bo_synclist
- *	sync_vnode_count
- *	syncer_delayno
- *	syncer_state
- *	syncer_workitem_pending
- *	syncer_worklist_len
- *	rushjob
- */
-static struct mtx sync_mtx;
-static struct cv sync_wakeup;
 
 #define SYNCER_MAXDELAY		32
-static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
+#define SYNCER_MASK		(SYNCER_MAXDELAY - 1)
+
 static int syncdelay = 30;		/* max time to delay syncing data */
 static int filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
@@ -242,18 +225,15 @@
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
 static int metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
-static int rushjob;		/* number of slots to run ASAP */
+#if 0
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
+#endif
 
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
 #define SYNCER_SHUTDOWN_SPEEDUP		4
-static int sync_vnode_count;
-static int syncer_worklist_len;
-static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
-    syncer_state;
 
 /*
  * Number of vnodes we want to exist at any one time.  This is mostly used
@@ -279,7 +259,6 @@
 #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
 #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
 
-
 /*
  * Initialize the vnode management data structures.
  */
@@ -313,16 +292,6 @@
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-	/*
-	 * Initialize the filesystem syncer.
-	 */
-	syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE,
-	    &syncer_mask);
-	syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE,
-	    &syncer_mask);
-	syncer_maxdelay = syncer_mask + 1;
-	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
-	cv_init(&sync_wakeup, "syncer");
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 
@@ -521,6 +490,89 @@
 	mtx_unlock(&mntid_mtx);
 }
 
+void
+vfs_syncer_init(struct mount *mp)
+{
+
+	MPASS(mp != NULL);
+
+	mp->mnt_sync_delayno = 0;
+	mp->mnt_sync_vnode_count = 0;
+	mp->mnt_sync_worklist_len = 0;
+	mp->mnt_sync_rushjob = 0;
+	mp->mnt_sync_attached = 0;
+	cv_init(&mp->mnt_sync_wakeup, "syncer");
+	mp->mnt_sync_workitem_pending = malloc(SYNCER_MAXDELAY *
+	    sizeof(struct synclist), M_TEMP, M_NOWAIT | M_ZERO);
+	if (mp->mnt_sync_workitem_pending == NULL)
+		panic("%s: ENOMEM for the vnodes pending queue", __func__);
+}
+
+void
+vfs_syncer_attach(struct mount *mp, struct vfsconf *vfsp)
+{
+	int error;
+
+	MPASS(mp != NULL);
+
+	MNT_ILOCK(mp);
+	if (mp->mnt_sync_attached != 0) {
+		MNT_IUNLOCK(mp);
+		return;
+	}
+	if (vfsp == NULL) {
+		vfsp = mp->mnt_vfc;
+		MPASS(vfsp != NULL);
+	}
+	mp->mnt_sync_state = SYNCER_RUNNING;
+	mp->mnt_sync_attached = 1;
+	MNT_IUNLOCK(mp);
+	error = kthread_add(sched_sync, mp, NULL, &mp->mnt_sync_thr, 0,
+	    0, "syncer %s:%d", vfsp->vfc_name, vfsp->vfc_refcount);
+	if (error != 0)
+		panic("%s: kthread_add() failed with %d errno", __func__,
+		    error);
+}
+
+void
+vfs_syncer_destroy(struct mount *mp)
+{
+
+	MPASS(mp != NULL);
+
+	if (mp->mnt_sync_workitem_pending != NULL) {
+		free(mp->mnt_sync_workitem_pending, M_TEMP);
+		mp->mnt_sync_workitem_pending = NULL;
+		cv_destroy(&mp->mnt_sync_wakeup);
+	}
+}
+
+void
+vfs_syncer_detach(struct mount *mp)
+{
+
+	MNT_ILOCK(mp);
+	if (mp->mnt_sync_attached == 0) {
+		MNT_IUNLOCK(mp);
+		return;
+	}
+
+	/* Keep the caller blocked until the syncer thread has fully exited. */
+	MPASS(mp->mnt_sync_state != SYNCER_TO_DIE);
+	while (mp->mnt_sync_state != SYNCER_DIED) {
+		mp->mnt_sync_state = SYNCER_TO_DIE;
+
+		/*
+		 * Wake up the syncer in case it is sleeping, in order to
+		 * speed up its termination.
+		 */
+		cv_broadcast(&mp->mnt_sync_wakeup);
+		msleep(&mp->mnt_sync_thr, MNT_MTX(mp), PVFS, "syncer drain", 0);
+	}
+	mp->mnt_sync_attached = 0;
+	MNT_IUNLOCK(mp);
+}
+
 /*
  * Knob to control the precision of file timestamps:
  *
@@ -1590,6 +1642,7 @@
 brelvp(struct buf *bp)
 {
 	struct bufobj *bo;
+	struct mount *mp;
 	struct vnode *vp;
 
 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
@@ -1598,20 +1651,24 @@
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
-	vp = bp->b_vp;	/* XXX */
+	vp = bp->b_vp;
+	mp = vp->v_mount;
 	bo = bp->b_bufobj;
+	MNT_ILOCK(mp);
 	BO_LOCK(bo);
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		buf_vlist_remove(bp);
-	else
+	else {
+		BO_UNLOCK(bo);
+		MNT_IUNLOCK(mp);
 		panic("brelvp: Buffer %p not on queue.", bp);
+	}
 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 		bo->bo_flag &= ~BO_ONWORKLST;
-		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
-		syncer_worklist_len--;
-		mtx_unlock(&sync_mtx);
+		mp->mnt_sync_worklist_len--;
 	}
+	MNT_IUNLOCK(mp);
 	bp->b_flags &= ~B_NEEDSGIANT;
 	bp->b_vp = NULL;
 	bp->b_bufobj = NULL;
@@ -1623,138 +1680,103 @@
  * Add an item to the syncer work queue.
  */
 static void
-vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
+vn_syncer_add_to_worklist(struct mount *mp, struct bufobj *bo, int delay)
 {
-	int queue, slot;
+	int slot;
 
+	ASSERT_MP_ILOCKED(mp);
 	ASSERT_BO_LOCKED(bo);
 
-	mtx_lock(&sync_mtx);
 	if (bo->bo_flag & BO_ONWORKLST)
 		LIST_REMOVE(bo, bo_synclist);
 	else {
 		bo->bo_flag |= BO_ONWORKLST;
-		syncer_worklist_len++;
+		mp->mnt_sync_worklist_len++;
 	}
-
-	if (delay > syncer_maxdelay - 2)
-		delay = syncer_maxdelay - 2;
-	slot = (syncer_delayno + delay) & syncer_mask;
-
-	queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ :
-	    WI_MPSAFEQ;
-	LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo,
-	    bo_synclist);
-	mtx_unlock(&sync_mtx);
+	if (delay > SYNCER_MAXDELAY - 2)
+		delay = SYNCER_MAXDELAY - 2;
+	slot = (mp->mnt_sync_delayno + delay) & SYNCER_MASK;
+	LIST_INSERT_HEAD(&mp->mnt_sync_workitem_pending[slot], bo, bo_synclist);
 }
 
-static int
-sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
+/*
+ * Sync a vnode (fed in through its bufobj) to its filesystem.
+ * It expects the mountpoint to be locked and correctly referenced, and
+ * the bufobj to be valid.
+ */
+static void
+sync_vnode(struct mount *mp, struct bufobj *bo)
 {
-	int error, len;
+	struct vnode *vp;
 
-	mtx_lock(&sync_mtx);
-	len = syncer_worklist_len - sync_vnode_count;
-	mtx_unlock(&sync_mtx);
-	error = SYSCTL_OUT(req, &len, sizeof(len));
-	return (error);
-}
+	MPASS(mp != NULL && bo != NULL);
+	ASSERT_MP_ILOCKED(mp);
+	MPASS(mp->mnt_writeopcount > 0);
 
-SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
-    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
+	vp = bo->__bo_vnode;
 
-static struct proc *updateproc;
-static void sched_sync(void);
-static struct kproc_desc up_kp = {
-	"syncer",
-	sched_sync,
-	&updateproc
-};
-SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
 
-static int
-sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
-{
-	struct vnode *vp;
-	struct mount *mp;
-
-	*bo = LIST_FIRST(slp);
-	if (*bo == NULL)
-		return (0);
-	vp = (*bo)->__bo_vnode;	/* XXX */
-	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
-		return (1);
 	/*
-	 * We use vhold in case the vnode does not
-	 * successfully sync.  vhold prevents the vnode from
-	 * going away when we unlock the sync_mtx so that
-	 * we can acquire the vnode interlock.
+	 * vhold() the vnode linked to the bufobj in order to avoid
+	 * possible recycling when releasing the mountpoint interlock.
 	 */
-	vholdl(vp);
-	mtx_unlock(&sync_mtx);
-	VI_UNLOCK(vp);
-	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
-		vdrop(vp);
-		mtx_lock(&sync_mtx);
-		return (*bo == LIST_FIRST(slp));
-	}
+	vhold(vp);
+	MNT_IUNLOCK(mp);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	(void) VOP_FSYNC(vp, MNT_LAZY, td);
+	VOP_FSYNC(vp, MNT_LAZY, curthread);
 	VOP_UNLOCK(vp, 0);
-	vn_finished_write(mp);
-	BO_LOCK(*bo);
-	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
+	MNT_ILOCK(mp);
+	BO_LOCK(bo);
+	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
+
 		/*
 		 * Put us back on the worklist.  The worklist
 		 * routine will remove us from our current
 		 * position and then add us back in at a later
 		 * position.
		 */
-		vn_syncer_add_to_worklist(*bo, syncdelay);
+		vn_syncer_add_to_worklist(mp, bo, syncdelay);
 	}
-	BO_UNLOCK(*bo);
+	BO_UNLOCK(bo);
 	vdrop(vp);
-	mtx_lock(&sync_mtx);
-	return (0);
 }
 
 /*
  * System filesystem synchronizer daemon.
 */
 static void
-sched_sync(void)
+sched_sync(void *arg)
 {
-	struct synclist *gnext, *next;
-	struct synclist *gslp, *slp;
+	eventhandler_tag shutdown_tag;
+	long starttime;
+	struct mount *mp;
+	struct synclist *next;
+	struct synclist *slp;
 	struct bufobj *bo;
-	long starttime;
-	struct thread *td = curthread;
+	struct thread *td;
 	int last_work_seen;
 	int net_worklist_len;
 	int syncer_final_iter;
 	int first_printf;
-	int error;
+	int vfslocked;
 
+	mp = arg;
 	last_work_seen = 0;
 	syncer_final_iter = 0;
 	first_printf = 1;
-	syncer_state = SYNCER_RUNNING;
 	starttime = time_uptime;
+	td = curthread;
 	td->td_pflags |= TDP_NORUNNINGBUF;
 
-	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
-	    SHUTDOWN_PRI_LAST);
+	shutdown_tag = EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown,
+	    mp, SHUTDOWN_PRI_LAST);
 
-	mtx_lock(&sync_mtx);
-	for (;;) {
-		if (syncer_state == SYNCER_FINAL_DELAY &&
-		    syncer_final_iter == 0) {
-			mtx_unlock(&sync_mtx);
-			kproc_suspend_check(td->td_proc);
-			mtx_lock(&sync_mtx);
-		}
-		net_worklist_len = syncer_worklist_len - sync_vnode_count;
-		if (syncer_state != SYNCER_RUNNING &&
+	vfslocked = VFS_LOCK_GIANT(mp);
+	MNT_ILOCK(mp);
+	while (mp->mnt_sync_state != SYNCER_TO_DIE) {
+		MPASS(mp->mnt_sync_state != SYNCER_DIED);
+		net_worklist_len = mp->mnt_sync_worklist_len -
+		    mp->mnt_sync_vnode_count;
+		if (mp->mnt_sync_state != SYNCER_RUNNING &&
 		    starttime != time_uptime) {
 			if (first_printf) {
 				printf("\nSyncing disks, vnodes remaining...");
@@ -1771,27 +1793,28 @@
 		 * Skip over empty worklist slots when shutting down.
 		 */
 		do {
-			slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
-			gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
-			syncer_delayno += 1;
-			if (syncer_delayno == syncer_maxdelay)
-				syncer_delayno = 0;
-			next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
-			gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
+			slp =
+			    &mp->mnt_sync_workitem_pending[mp->mnt_sync_delayno];
+			mp->mnt_sync_delayno += 1;
+			if (mp->mnt_sync_delayno == SYNCER_MAXDELAY)
+				mp->mnt_sync_delayno = 0;
+			next =
+			    &mp->mnt_sync_workitem_pending[mp->mnt_sync_delayno];
+
 			/*
 			 * If the worklist has wrapped since the
 			 * it was emptied of all but syncer vnodes,
 			 * switch to the FINAL_DELAY state and run
 			 * for one more second.
 			 */
-			if (syncer_state == SYNCER_SHUTTING_DOWN &&
+			if (mp->mnt_sync_state == SYNCER_SHUTTING_DOWN &&
 			    net_worklist_len == 0 &&
-			    last_work_seen == syncer_delayno) {
-				syncer_state = SYNCER_FINAL_DELAY;
+			    last_work_seen == mp->mnt_sync_delayno) {
+				mp->mnt_sync_state = SYNCER_FINAL_DELAY;
 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 			}
-		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
-		    LIST_EMPTY(gslp) && syncer_worklist_len > 0);
+		} while (mp->mnt_sync_state != SYNCER_RUNNING &&
+		    LIST_EMPTY(slp) && mp->mnt_sync_worklist_len > 0);
 
 		/*
 		 * Keep track of the last time there was anything
@@ -1799,35 +1822,69 @@
 		 * Return to the SHUTTING_DOWN state if any
 		 * new work appears.
 		 */
-		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
-			last_work_seen = syncer_delayno;
-		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
-			syncer_state = SYNCER_SHUTTING_DOWN;
-		while (!LIST_EMPTY(slp)) {
-			error = sync_vnode(slp, &bo, td);
-			if (error == 1) {
+		if (net_worklist_len > 0 ||
+		    mp->mnt_sync_state == SYNCER_RUNNING)
+			last_work_seen = mp->mnt_sync_delayno;
+		if (net_worklist_len > 0 &&
+		    mp->mnt_sync_state == SYNCER_FINAL_DELAY)
+			mp->mnt_sync_state = SYNCER_SHUTTING_DOWN;
+
+		/*
+		 * Since the mountpoint interlock is already held and only
+		 * a simple, non-sleeping call to vn_start_write() would
+		 * be needed here, it is simpler to emulate the relevant
+		 * bits of it directly in the code.
+		 * Furthermore, the mountpoint is already known not to be
+		 * a filesystem bypass, so the VOP_GETWRITEMOUNT() lookup
+		 * is unnecessary, and the syncer thread never sets the
+		 * TDP_IGNSUSP flag, so that check is skipped as well.
+		 * Finally, the syncer does not need to acquire a
+		 * reference on the mountpoint because races against
+		 * unmount are handled via the syncer state transitions.
+		 */
+		if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
+
+			/*
+			 * The filesystem is suspending.  Just move the
+			 * bufobjs to the next syncer queue and skip any
+			 * further processing.
+			 */
+			while (!LIST_EMPTY(slp)) {
+				bo = LIST_FIRST(slp);
 				LIST_REMOVE(bo, bo_synclist);
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
+			}
+		} else {
+			mp->mnt_writeopcount++;
+
+			/*
+			 * The mountpoint can be written to now.
+			 * sync_vnode() may drop the mountpoint
+			 * interlock, so re-read the list head on every
+			 * iteration to cope with racing insertions.
+			 */
+			while (!LIST_EMPTY(slp))
+				sync_vnode(mp, LIST_FIRST(slp));
+
+			/* Emulate a vn_finished_write(). */
+			mp->mnt_writeopcount--;
+			MPASS(mp->mnt_writeopcount >= 0);
+			if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
+			    mp->mnt_writeopcount == 0)
+				wakeup(&mp->mnt_writeopcount);
+
+			/*
+			 * If an unmount operation started while the
+			 * mountpoint interlock was dropped inside
+			 * sync_vnode(), quit the syncer thread now.
+			 */
+			if (mp->mnt_sync_state == SYNCER_TO_DIE)
 				continue;
-			}
 		}
-		if (!LIST_EMPTY(gslp)) {
-			mtx_unlock(&sync_mtx);
-			mtx_lock(&Giant);
-			mtx_lock(&sync_mtx);
-			while (!LIST_EMPTY(gslp)) {
-				error = sync_vnode(gslp, &bo, td);
-				if (error == 1) {
-					LIST_REMOVE(bo, bo_synclist);
-					LIST_INSERT_HEAD(gnext, bo,
-					    bo_synclist);
-					continue;
-				}
-			}
-			mtx_unlock(&Giant);
-		}
-		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
+		if (mp->mnt_sync_state == SYNCER_FINAL_DELAY &&
+		    syncer_final_iter > 0)
 			syncer_final_iter--;
+
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process.  A rushjob
@@ -1838,10 +1895,11 @@
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
-		if (rushjob > 0) {
-			rushjob -= 1;
+		if (mp->mnt_sync_rushjob > 0) {
+			mp->mnt_sync_rushjob -= 1;
 			continue;
 		}
+
 		/*
 		 * Just sleep for a short period of time between
 		 * iterations when shutting down to allow some I/O
@@ -1854,12 +1912,23 @@
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
-		if (syncer_state != SYNCER_RUNNING)
-			cv_timedwait(&sync_wakeup, &sync_mtx,
+		if (mp->mnt_sync_state != SYNCER_RUNNING)
+			cv_timedwait(&mp->mnt_sync_wakeup, MNT_MTX(mp),
 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
 		else if (time_uptime == starttime)
-			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
+			cv_timedwait(&mp->mnt_sync_wakeup, MNT_MTX(mp), hz);
 	}
+
+	/*
+	 * If this point is reached, an unmount request is in progress.
+	 * Wake up the unmounting thread and terminate the syncer.
+	 */
+	mp->mnt_sync_state = SYNCER_DIED;
+	wakeup(&mp->mnt_sync_thr);
+	MNT_IUNLOCK(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, shutdown_tag);
+	kthread_exit();
 }
 
 /*
@@ -1870,6 +1939,7 @@
 int
 speedup_syncer(void)
 {
+#if 0
 	int ret = 0;
 
 	mtx_lock(&sync_mtx);
@@ -1881,6 +1951,8 @@
 	mtx_unlock(&sync_mtx);
 	cv_broadcast(&sync_wakeup);
 	return (ret);
+#endif
+	return (0);
 }
 
 /*
@@ -1890,15 +1962,17 @@
 static void
 syncer_shutdown(void *arg, int howto)
 {
+	struct mount *mp;
 
+	mp = arg;
 	if (howto & RB_NOSYNC)
 		return;
-	mtx_lock(&sync_mtx);
-	syncer_state = SYNCER_SHUTTING_DOWN;
-	rushjob = 0;
-	mtx_unlock(&sync_mtx);
-	cv_broadcast(&sync_wakeup);
-	kproc_shutdown(arg, howto);
+	MNT_ILOCK(mp);
+	mp->mnt_sync_state = SYNCER_SHUTTING_DOWN;
+	mp->mnt_sync_rushjob = 0;
+	MNT_IUNLOCK(mp);
+	cv_broadcast(&mp->mnt_sync_wakeup);
+	kthread_shutdown(arg, howto);
 }
 
 /*
@@ -1909,6 +1983,7 @@
 void
 reassignbuf(struct buf *bp)
 {
+	struct mount *mp;
 	struct vnode *vp;
 	struct bufobj *bo;
 	int delay;
@@ -1917,6 +1992,7 @@
 #endif
 
 	vp = bp->b_vp;
+	mp = vp->v_mount;
 	bo = bp->b_bufobj;
 	++reassignbufcalls;
 
@@ -1931,12 +2007,21 @@
 
 	/*
 	 * Delete from old vnode list, if on one.
+	 *
+	 * Lock the mountpoint now in order to avoid a LOR with the bufobj
+	 * lock, as we may need to insert a dirty buffer into the appropriate
+	 * syncer pending worklist.
 	 */
+	MNT_ILOCK(mp);
 	BO_LOCK(bo);
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		buf_vlist_remove(bp);
-	else
+	else {
+		BO_UNLOCK(bo);
+		MNT_IUNLOCK(mp);
 		panic("reassignbuf: Buffer %p not on queue.", bp);
+	}
+
 	/*
 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
 	 * of clean buffers.
@@ -1953,17 +2038,15 @@
 			default:
 				delay = filedelay;
 			}
-			vn_syncer_add_to_worklist(bo, delay);
+			vn_syncer_add_to_worklist(mp, bo, delay);
 		}
 		buf_vlist_add(bp, bo, BX_VNDIRTY);
 	} else {
 		buf_vlist_add(bp, bo, BX_VNCLEAN);
 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
-			mtx_lock(&sync_mtx);
 			LIST_REMOVE(bo, bo_synclist);
-			syncer_worklist_len--;
-			mtx_unlock(&sync_mtx);
+			mp->mnt_sync_worklist_len--;
 			bo->bo_flag &= ~BO_ONWORKLST;
 		}
 	}
 
@@ -1984,6 +2067,7 @@
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 #endif
 	BO_UNLOCK(bo);
+	MNT_IUNLOCK(mp);
 }
 
 /*
@@ -3369,23 +3453,23 @@
 	 * are mounted at once.
 	 */
 	next += incr;
-	if (next == 0 || next > syncer_maxdelay) {
+	if (next == 0 || next > SYNCER_MAXDELAY) {
 		start /= 2;
 		incr /= 2;
 		if (start == 0) {
-			start = syncer_maxdelay / 2;
-			incr = syncer_maxdelay;
+			start = SYNCER_MAXDELAY / 2;
+			incr = SYNCER_MAXDELAY;
 		}
 		next = start;
 	}
 	bo = &vp->v_bufobj;
+	MNT_ILOCK(mp);
 	BO_LOCK(bo);
-	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
-	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
-	mtx_lock(&sync_mtx);
-	sync_vnode_count++;
-	mtx_unlock(&sync_mtx);
+	vn_syncer_add_to_worklist(mp, bo,
+	    syncdelay > 0 ? next % syncdelay : 0);
+	mp->mnt_sync_vnode_count++;
 	BO_UNLOCK(bo);
+	MNT_IUNLOCK(mp);
 	mp->mnt_syncer = vp;
 	return (0);
 }
@@ -3411,9 +3495,11 @@
 	 * Move ourselves to the back of the sync list.
 	 */
 	bo = &syncvp->v_bufobj;
+	MNT_ILOCK(mp);
 	BO_LOCK(bo);
-	vn_syncer_add_to_worklist(bo, syncdelay);
+	vn_syncer_add_to_worklist(mp, bo, syncdelay);
 	BO_UNLOCK(bo);
+	MNT_IUNLOCK(mp);
 
 	/*
 	 * Walk the list of vnodes pushing all that are dirty and
@@ -3463,21 +3549,24 @@
 static int
 sync_reclaim(struct vop_reclaim_args *ap)
 {
-	struct vnode *vp = ap->a_vp;
+	struct mount *mp;
+	struct vnode *vp;
 	struct bufobj *bo;
 
+	vp = ap->a_vp;
+	mp = vp->v_mount;
 	bo = &vp->v_bufobj;
+	MNT_ILOCK(mp);
 	BO_LOCK(bo);
 	vp->v_mount->mnt_syncer = NULL;
 	if (bo->bo_flag & BO_ONWORKLST) {
-		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
-		syncer_worklist_len--;
-		sync_vnode_count--;
-		mtx_unlock(&sync_mtx);
+		mp->mnt_sync_worklist_len--;
+		mp->mnt_sync_vnode_count--;
 		bo->bo_flag &= ~BO_ONWORKLST;
 	}
 	BO_UNLOCK(bo);
+	MNT_IUNLOCK(mp);
 	return (0);
 }
 
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h	(revision 205589)
+++ sys/sys/mount.h	(working copy)
@@ -36,6 +36,7 @@
 #include <sys/ucred.h>
 #include <sys/queue.h>
 #ifdef _KERNEL
+#include <sys/condvar.h>
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <sys/_mutex.h>
@@ -128,6 +129,7 @@
 };
 
 TAILQ_HEAD(vnodelst, vnode);
+LIST_HEAD(synclist, bufobj);
 
 /* Mount options list */
 TAILQ_HEAD(vfsoptlist, vfsopt);
@@ -187,6 +189,21 @@
 #define	mnt_endzero	mnt_gjprovider
 	char		*mnt_gjprovider;	/* gjournal provider name */
 	struct lock	mnt_explock;		/* vfs_export walkers lock */
+	struct cv	mnt_sync_wakeup;
+	struct thread	*mnt_sync_thr;
+	struct synclist	*mnt_sync_workitem_pending;
+	int		mnt_sync_delayno;
+	int		mnt_sync_vnode_count;
+	int		mnt_sync_worklist_len;
+	int		mnt_sync_rushjob;
+	enum {
+		SYNCER_RUNNING,
+		SYNCER_SHUTTING_DOWN,
+		SYNCER_FINAL_DELAY,
+		SYNCER_TO_DIE,
+		SYNCER_DIED
+	} mnt_sync_state;
+	int		mnt_sync_attached;
 };
 
 struct vnode *__mnt_vnode_next(struct vnode **mvp, struct mount *mp);
@@ -207,6 +224,7 @@
 	MNT_IUNLOCK(mp);						\
 } while (0)
 
+#define	ASSERT_MP_ILOCKED(mp)	mtx_assert(&(mp)->mnt_mtx, MA_OWNED)
 #define	MNT_ILOCK(mp)	mtx_lock(&(mp)->mnt_mtx)
 #define	MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx)
 #define	MNT_IUNLOCK(mp)	mtx_unlock(&(mp)->mnt_mtx)
@@ -724,6 +742,10 @@
 		    const char *value);
 int	vfs_setpublicfs			    /* set publicly exported fs */
 	    (struct mount *, struct netexport *, struct export_args *);
+void	vfs_syncer_attach(struct mount *mp, struct vfsconf *vfsp);
+void	vfs_syncer_detach(struct mount *mp);
+void	vfs_syncer_destroy(struct mount *mp);
+void	vfs_syncer_init(struct mount *mp);
 void	vfs_msync(struct mount *, int);
 int	vfs_busy(struct mount *, int);
 int	vfs_export			    /* process mount export info */
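
For review convenience, a rough sketch of the intended per-mount syncer lifecycle follows. It is an editor-added illustration, not part of the patch; the function name below is hypothetical and simply strings together the vfs_syncer_*() entry points added above in the order the mount code is expected to call them.

/*
 * Hypothetical illustration only: the per-mount syncer lifecycle as wired
 * up by the hunks above.
 */
static void
example_mount_syncer_lifecycle(struct mount *mp, struct vfsconf *vfsp)
{
	vfs_syncer_init(mp);		/* vfs_mount_alloc(): set up queues and condvar, no thread yet */
	vfs_syncer_attach(mp, vfsp);	/* mount goes read-write: start the "syncer %s:%d" kthread */
	vfs_syncer_detach(mp);		/* unmount or downgrade to read-only: wait for SYNCER_DIED */
	vfs_syncer_destroy(mp);		/* vfs_mount_destroy(): free the work queues and condvar */
}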