Index: kern/vfs_bio.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v retrieving revision 1.541 diff -p -u -r1.541 vfs_bio.c --- kern/vfs_bio.c 22 Mar 2008 09:15:14 -0000 1.541 +++ kern/vfs_bio.c 26 Mar 2008 00:14:06 -0000 @@ -251,8 +251,8 @@ static struct mtx nblock; /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; -/* Lock for the bufqueues */ -static struct mtx bqlock; +/* Lock for each bufqueue. */ +static struct mtx bqlock[BUFFER_QUEUES]; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. @@ -279,7 +279,7 @@ static __inline void numdirtywakeup(int level) { - if (numdirtybuffers <= level) { + if (numdirtybuffers <= level && needsbuffer) { mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; @@ -302,6 +302,8 @@ static __inline void bufspacewakeup(void) { + if (needsbuffer == 0) + return; /* * If someone is waiting for BUF space, wake them up. 
Even * though we haven't freed the kva space yet, the waiting @@ -323,16 +325,18 @@ void runningbufwakeup(struct buf *bp) { - if (bp->b_runningbufspace) { - atomic_subtract_int(&runningbufspace, bp->b_runningbufspace); - bp->b_runningbufspace = 0; - mtx_lock(&rbreqlock); - if (runningbufreq && runningbufspace <= lorunningspace) { - runningbufreq = 0; - wakeup(&runningbufreq); - } - mtx_unlock(&rbreqlock); + if (bp->b_runningbufspace == 0) + return; + atomic_subtract_int(&runningbufspace, bp->b_runningbufspace); + bp->b_runningbufspace = 0; + if (runningbufreq == 0) + return; + mtx_lock(&rbreqlock); + if (runningbufreq && runningbufspace <= lorunningspace) { + runningbufreq = 0; + wakeup(&runningbufreq); } + mtx_unlock(&rbreqlock); } /* @@ -349,6 +353,8 @@ bufcountwakeup(void) { atomic_add_int(&numfreebuffers, 1); + if (needsbuffer == 0) + return; mtx_lock(&nblock); if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; @@ -378,6 +384,8 @@ void waitrunningbufspace(void) { + if (runningbufspace < hirunningspace) + return; mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { ++runningbufreq; @@ -525,14 +533,15 @@ bufinit(void) struct buf *bp; int i; - mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); /* next, make a null set of free lists */ - for (i = 0; i < BUFFER_QUEUES; i++) + for (i = 0; i < BUFFER_QUEUES; i++) { + mtx_init(&bqlock[i], "buf queue lock", NULL, MTX_DEF); TAILQ_INIT(&bufqueues[i]); + } /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { @@ -667,9 +676,12 @@ bremfree(struct buf *bp) void bremfreef(struct buf *bp) { - mtx_lock(&bqlock); + struct mtx *mtxp; + + mtxp = &bqlock[bp->b_qindex]; + mtx_lock(mtxp); bremfreel(bp); - mtx_unlock(&bqlock); + mtx_unlock(mtxp); } /* @@ -686,7 +698,7 @@ bremfreel(struct buf *bp) 
KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); BUF_ASSERT_HELD(bp); - mtx_assert(&bqlock, MA_OWNED); + mtx_assert(&bqlock[bp->b_qindex], MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; @@ -763,6 +775,36 @@ breada(struct vnode * vp, daddr_t * rabl * read-ahead blocks. */ int +breads(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, + struct buf **bpp) +{ + struct buf *bp; + + CTR3(KTR_BUF, "breads(%p, %jd, %d)", vp, blkno, size); + *bpp = bp = getblk(vp, blkno, size, 0, 0, GB_LOCK_CANSHARE); + + /* Not valid in cache: start a read and wait for it to complete. */ + if ((bp->b_flags & B_CACHE) == 0) { + if (!TD_IS_IDLETHREAD(curthread)) + curthread->td_ru.ru_inblock++; + bp->b_iocmd = BIO_READ; + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + if (bp->b_rcred == NOCRED && cred != NOCRED) + bp->b_rcred = crhold(cred); + vfs_busy_pages(bp, 0); + bp->b_iooffset = dbtob(bp->b_blkno); + bstrategy(bp); + return (bufwait(bp)); + } + return (0); +} + +/* + * Operates like bread, but also starts asynchronous I/O on + * read-ahead blocks. + */ +int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf **bpp) @@ -1145,11 +1187,18 @@ buf_dirty_count_severe(void) void brelse(struct buf *bp) { + int tail; + CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + if (BUF_ISLOCKED(bp) == LK_SHARED) { + BUF_UNLOCK(bp); + return; + } + if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; @@ -1334,13 +1383,12 @@ brelse(struct buf *bp) } /* enqueue */ - mtx_lock(&bqlock); /* Handle delayed bremfree() processing. 
*/ if (bp->b_flags & B_REMFREE) - bremfreel(bp); + bremfreef(bp); if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); - + tail = 0; /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; @@ -1352,7 +1400,6 @@ brelse(struct buf *bp) } else { bp->b_qindex = QUEUE_EMPTY; } - TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { @@ -1361,7 +1408,6 @@ brelse(struct buf *bp) if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); bp->b_qindex = QUEUE_CLEAN; - TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); /* remaining buffers */ } else { if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) == @@ -1371,12 +1417,15 @@ brelse(struct buf *bp) bp->b_qindex = QUEUE_DIRTY; else bp->b_qindex = QUEUE_CLEAN; - if (bp->b_flags & B_AGE) - TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); - else - TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); + if ((bp->b_flags & B_AGE) == 0) + tail = 1; } - mtx_unlock(&bqlock); + mtx_lock(&bqlock[bp->b_qindex]); + if (tail) + TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); + else + TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); + mtx_unlock(&bqlock[bp->b_qindex]); /* * If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already @@ -1432,27 +1481,23 @@ bqrelse(struct buf *bp) KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + if (BUF_ISLOCKED(bp) == LK_SHARED) { + BUF_UNLOCK(bp); + return; + } if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } - + /* Handle delayed bremfree() processing. 
*/ + if (bp->b_flags & B_REMFREE) + bremfreef(bp); if (bp->b_flags & B_MANAGED) { - if (bp->b_flags & B_REMFREE) { - mtx_lock(&bqlock); - bremfreel(bp); - mtx_unlock(&bqlock); - } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); BUF_UNLOCK(bp); return; } - - mtx_lock(&bqlock); - /* Handle delayed bremfree() processing. */ - if (bp->b_flags & B_REMFREE) - bremfreel(bp); if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); /* buffers with stale but valid contents */ @@ -1461,7 +1506,6 @@ bqrelse(struct buf *bp) bp->b_qindex = QUEUE_DIRTY_GIANT; else bp->b_qindex = QUEUE_DIRTY; - TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); } else { /* * The locking of the BO_LOCK for checking of the @@ -1472,20 +1516,19 @@ bqrelse(struct buf *bp) */ if (!vm_page_count_severe() || (bp->b_vflags & BV_BKGRDINPROG)) { bp->b_qindex = QUEUE_CLEAN; - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, - b_freelist); } else { /* * We are too low on memory, we have to try to free * the buffer (most importantly: the wired pages * making up its backing store) *now*. */ - mtx_unlock(&bqlock); brelse(bp); return; } } - mtx_unlock(&bqlock); + mtx_lock(&bqlock[bp->b_qindex]); + TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); + mtx_unlock(&bqlock[bp->b_qindex]); if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) bufcountwakeup(); @@ -1692,7 +1735,7 @@ getnewbuf(int slpflag, int slptimeo, int struct buf *bp; struct buf *nbp; int defrag = 0; - int nqindex; + int qindex; int waiters = 0; static int flushingbufs; @@ -1720,56 +1763,52 @@ restart: * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ - mtx_lock(&bqlock); - nqindex = QUEUE_EMPTYKVA; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); - - if (nbp == NULL) { - /* - * If no EMPTYKVA buffers and we are either - * defragging or reusing, locate a CLEAN buffer - * to free or reuse. 
If bufspace useage is low - * skip this step so we can allocate a new buffer. - */ - if (defrag || bufspace >= lobufspace) { - nqindex = QUEUE_CLEAN; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); - } - - /* - * If we could not find or were not allowed to reuse a - * CLEAN buffer, check to see if it is ok to use an EMPTY - * buffer. We can only use an EMPTY buffer if allocating - * its KVA would not otherwise run us out of buffer space. - */ - if (nbp == NULL && defrag == 0 && - bufspace + maxsize < hibufspace) { - nqindex = QUEUE_EMPTY; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); - } + bp = NULL; + qindex = QUEUE_EMPTYKVA; + mtx_lock(&bqlock[qindex]); + nbp = TAILQ_FIRST(&bufqueues[qindex]); + /* + * If no EMPTYKVA buffers and we are either + * defragging or reusing, locate a CLEAN buffer + * to free or reuse. If bufspace useage is low + * skip this step so we can allocate a new buffer. + */ + if (nbp == NULL && (defrag || bufspace >= lobufspace)) { + mtx_unlock(&bqlock[qindex]); + qindex = QUEUE_CLEAN; + mtx_lock(&bqlock[qindex]); + nbp = TAILQ_FIRST(&bufqueues[qindex]); + } + /* + * If we could not find or were not allowed to reuse a + * CLEAN buffer, check to see if it is ok to use an EMPTY + * buffer. We can only use an EMPTY buffer if allocating + * its KVA would not otherwise run us out of buffer space. + */ + if (nbp == NULL && defrag == 0 && bufspace + maxsize < hibufspace) { + mtx_unlock(&bqlock[qindex]); + qindex = QUEUE_EMPTY; + mtx_lock(&bqlock[qindex]); + nbp = TAILQ_FIRST(&bufqueues[qindex]); } - /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ - - while ((bp = nbp) != NULL) { - int qindex = nqindex; - - /* - * Calculate next bp ( we can only use it if we do not block - * or do other fancy things ). 
- */ - if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { + for (;;) { + if (nbp == NULL) { switch(qindex) { case QUEUE_EMPTY: - nqindex = QUEUE_EMPTYKVA; + mtx_unlock(&bqlock[qindex]); + qindex = QUEUE_EMPTYKVA; + mtx_lock(&bqlock[qindex]); if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) break; /* FALLTHROUGH */ case QUEUE_EMPTYKVA: - nqindex = QUEUE_CLEAN; + mtx_unlock(&bqlock[qindex]); + qindex = QUEUE_CLEAN; + mtx_lock(&bqlock[qindex]); if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) break; /* FALLTHROUGH */ @@ -1780,6 +1819,14 @@ restart: break; } } + if (nbp == NULL) + break; + bp = nbp; + /* + * Calculate next bp ( we can only use it if we do not block + * or do other fancy things ). + */ + nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * b_kvasize != 0. XXX this situation should no longer @@ -1822,9 +1869,8 @@ restart: */ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); - bremfreel(bp); - mtx_unlock(&bqlock); + mtx_unlock(&bqlock[qindex]); if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { @@ -1937,6 +1983,7 @@ restart: int flags; char *waitmsg; + mtx_unlock(&bqlock[qindex]); if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; waitmsg = "nbufkv"; @@ -1950,7 +1997,6 @@ restart: mtx_lock(&nblock); needsbuffer |= flags; mtx_unlock(&nblock); - mtx_unlock(&bqlock); bd_speedup(); /* heeeelp */ @@ -2136,7 +2182,7 @@ flushbufqueues(int queue, int flushdeps) target /= 2; flushed = 0; bp = NULL; - mtx_lock(&bqlock); + mtx_lock(&bqlock[queue]); TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist); while (flushed != target) { bp = TAILQ_FIRST(&bufqueues[queue]); @@ -2161,11 +2207,11 @@ flushbufqueues(int queue, int flushdeps) BO_UNLOCK(bp->b_bufobj); if (bp->b_flags & B_INVAL) { bremfreel(bp); - mtx_unlock(&bqlock); + mtx_unlock(&bqlock[queue]); brelse(bp); flushed++; numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); - mtx_lock(&bqlock); + mtx_lock(&bqlock[queue]); 
continue; } @@ -2193,7 +2239,7 @@ flushbufqueues(int queue, int flushdeps) continue; } if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { - mtx_unlock(&bqlock); + mtx_unlock(&bqlock[queue]); CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); vfs_bio_awrite(bp); @@ -2203,14 +2249,14 @@ flushbufqueues(int queue, int flushdeps) flushed++; waitrunningbufspace(); numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); - mtx_lock(&bqlock); + mtx_lock(&bqlock[queue]); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist); - mtx_unlock(&bqlock); + mtx_unlock(&bqlock[queue]); return (flushed); } @@ -2449,8 +2495,11 @@ loop: * Buffer is in-core. If the buffer is not busy, it must * be on a queue. */ - lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; - + lockflags = LK_SLEEPFAIL | LK_INTERLOCK; + if (flags & GB_LOCK_CANSHARE) + lockflags |= LK_SHARED; + else + lockflags |= LK_EXCLUSIVE; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; @@ -2467,6 +2516,14 @@ loop: else if (error) return (NULL); + if (flags & GB_LOCK_CANSHARE && + ((bp->b_flags & (B_INVAL | B_CACHE | B_DELWRI)) != B_CACHE || + bp->b_bcount != size)) { + BUF_UNLOCK(bp); + flags &= ~GB_LOCK_CANSHARE; + goto loop; + } + /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set @@ -2477,7 +2534,8 @@ loop: bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; - bremfree(bp); + if ((flags & GB_LOCK_CANSHARE) == 0) + bremfree(bp); /* * check for size inconsistancies for non-VMIO case. 
Index: kern/vfs_subr.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_subr.c,v retrieving revision 1.725 diff -p -u -r1.725 vfs_subr.c --- kern/vfs_subr.c 24 Mar 2008 04:22:58 -0000 1.725 +++ kern/vfs_subr.c 26 Mar 2008 00:14:06 -0000 @@ -272,15 +272,6 @@ SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhe &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* - * Macros to control when a vnode is freed and recycled. All require - * the vnode interlock. - */ -#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) -#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) -#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt) - - -/* * Initialize the vnode management data structures. */ #ifndef MAXVNODES_MAX @@ -685,7 +676,7 @@ vnlru_free(int count) TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); continue; } - VNASSERT(VCANRECYCLE(vp), vp, + VNASSERT((vp->v_iflag & VI_FREE), vp, ("vp inconsistent on freelist")); freevnodes--; vp->v_iflag &= ~VI_FREE; @@ -2040,12 +2031,18 @@ vget(struct vnode *vp, int flags, struct { int error; - error = 0; VFS_ASSERT_GIANT(vp->v_mount); - if ((flags & LK_INTERLOCK) == 0) - VI_LOCK(vp); - vholdl(vp); - if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) { + error = 0; + + /* + * Grab a hold count so the vnode can't disappear while we're + * locking it. + */ + if (flags & LK_INTERLOCK) + vholdl(vp); + else + vhold(vp); + if ((error = vn_lock(vp, flags)) != 0) { vdrop(vp); return (error); } @@ -2225,17 +2222,27 @@ void vhold(struct vnode *vp) { + /* + * Increment the holdcnt. If we acquire the first ref we need + * to lock the vnode interlock and remove the vnode from the + * free list. 
+ */ + if (atomic_fetchadd_int(&vp->v_holdcnt, 1) != 0) + return; VI_LOCK(vp); - vholdl(vp); + if (vp->v_iflag & VI_FREE) + vbusy(vp); VI_UNLOCK(vp); } void vholdl(struct vnode *vp) { + ASSERT_VI_LOCKED(vp, "vholdl"); - vp->v_holdcnt++; - if (VSHOULDBUSY(vp)) + if (atomic_fetchadd_int(&vp->v_holdcnt, 1) != 0) + return; + if (vp->v_iflag & VI_FREE) vbusy(vp); } @@ -2247,8 +2254,17 @@ void vdrop(struct vnode *vp) { + if (atomic_fetchadd_int(&vp->v_holdcnt, -1) != 1) + return; VI_LOCK(vp); - vdropl(vp); + if (vp->v_holdcnt == 0 && (vp->v_iflag & VI_FREE) == 0) { + if (vp->v_iflag & VI_DOOMED) { + vdestroy(vp); + return; + } else + vfree(vp); + } + VI_UNLOCK(vp); } /* @@ -2263,8 +2279,8 @@ vdropl(struct vnode *vp) ASSERT_VI_LOCKED(vp, "vdropl"); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); - vp->v_holdcnt--; - if (vp->v_holdcnt == 0) { + if (atomic_fetchadd_int(&vp->v_holdcnt, -1) == 1 && + (vp->v_iflag & VI_FREE) == 0) { if (vp->v_iflag & VI_DOOMED) { vdestroy(vp); return; @@ -3004,7 +3020,8 @@ vfree(struct vnode *vp) mtx_lock(&vnode_free_list_mtx); VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); - VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't")); + VNASSERT(!(vp->v_iflag & VI_FREE) && vp->v_holdcnt == 0, vp, + ("vfree: freeing when we shouldn't")); VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp, ("vfree: Freeing doomed vnode")); if (vp->v_iflag & VI_AGE) { Index: kern/vfs_vnops.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_vnops.c,v retrieving revision 1.259 diff -p -u -r1.259 vfs_vnops.c --- kern/vfs_vnops.c 24 Mar 2008 04:17:35 -0000 1.259 +++ kern/vfs_vnops.c 26 Mar 2008 00:14:07 -0000 @@ -862,18 +862,9 @@ _vn_lock(struct vnode *vp, int flags, ch { int error; - /* - * With no lock type requested we're just polling for validity. 
- */ - if ((flags & LK_TYPE_MASK) == 0) { - error = 0; - if ((flags & LK_INTERLOCK) == 0) - VI_LOCK(vp); - if (vp->v_iflag & VI_DOOMED) - error = ENOENT; - VI_UNLOCK(vp); - return (error); - } + KASSERT((flags & LK_TYPE_MASK) != 0, + ("vn_lock: No lock type specified.")); + do { error = VOP_LOCK1(vp, flags, file, line); flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */ Index: sys/buf.h =================================================================== RCS file: /home/ncvs/src/sys/sys/buf.h,v retrieving revision 1.204 diff -p -u -r1.204 buf.h --- sys/buf.h 22 Mar 2008 09:15:16 -0000 1.204 +++ sys/buf.h 26 Mar 2008 00:14:12 -0000 @@ -472,6 +472,7 @@ buf_countdeps(struct buf *bp, int i) */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ +#define GB_LOCK_CANSHARE 0x0004 /* Buf lock may be acquired shared. */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ @@ -501,6 +502,7 @@ int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */ int bread(struct vnode *, daddr_t, int, struct ucred *, struct buf **); +int breads(struct vnode *, daddr_t, int, struct ucred *, struct buf **); void breada(struct vnode *, daddr_t *, int *, int, struct ucred *); int breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **); Index: sys/vnode.h =================================================================== RCS file: /home/ncvs/src/sys/sys/vnode.h,v retrieving revision 1.333 diff -p -u -r1.333 vnode.h --- sys/vnode.h 24 Mar 2008 04:11:40 -0000 1.333 +++ sys/vnode.h 26 Mar 2008 00:14:12 -0000 @@ -151,7 +151,7 @@ struct vnode { struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ - int v_holdcnt; /* i prevents recycling. */ + volatile int v_holdcnt; /* i prevents recycling. 
*/ int v_usecount; /* i ref count of users */ u_long v_iflag; /* i vnode flags (see below) */ u_long v_vflag; /* v vnode flags */ Index: ufs/ffs/ffs_snapshot.c =================================================================== RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_snapshot.c,v retrieving revision 1.141 diff -p -u -r1.141 ffs_snapshot.c --- ufs/ffs/ffs_snapshot.c 19 Mar 2008 06:19:01 -0000 1.141 +++ ufs/ffs/ffs_snapshot.c 26 Mar 2008 00:14:12 -0000 @@ -122,6 +122,11 @@ ffs_copyonwrite(devvp, bp) return (EINVAL); } +void +ffs_snapdata_free(struct inode *ip) +{ +} + #else TAILQ_HEAD(snaphead, inode); @@ -162,7 +167,8 @@ static int mapacct_ufs2(struct vnode *, struct fs *, ufs_lbn_t, int); static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); static void process_deferred_inactive(struct mount *); -static void try_free_snapdata(struct vnode *devvp, struct thread *td); +static void try_free_snapdata(struct vnode *devvp, struct vnode *vp); +static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp); static int ffs_bp_snapblk(struct vnode *, struct buf *); /* @@ -603,34 +609,12 @@ loop: } MNT_IUNLOCK(mp); /* - * If there already exist snapshots on this filesystem, grab a - * reference to their shared lock. If this is the first snapshot - * on this filesystem, we need to allocate a lock for the snapshots - * to share. In either case, acquire the snapshot lock and give - * up our original private lock. + * Acquire a lock on the snapdata structure, creating it if necessary. 
*/ - VI_LOCK(devvp); - sn = devvp->v_rdev->si_snapdata; - if (sn != NULL) { - xp = TAILQ_FIRST(&sn->sn_head); - VI_UNLOCK(devvp); - VI_LOCK(vp); - vp->v_vnlock = &sn->sn_lock; - } else { - VI_UNLOCK(devvp); - sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); - TAILQ_INIT(&sn->sn_head); - lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, - LK_CANRECURSE | LK_NOSHARE); - VI_LOCK(vp); - vp->v_vnlock = &sn->sn_lock; - mp_fixme("si_snapdata setting is racey."); - devvp->v_rdev->si_snapdata = sn; - xp = NULL; - } - lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, - VI_MTX(vp)); + sn = ffs_snapdata_acquire(devvp); + vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); + xp = TAILQ_FIRST(&sn->sn_head); /* * If this is the first snapshot on this filesystem, then we need * to allocate the space for the list of preallocated snapshot blocks. @@ -1566,7 +1550,6 @@ ffs_snapremove(vp) struct vnode *devvp; struct buf *ibp; struct fs *fs; - struct thread *td = curthread; ufs2_daddr_t numblks, blkno, dblk; int error, loc, last; struct snapdata *sn; @@ -1588,16 +1571,14 @@ ffs_snapremove(vp) ip->i_nextsnap.tqe_prev = 0; VI_UNLOCK(devvp); lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); - VI_LOCK(vp); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapremove: lost lock mutation")); vp->v_vnlock = &vp->v_lock; - VI_UNLOCK(vp); VI_LOCK(devvp); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); - try_free_snapdata(devvp, td); - } else - VI_UNLOCK(devvp); + try_free_snapdata(devvp, vp); + } + VI_UNLOCK(devvp); /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). 
@@ -1904,7 +1885,7 @@ ffs_snapshot_mount(mp) */ vp = NULL; lastvp = NULL; - sn = devvp->v_rdev->si_snapdata; + sn = NULL; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; @@ -1937,30 +1918,11 @@ ffs_snapshot_mount(mp) continue; } /* - * If there already exist snapshots on this filesystem, grab a - * reference to their shared lock. If this is the first snapshot - * on this filesystem, we need to allocate a lock for the - * snapshots to share. In either case, acquire the snapshot - * lock and give up our original private lock. + * Acquire a lock on the snapdata structure, creating it if + * necessary. */ - VI_LOCK(devvp); - if (sn != NULL) { - - VI_UNLOCK(devvp); - VI_LOCK(vp); - vp->v_vnlock = &sn->sn_lock; - } else { - VI_UNLOCK(devvp); - sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); - TAILQ_INIT(&sn->sn_head); - lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, - LK_CANRECURSE | LK_NOSHARE); - VI_LOCK(vp); - vp->v_vnlock = &sn->sn_lock; - devvp->v_rdev->si_snapdata = sn; - } - lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, - VI_MTX(vp)); + sn = ffs_snapdata_acquire(devvp); + vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); /* * Link it onto the active snapshot list. @@ -1980,7 +1942,7 @@ ffs_snapshot_mount(mp) /* * No usable snapshots found. */ - if (vp == NULL) + if (sn == NULL || vp == NULL) return; /* * Allocate the space for the block hints list. 
We always want to @@ -2035,7 +1997,6 @@ ffs_snapshot_unmount(mp) struct snapdata *sn; struct inode *xp; struct vnode *vp; - struct thread *td = curthread; VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; @@ -2045,13 +2006,13 @@ ffs_snapshot_unmount(mp) xp->i_nextsnap.tqe_prev = 0; lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE, VI_MTX(devvp)); - VI_LOCK(vp); - lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE, VI_MTX(vp)); - VI_LOCK(vp); + lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapshot_unmount: lost lock mutation")); vp->v_vnlock = &vp->v_lock; - VI_UNLOCK(vp); + VI_LOCK(devvp); + try_free_snapdata(devvp, vp); + VI_UNLOCK(devvp); lockmgr(&vp->v_lock, LK_RELEASE, NULL); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); if (xp->i_effnlink > 0) @@ -2059,7 +2020,6 @@ ffs_snapshot_unmount(mp) VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; } - try_free_snapdata(devvp, td); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); } @@ -2134,12 +2094,12 @@ ffs_bdflush(bo, bp) * Try to find a buffer to flush. 
*/ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { - if ((nbp->b_vflags & BV_BKGRDINPROG) || - BUF_LOCK(nbp, - LK_EXCLUSIVE | LK_NOWAIT, NULL)) + if (nbp->b_vflags & BV_BKGRDINPROG) continue; if (bp == nbp) panic("bdwrite: found ourselves"); + if (BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) + continue; BO_UNLOCK(bo); /* * Don't countdeps with the bo lock @@ -2488,26 +2448,66 @@ process_deferred_inactive(struct mount * /* Try to free snapdata associated with devvp */ static void -try_free_snapdata(struct vnode *devvp, - struct thread *td) +try_free_snapdata(struct vnode *devvp, struct vnode *vp) { struct snapdata *sn; - ufs2_daddr_t *snapblklist; + ASSERT_VI_LOCKED(devvp, "try_free_snapdata"); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || - (devvp->v_vflag & VV_COPYONWRITE) == 0) { - VI_UNLOCK(devvp); + (devvp->v_vflag & VV_COPYONWRITE) == 0) return; + devvp->v_vflag &= ~VV_COPYONWRITE; + devvp->v_rdev->si_snapdata = NULL; + VTOI(vp)->i_snapdata = sn; +} + +static struct snapdata * +ffs_snapdata_acquire(struct vnode *devvp) +{ + struct snapdata *nsn; + struct snapdata *sn; + + /* Allocate ahead of time to simplify the logic below. */ + nsn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); + /* + * If there already exist snapshots on this filesystem, grab a + * reference to their shared lock. If this is the first snapshot + * on this filesystem, we need to allocate a lock for the + * snapshots to share. In either case, acquire the snapshot + * lock and return the snapdata. 
+ */ + VI_LOCK(devvp); + sn = devvp->v_rdev->si_snapdata; + if (sn == NULL) { + sn = nsn; + nsn = NULL; + TAILQ_INIT(&sn->sn_head); + lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, + LK_CANRECURSE | LK_NOSHARE); + devvp->v_rdev->si_snapdata = sn; } + lockmgr(&sn->sn_lock, + LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(devvp)); + if (nsn) + free(nsn, M_UFSMNT); - devvp->v_rdev->si_snapdata = NULL; - devvp->v_vflag &= ~VV_COPYONWRITE; + return (sn); +} + +void +ffs_snapdata_free(struct inode *ip) +{ + ufs2_daddr_t *snapblklist; + struct snapdata *sn; + + sn = ip->i_snapdata; + ip->i_snapdata = NULL; snapblklist = sn->sn_blklist; sn->sn_blklist = NULL; sn->sn_listsize = 0; - lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); + lockmgr(&sn->sn_lock, LK_DRAIN, NULL); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); lockdestroy(&sn->sn_lock); free(sn, M_UFSMNT); Index: ufs/ffs/ffs_vnops.c =================================================================== RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_vnops.c,v retrieving revision 1.181 diff -p -u -r1.181 ffs_vnops.c --- ufs/ffs/ffs_vnops.c 22 Mar 2008 09:15:16 -0000 1.181 +++ ufs/ffs/ffs_vnops.c 26 Mar 2008 00:14:12 -0000 @@ -361,16 +361,6 @@ ffs_lock(ap) vp = ap->a_vp; flags = ap->a_flags; for (;;) { - /* - * vnode interlock must be held to ensure that - * the possibly external lock isn't freed, - * e.g. when mutating from snapshot file vnode - * to regular file vnode. - */ - if ((flags & LK_INTERLOCK) == 0) { - VI_LOCK(vp); - flags |= LK_INTERLOCK; - } lkp = vp->v_vnlock; result = _lockmgr_args(lkp, flags, VI_MTX(vp), LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, @@ -385,9 +375,12 @@ ffs_lock(ap) * right lock. Release it, and try to get the * new lock. 
*/ - (void) _lockmgr_args(lkp, LK_RELEASE, VI_MTX(vp), + (void) _lockmgr_args(lkp, LK_RELEASE, NULL, LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file, ap->a_line); + if ((flags & (LK_INTERLOCK | LK_NOWAIT)) == + (LK_INTERLOCK | LK_NOWAIT)) + return (EBUSY); if ((flags & LK_TYPE_MASK) == LK_UPGRADE) flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; flags &= ~LK_INTERLOCK; @@ -501,6 +494,7 @@ ffs_read(ap) if (bytesinfile < xfersize) xfersize = bytesinfile; + if (ioflag & (IO_VMIO|IO_DIRECT)) { if (lblktosize(fs, nextlbn) >= ip->i_size) { /* * Don't do readahead if this is the end of the file. @@ -536,6 +530,9 @@ ffs_read(ap) */ error = bread(vp, lbn, size, NOCRED, &bp); } + } else { + error = breads(vp, lbn, size, NOCRED, &bp); + } if (error) { brelse(bp); bp = NULL; Index: ufs/ufs/inode.h =================================================================== RCS file: /home/ncvs/src/sys/ufs/ufs/inode.h,v retrieving revision 1.51 diff -p -u -r1.51 inode.h --- ufs/ufs/inode.h 10 Oct 2006 09:20:54 -0000 1.51 +++ ufs/ufs/inode.h 26 Mar 2008 00:14:12 -0000 @@ -42,6 +42,8 @@ #include #include +struct snapdata; + /* * This must agree with the definition in . */ @@ -89,6 +91,7 @@ struct inode { union { struct dirhash *dirhash; /* Hashing for large directories. */ daddr_t *snapblklist; /* Collect expunged snapshot blocks. */ + struct snapdata *snapdata; /* snapdata that must be reclaimed. 
*/ } i_un; /* @@ -133,6 +136,7 @@ struct inode { #define i_umbufobj i_ump->um_bo #define i_dirhash i_un.dirhash #define i_snapblklist i_un.snapblklist +#define i_snapdata i_un.snapdata #define i_din1 dinode_u.din1 #define i_din2 dinode_u.din2 Index: ufs/ufs/ufs_extern.h =================================================================== RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_extern.h,v retrieving revision 1.55 diff -p -u -r1.55 ufs_extern.h --- ufs/ufs/ufs_extern.h 14 Mar 2005 10:21:16 -0000 1.55 +++ ufs/ufs/ufs_extern.h 26 Mar 2008 00:14:12 -0000 @@ -78,6 +78,7 @@ int ufs_lookup(struct vop_cachedlookup_ int ufs_readdir(struct vop_readdir_args *); int ufs_reclaim(struct vop_reclaim_args *); void ffs_snapgone(struct inode *); +void ffs_snapdata_free(struct inode *); vfs_root_t ufs_root; int ufs_uninit(struct vfsconf *); int ufs_vinit(struct mount *, struct vop_vector *, struct vnode **); Index: ufs/ufs/ufs_inode.c =================================================================== RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_inode.c,v retrieving revision 1.69 diff -p -u -r1.69 ufs_inode.c --- ufs/ufs/ufs_inode.c 22 Jun 2007 13:22:37 -0000 1.69 +++ ufs/ufs/ufs_inode.c 26 Mar 2008 00:14:12 -0000 @@ -213,9 +213,13 @@ ufs_reclaim(ap) } #endif #ifdef UFS_DIRHASH - if (ip->i_dirhash != NULL) + if (vp->v_type == VDIR && ip->i_dirhash != NULL) ufsdirhash_free(ip); #endif +#ifdef FFS + if (vp->v_type == VREG && ip->i_snapdata != NULL) + ffs_snapdata_free(ip); +#endif /* * Lock the clearing of v_data so ffs_lock() can inspect it * prior to obtaining the lock. 
Index: vm/vm_object.c =================================================================== RCS file: /home/ncvs/src/sys/vm/vm_object.c,v retrieving revision 1.392 diff -p -u -r1.392 vm_object.c --- vm/vm_object.c 26 Feb 2008 17:16:48 -0000 1.392 +++ vm/vm_object.c 26 Mar 2008 00:14:12 -0000 @@ -364,22 +364,11 @@ vm_object_allocate(objtype_t type, vm_pi void vm_object_reference(vm_object_t object) { - struct vnode *vp; - if (object == NULL) return; VM_OBJECT_LOCK(object); - object->ref_count++; - if (object->type == OBJT_VNODE) { - int vfslocked; - - vp = object->handle; - VM_OBJECT_UNLOCK(object); - vfslocked = VFS_LOCK_GIANT(vp->v_mount); - vget(vp, LK_RETRY, curthread); - VFS_UNLOCK_GIANT(vfslocked); - } else - VM_OBJECT_UNLOCK(object); + vm_object_reference_locked(object); + VM_OBJECT_UNLOCK(object); } /* @@ -395,8 +384,6 @@ vm_object_reference_locked(vm_object_t o struct vnode *vp; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); - KASSERT((object->flags & OBJ_DEAD) == 0, - ("vm_object_reference_locked: dead object referenced")); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle;