Index: kern/vfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.541
diff -p -u -r1.541 vfs_bio.c
--- kern/vfs_bio.c	22 Mar 2008 09:15:14 -0000	1.541
+++ kern/vfs_bio.c	24 Mar 2008 04:51:07 -0000
@@ -251,8 +251,8 @@ static struct mtx nblock;
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
-/* Lock for the bufqueues */
-static struct mtx bqlock;
+/* Lock for each bufqueue. */
+static struct mtx bqlock[BUFFER_QUEUES];
 
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
@@ -279,7 +279,7 @@ static __inline void
 numdirtywakeup(int level)
 {
 
-        if (numdirtybuffers <= level) {
+        if (numdirtybuffers <= level && needsbuffer) {
                 mtx_lock(&nblock);
                 if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
                         needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
@@ -302,6 +302,8 @@ static __inline void
 bufspacewakeup(void)
 {
 
+        if (needsbuffer == 0)
+                return;
         /*
          * If someone is waiting for BUF space, wake them up.  Even
          * though we haven't freed the kva space yet, the waiting
@@ -323,16 +325,18 @@ void
 runningbufwakeup(struct buf *bp)
 {
 
-        if (bp->b_runningbufspace) {
-                atomic_subtract_int(&runningbufspace, bp->b_runningbufspace);
-                bp->b_runningbufspace = 0;
-                mtx_lock(&rbreqlock);
-                if (runningbufreq && runningbufspace <= lorunningspace) {
-                        runningbufreq = 0;
-                        wakeup(&runningbufreq);
-                }
-                mtx_unlock(&rbreqlock);
+        if (bp->b_runningbufspace == 0)
+                return;
+        atomic_subtract_int(&runningbufspace, bp->b_runningbufspace);
+        bp->b_runningbufspace = 0;
+        if (runningbufreq == 0)
+                return;
+        mtx_lock(&rbreqlock);
+        if (runningbufreq && runningbufspace <= lorunningspace) {
+                runningbufreq = 0;
+                wakeup(&runningbufreq);
         }
+        mtx_unlock(&rbreqlock);
 }
 
 /*
@@ -349,6 +353,8 @@ bufcountwakeup(void)
 {
 
         atomic_add_int(&numfreebuffers, 1);
+        if (needsbuffer == 0)
+                return;
         mtx_lock(&nblock);
         if (needsbuffer) {
                 needsbuffer &= ~VFS_BIO_NEED_ANY;
@@ -378,6 +384,8 @@ void
 waitrunningbufspace(void)
 {
 
+        if (runningbufspace < hirunningspace)
+                return;
         mtx_lock(&rbreqlock);
         while (runningbufspace > hirunningspace) {
                 ++runningbufreq;
@@ -525,14 +533,15 @@ bufinit(void)
         struct buf *bp;
         int i;
 
-        mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
         mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
         mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
         mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 
         /* next, make a null set of free lists */
-        for (i = 0; i < BUFFER_QUEUES; i++)
+        for (i = 0; i < BUFFER_QUEUES; i++) {
+                mtx_init(&bqlock[i], "buf queue lock", NULL, MTX_DEF);
                 TAILQ_INIT(&bufqueues[i]);
+        }
 
         /* finally, initialize each buffer header and stick on empty q */
         for (i = 0; i < nbuf; i++) {
@@ -667,9 +676,12 @@ bremfree(struct buf *bp)
 void
 bremfreef(struct buf *bp)
 {
-        mtx_lock(&bqlock);
+        struct mtx *mtxp;
+
+        mtxp = &bqlock[bp->b_qindex];
+        mtx_lock(mtxp);
         bremfreel(bp);
-        mtx_unlock(&bqlock);
+        mtx_unlock(mtxp);
 }
 
 /*
@@ -686,7 +698,7 @@ bremfreel(struct buf *bp)
         KASSERT(bp->b_qindex != QUEUE_NONE,
             ("bremfreel: buffer %p not on a queue.", bp));
         BUF_ASSERT_HELD(bp);
-        mtx_assert(&bqlock, MA_OWNED);
+        mtx_assert(&bqlock[bp->b_qindex], MA_OWNED);
 
         TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
         bp->b_qindex = QUEUE_NONE;
@@ -1145,11 +1187,18 @@ buf_dirty_count_severe(void)
 void
 brelse(struct buf *bp)
 {
+        int tail;
+
         CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
             bp, bp->b_vp, bp->b_flags);
         KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
             ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+        if (BUF_ISLOCKED(bp) == LK_SHARED) {
+                BUF_UNLOCK(bp);
+                return;
+        }
+
         if (bp->b_flags & B_MANAGED) {
                 bqrelse(bp);
                 return;
         }
@@ -1334,13 +1383,12 @@ brelse(struct buf *bp)
         }
 
         /* enqueue */
-        mtx_lock(&bqlock);
         /* Handle delayed bremfree() processing. */
         if (bp->b_flags & B_REMFREE)
-                bremfreel(bp);
+                bremfreef(bp);
         if (bp->b_qindex != QUEUE_NONE)
                 panic("brelse: free buffer onto another queue???");
-
+        tail = 0;
         /* buffers with no memory */
         if (bp->b_bufsize == 0) {
                 bp->b_flags |= B_INVAL;
@@ -1352,7 +1400,6 @@ brelse(struct buf *bp)
                 } else {
                         bp->b_qindex = QUEUE_EMPTY;
                 }
-                TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
         /* buffers with junk contents */
         } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
             (bp->b_ioflags & BIO_ERROR)) {
@@ -1361,7 +1408,6 @@ brelse(struct buf *bp)
                 if (bp->b_vflags & BV_BKGRDINPROG)
                         panic("losing buffer 2");
                 bp->b_qindex = QUEUE_CLEAN;
-                TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
         /* remaining buffers */
         } else {
                 if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) ==
@@ -1371,12 +1417,15 @@ brelse(struct buf *bp)
                         bp->b_qindex = QUEUE_DIRTY;
                 else
                         bp->b_qindex = QUEUE_CLEAN;
-                if (bp->b_flags & B_AGE)
-                        TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
-                else
-                        TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+                if ((bp->b_flags & B_AGE) == 0)
+                        tail = 1;
         }
-        mtx_unlock(&bqlock);
+        mtx_lock(&bqlock[bp->b_qindex]);
+        if (tail)
+                TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+        else
+                TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+        mtx_unlock(&bqlock[bp->b_qindex]);
 
         /*
          * If B_INVAL and B_DELWRI is set, clear B_DELWRI.  We have already
@@ -1432,27 +1481,23 @@ bqrelse(struct buf *bp)
         KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
             ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
+        if (BUF_ISLOCKED(bp) == LK_SHARED) {
+                BUF_UNLOCK(bp);
+                return;
+        }
         if (BUF_LOCKRECURSED(bp)) {
                 /* do not release to free list */
                 BUF_UNLOCK(bp);
                 return;
         }
-
+        /* Handle delayed bremfree() processing. */
+        if (bp->b_flags & B_REMFREE)
+                bremfreef(bp);
         if (bp->b_flags & B_MANAGED) {
-                if (bp->b_flags & B_REMFREE) {
-                        mtx_lock(&bqlock);
-                        bremfreel(bp);
-                        mtx_unlock(&bqlock);
-                }
                 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
                 BUF_UNLOCK(bp);
                 return;
         }
-
-        mtx_lock(&bqlock);
-        /* Handle delayed bremfree() processing. */
-        if (bp->b_flags & B_REMFREE)
-                bremfreel(bp);
         if (bp->b_qindex != QUEUE_NONE)
                 panic("bqrelse: free buffer onto another queue???");
         /* buffers with stale but valid contents */
@@ -1461,7 +1506,6 @@ bqrelse(struct buf *bp)
                         bp->b_qindex = QUEUE_DIRTY_GIANT;
                 else
                         bp->b_qindex = QUEUE_DIRTY;
-                TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
         } else {
                 /*
                  * The locking of the BO_LOCK for checking of the
@@ -1472,20 +1516,19 @@ bqrelse(struct buf *bp)
                  */
                 if (!vm_page_count_severe() || (bp->b_vflags & BV_BKGRDINPROG)) {
                         bp->b_qindex = QUEUE_CLEAN;
-                        TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
-                            b_freelist);
                 } else {
                         /*
                          * We are too low on memory, we have to try to free
                          * the buffer (most importantly: the wired pages
                          * making up its backing store) *now*.
                          */
-                        mtx_unlock(&bqlock);
                         brelse(bp);
                         return;
                 }
         }
-        mtx_unlock(&bqlock);
+        mtx_lock(&bqlock[bp->b_qindex]);
+        TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+        mtx_unlock(&bqlock[bp->b_qindex]);
 
         if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
                 bufcountwakeup();
@@ -1692,7 +1735,7 @@ getnewbuf(int slpflag, int slptimeo, int
         struct buf *bp;
         struct buf *nbp;
         int defrag = 0;
-        int nqindex;
+        int qindex;
         int waiters = 0;
         static int flushingbufs;
 
@@ -1720,56 +1763,52 @@ restart:
          * However, there are a number of cases (defragging, reusing, ...)
          * where we cannot backup.
          */
-        mtx_lock(&bqlock);
-        nqindex = QUEUE_EMPTYKVA;
-        nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
-
-        if (nbp == NULL) {
-                /*
-                 * If no EMPTYKVA buffers and we are either
-                 * defragging or reusing, locate a CLEAN buffer
-                 * to free or reuse.  If bufspace useage is low
-                 * skip this step so we can allocate a new buffer.
-                 */
-                if (defrag || bufspace >= lobufspace) {
-                        nqindex = QUEUE_CLEAN;
-                        nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
-                }
-
-                /*
-                 * If we could not find or were not allowed to reuse a
-                 * CLEAN buffer, check to see if it is ok to use an EMPTY
-                 * buffer.  We can only use an EMPTY buffer if allocating
-                 * its KVA would not otherwise run us out of buffer space.
-                 */
-                if (nbp == NULL && defrag == 0 &&
-                    bufspace + maxsize < hibufspace) {
-                        nqindex = QUEUE_EMPTY;
-                        nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
-                }
+        bp = NULL;
+        qindex = QUEUE_EMPTYKVA;
+        mtx_lock(&bqlock[qindex]);
+        nbp = TAILQ_FIRST(&bufqueues[qindex]);
+        /*
+         * If no EMPTYKVA buffers and we are either
+         * defragging or reusing, locate a CLEAN buffer
+         * to free or reuse.  If bufspace useage is low
+         * skip this step so we can allocate a new buffer.
+         */
+        if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
+                mtx_unlock(&bqlock[qindex]);
+                qindex = QUEUE_CLEAN;
+                mtx_lock(&bqlock[qindex]);
+                nbp = TAILQ_FIRST(&bufqueues[qindex]);
+        }
+        /*
+         * If we could not find or were not allowed to reuse a
+         * CLEAN buffer, check to see if it is ok to use an EMPTY
+         * buffer.  We can only use an EMPTY buffer if allocating
+         * its KVA would not otherwise run us out of buffer space.
+         */
+        if (nbp == NULL && defrag == 0 && bufspace + maxsize < hibufspace) {
+                mtx_unlock(&bqlock[qindex]);
+                qindex = QUEUE_EMPTY;
+                mtx_lock(&bqlock[qindex]);
+                nbp = TAILQ_FIRST(&bufqueues[qindex]);
         }
-
         /*
          * Run scan, possibly freeing data and/or kva mappings on the fly
          * depending.
          */
-
-        while ((bp = nbp) != NULL) {
-                int qindex = nqindex;
-
-                /*
-                 * Calculate next bp ( we can only use it if we do not block
-                 * or do other fancy things ).
-                 */
-                if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
+        for (;;) {
+                if (nbp == NULL) {
                         switch(qindex) {
                         case QUEUE_EMPTY:
-                                nqindex = QUEUE_EMPTYKVA;
+                                mtx_unlock(&bqlock[qindex]);
+                                qindex = QUEUE_EMPTYKVA;
+                                mtx_lock(&bqlock[qindex]);
                                 if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
                                         break;
                                 /* FALLTHROUGH */
                         case QUEUE_EMPTYKVA:
-                                nqindex = QUEUE_CLEAN;
+                                mtx_unlock(&bqlock[qindex]);
+                                qindex = QUEUE_CLEAN;
+                                mtx_lock(&bqlock[qindex]);
                                 if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
                                         break;
                                 /* FALLTHROUGH */
@@ -1780,6 +1819,14 @@ restart:
                                 break;
                         }
                 }
+                if (nbp == NULL)
+                        break;
+                bp = nbp;
+                /*
+                 * Calculate next bp ( we can only use it if we do not block
+                 * or do other fancy things ).
+                 */
+                nbp = TAILQ_NEXT(bp, b_freelist);
                 /*
                  * If we are defragging then we need a buffer with
                  * b_kvasize != 0.  XXX this situation should no longer
@@ -1822,9 +1869,8 @@ restart:
                  */
                 KASSERT((bp->b_flags & B_DELWRI) == 0,
                     ("delwri buffer %p found in queue %d", bp, qindex));
-
                 bremfreel(bp);
-                mtx_unlock(&bqlock);
+                mtx_unlock(&bqlock[qindex]);
 
                 if (qindex == QUEUE_CLEAN) {
                         if (bp->b_flags & B_VMIO) {
@@ -1937,6 +1983,7 @@ restart:
                 int flags;
                 char *waitmsg;
 
+                mtx_unlock(&bqlock[qindex]);
                 if (defrag) {
                         flags = VFS_BIO_NEED_BUFSPACE;
                         waitmsg = "nbufkv";
@@ -1950,7 +1997,6 @@ restart:
                 mtx_lock(&nblock);
                 needsbuffer |= flags;
                 mtx_unlock(&nblock);
-                mtx_unlock(&bqlock);
 
                 bd_speedup();   /* heeeelp */
 
@@ -2136,7 +2182,7 @@ flushbufqueues(int queue, int flushdeps)
                 target /= 2;
         flushed = 0;
         bp = NULL;
-        mtx_lock(&bqlock);
+        mtx_lock(&bqlock[queue]);
         TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist);
         while (flushed != target) {
                 bp = TAILQ_FIRST(&bufqueues[queue]);
@@ -2161,11 +2207,11 @@ flushbufqueues(int queue, int flushdeps)
                 BO_UNLOCK(bp->b_bufobj);
                 if (bp->b_flags & B_INVAL) {
                         bremfreel(bp);
-                        mtx_unlock(&bqlock);
+                        mtx_unlock(&bqlock[queue]);
                         brelse(bp);
                         flushed++;
                         numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
-                        mtx_lock(&bqlock);
+                        mtx_lock(&bqlock[queue]);
                         continue;
                 }
 
@@ -2193,7 +2239,7 @@ flushbufqueues(int queue, int flushdeps)
                         continue;
                 }
                 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
-                        mtx_unlock(&bqlock);
+                        mtx_unlock(&bqlock[queue]);
                         CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
                             bp, bp->b_vp, bp->b_flags);
                         vfs_bio_awrite(bp);
@@ -2203,14 +2249,14 @@ flushbufqueues(int queue, int flushdeps)
                         flushed++;
                         waitrunningbufspace();
                         numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
-                        mtx_lock(&bqlock);
+                        mtx_lock(&bqlock[queue]);
                         continue;
                 }
                 vn_finished_write(mp);
                 BUF_UNLOCK(bp);
         }
         TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist);
-        mtx_unlock(&bqlock);
+        mtx_unlock(&bqlock[queue]);
         return (flushed);
 }
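
For readers skimming the patch, here is a minimal userland sketch of the locking pattern it introduces: one mutex per free-buffer queue instead of the single global bqlock, so that an insert or remove takes only the lock of the queue it actually touches. This is an illustration only, not code from the patch or the kernel; the NQUEUES, queues, qlock, qinit, qpush, and qpop names are made up for the example.

/*
 * Illustrative userland sketch of per-queue locking (assumed names,
 * not FreeBSD kernel code).  Mirrors the bqlock[] <-> bufqueues[]
 * pairing used in the patch above.
 */
#include <pthread.h>
#include <stddef.h>
#include <sys/queue.h>

#define NQUEUES 4

struct item {
        TAILQ_ENTRY(item) link;
};

static TAILQ_HEAD(qhead, item) queues[NQUEUES];
static pthread_mutex_t qlock[NQUEUES];

static void
qinit(void)
{
        int i;

        /* Each queue gets its own lock, as bufinit() now does for bqlock[]. */
        for (i = 0; i < NQUEUES; i++) {
                pthread_mutex_init(&qlock[i], NULL);
                TAILQ_INIT(&queues[i]);
        }
}

static void
qpush(int qi, struct item *it)
{
        /* Only the target queue's lock is taken; other queues stay unlocked. */
        pthread_mutex_lock(&qlock[qi]);
        TAILQ_INSERT_TAIL(&queues[qi], it, link);
        pthread_mutex_unlock(&qlock[qi]);
}

static struct item *
qpop(int qi)
{
        struct item *it;

        pthread_mutex_lock(&qlock[qi]);
        it = TAILQ_FIRST(&queues[qi]);
        if (it != NULL)
                TAILQ_REMOVE(&queues[qi], it, link);
        pthread_mutex_unlock(&qlock[qi]);
        return (it);
}

The design point is the same as in the patch: threads releasing buffers to one queue no longer contend with threads scanning or flushing a different queue, at the cost of having to drop and reacquire the correct per-queue lock when code moves from one queue to another (as getnewbuf() and brelse() do above).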