This is an unfinished patch that removes contention on the single sf_buf lock by breaking it up into one mutex per hash entry, plus a separate mutex for the free list. To further reduce contention on the free list mutex, sf_bufs are not returned to the free list after their first use; they stay on the active lists. They can still be reused, since they can be found by checking their ref_count. If we need a free sf_buf and the free list is already empty, we search for an unused sf_buf starting at our own active list; if we find one there, we simply jump to samehash, saving some atomic ops. As a side effect the scheme tunes itself automatically: the more a given list is used, the more unused sf_bufs it will accumulate over time. To finish the patch I'd suggest removing the free list altogether and just prepopulating the hash lists with an equal number of free sf_bufs (a rough sketch of that idea follows the diff).

Index: i386/i386/vm_machdep.c
===================================================================
--- i386/i386/vm_machdep.c	(revision 204039)
+++ i386/i386/vm_machdep.c	(working copy)
@@ -131,7 +131,8 @@
 /*
  * A lock used to synchronize access to the hash table and free list
  */
-static struct mtx sf_buf_lock;
+static struct mtx *sf_buf_active_lock;
+static struct mtx sf_buf_freelist_lock;
 
 extern int _ucodesel, _udatasel;
 
@@ -743,8 +744,14 @@
 		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
 		TAILQ_INSERT_TAIL(&sf_buf_freelist, &sf_bufs[i], free_entry);
 	}
+	sf_buf_active_lock = malloc(sizeof(sf_buf_active_lock[0]) * (sf_buf_hashmask + 1),
+	    M_TEMP, M_WAITOK);
+	for (i = 0; i <= sf_buf_hashmask; i++) {
+		mtx_init(&sf_buf_active_lock[i], "sf_buf_active", NULL,
+		    MTX_DEF);
+	}
 	sf_buf_alloc_want = 0;
-	mtx_init(&sf_buf_lock, "sf_buf", NULL, MTX_DEF);
+	mtx_init(&sf_buf_freelist_lock, "sf_buf_freelist", NULL, MTX_DEF);
 }
 
 /*
@@ -758,10 +765,12 @@
 	struct sf_head *hash_list;
 	struct sf_buf *sf;
 	boolean_t ret;
+	int elem;
 
-	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
+	elem = SF_BUF_HASH(m);
+	hash_list = &sf_buf_active[elem];
 	ret = FALSE;
-	mtx_lock(&sf_buf_lock);
+	mtx_lock(&sf_buf_active_lock[elem]);
 	LIST_FOREACH(sf, hash_list, list_entry) {
 		if (sf->m == m) {
 			/*
@@ -776,7 +785,7 @@
 			break;
 		}
 	}
-	mtx_unlock(&sf_buf_lock);
+	mtx_unlock(&sf_buf_active_lock[elem]);
 	return (ret);
 }
 
@@ -792,20 +801,16 @@
 #ifdef SMP
 	cpumask_t cpumask, other_cpus;
 #endif
-	int error;
+	int elem, error, i, j;
 
 	KASSERT(curthread->td_pinned > 0 || (flags & SFB_CPUPRIVATE) == 0,
 	    ("sf_buf_alloc(SFB_CPUPRIVATE): curthread not pinned"));
-	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
-	mtx_lock(&sf_buf_lock);
+	elem = SF_BUF_HASH(m);
+	hash_list = &sf_buf_active[elem];
+	mtx_lock(&sf_buf_active_lock[elem]);
 	LIST_FOREACH(sf, hash_list, list_entry) {
 		if (sf->m == m) {
 			sf->ref_count++;
-			if (sf->ref_count == 1) {
-				TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
-				nsfbufsused++;
-				nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
-			}
 #ifdef SMP
 			goto shootdown;
 #else
@@ -813,29 +818,64 @@
 #endif
 		}
 	}
+	mtx_unlock(&sf_buf_active_lock[elem]);
+	mtx_lock(&sf_buf_freelist_lock);
+retry:
+	if (TAILQ_EMPTY(&sf_buf_freelist)) {
+		mtx_unlock(&sf_buf_freelist_lock);
+		for (i = 0, j = elem; i <= sf_buf_hashmask; i++, j++) {
+			if (j > sf_buf_hashmask)
+				j = 0;
+			hash_list = &sf_buf_active[j];
+			mtx_lock(&sf_buf_active_lock[j]);
+			LIST_FOREACH(sf, hash_list, list_entry) {
+				if (sf->ref_count == 0) {
+					if (j == elem)
+						goto samehash;
+					LIST_REMOVE(sf, list_entry);
+					break;
+				}
+			}
+			mtx_unlock(&sf_buf_active_lock[j]);
+			if (sf != NULL)
+				break;
+		}
+		mtx_lock(&sf_buf_freelist_lock);
+		if (sf != NULL) {
+			TAILQ_INSERT_HEAD(&sf_buf_freelist, sf, free_entry);
+			nsfbufsused--;
+		}
+	}
 	while ((sf = TAILQ_FIRST(&sf_buf_freelist)) == NULL) {
-		if (flags & SFB_NOWAIT)
-			goto done;
+		if (flags & SFB_NOWAIT) {
+			mtx_unlock(&sf_buf_freelist_lock);
+			return (NULL);
+		}
 		sf_buf_alloc_want++;
 		mbstat.sf_allocwait++;
-		error = msleep(&sf_buf_freelist, &sf_buf_lock,
-		    (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", 0);
+		error = msleep(&sf_buf_freelist, &sf_buf_freelist_lock,
+		    (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", hz / 10);
 		sf_buf_alloc_want--;
 
 		/*
 		 * If we got a signal, don't risk going back to sleep.
 		 */
-		if (error)
-			goto done;
+		if (error) {
+			mtx_unlock(&sf_buf_freelist_lock);
+			return (NULL);
+		}
+		goto retry;
 	}
 	TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
-	if (sf->m != NULL)
-		LIST_REMOVE(sf, list_entry);
+	nsfbufsused++;
+	nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
+	mtx_unlock(&sf_buf_freelist_lock);
+	KASSERT(sf->m == NULL, ("sf->m = %p", sf->m));
+	mtx_lock(&sf_buf_active_lock[elem]);
 	LIST_INSERT_HEAD(hash_list, sf, list_entry);
+samehash:
 	sf->ref_count = 1;
 	sf->m = m;
-	nsfbufsused++;
-	nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
 
 	/*
 	 * Update the sf_buf's virtual-to-physical mapping, flushing the
@@ -881,9 +921,9 @@
 #else
 	if ((opte & (PG_V | PG_A)) == (PG_V | PG_A))
 		pmap_invalidate_page(kernel_pmap, sf->kva);
+done:
 #endif
-done:
-	mtx_unlock(&sf_buf_lock);
+	mtx_unlock(&sf_buf_active_lock[elem]);
 	return (sf);
 }
 
@@ -896,24 +936,26 @@
 void
 sf_buf_free(struct sf_buf *sf)
 {
+	int elem;
 
-	mtx_lock(&sf_buf_lock);
+	elem = SF_BUF_HASH(sf->m);
+	mtx_lock(&sf_buf_active_lock[elem]);
 	sf->ref_count--;
-	if (sf->ref_count == 0) {
-		TAILQ_INSERT_TAIL(&sf_buf_freelist, sf, free_entry);
-		nsfbufsused--;
+	if (sf->ref_count > 0) {
+		mtx_unlock(&sf_buf_active_lock[elem]);
+		return;
+	}
 #ifdef XEN
 	/*
 	 * Xen doesn't like having dangling R/W mappings
 	 */
-		pmap_qremove(sf->kva, 1);
-		sf->m = NULL;
-		LIST_REMOVE(sf, list_entry);
+	pmap_qremove(sf->kva, 1);
+	sf->m = NULL;
+	LIST_REMOVE(sf, list_entry);
 #endif
-		if (sf_buf_alloc_want > 0)
-			wakeup_one(&sf_buf_freelist);
-	}
-	mtx_unlock(&sf_buf_lock);
+	mtx_unlock(&sf_buf_active_lock[elem]);
+	if (sf_buf_alloc_want > 0)
+		wakeup_one(&sf_buf_freelist);
 }
 
 /*
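
For illustration, here is roughly what the tail of sf_buf_init() could look like once the free list is gone, i.e. the suggestion above. This is only a sketch, not part of the patch: the round-robin bucket choice (i & sf_buf_hashmask) and reusing list_entry for the initial insert are my assumptions, and sf_buf_alloc()/sf_buf_free() would still need their remaining sf_buf_freelist references (and the msleep/wakeup_one channel) reworked on top of it.

	/*
	 * Sketch only: prepopulate every hash bucket with roughly
	 * nsfbufs / (sf_buf_hashmask + 1) unused sf_bufs (ref_count == 0,
	 * m == NULL) instead of queueing them on sf_buf_freelist, so
	 * sf_buf_alloc() always finds spare bufs via the ref_count == 0 scan.
	 */
	for (i = 0; i < nsfbufs; i++) {
		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
		LIST_INSERT_HEAD(&sf_buf_active[i & sf_buf_hashmask],
		    &sf_bufs[i], list_entry);
	}
	sf_buf_alloc_want = 0;
	sf_buf_active_lock = malloc(sizeof(sf_buf_active_lock[0]) *
	    (sf_buf_hashmask + 1), M_TEMP, M_WAITOK);
	for (i = 0; i <= sf_buf_hashmask; i++)
		mtx_init(&sf_buf_active_lock[i], "sf_buf_active", NULL, MTX_DEF);

With something like that, sf_buf_freelist, the free_entry field and sf_buf_freelist_lock could be removed entirely.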