This is an unfinished patch that removes contention on the single sf_buf lock by breaking it up into one mutex per hash entry, plus a separate mutex for the free list. To further reduce contention on the free list mutex, sf_bufs are not returned to the free list after their first use; they stay on the active lists. They can still be reused, since they can be found by checking their ref_count. If we need a free sf_buf and the free list is already empty, we search for an unused sf_buf starting at our own active list; if we find one there, we simply jump to samehash, saving some atomic ops. As a side effect the scheme tunes itself automatically: the more a given list is used, the more unused sf_bufs it will accumulate over time. To finish the patch I'd suggest removing the free list altogether and just prepopulating the hash lists with an equal number of free sf_bufs (a rough sketch of that idea follows the diff).

Index: i386/i386/vm_machdep.c
===================================================================
--- i386/i386/vm_machdep.c	(revision 204039)
+++ i386/i386/vm_machdep.c	(working copy)
@@ -131,7 +131,8 @@
 /*
  * A lock used to synchronize access to the hash table and free list
  */
-static struct mtx sf_buf_lock;
+static struct mtx *sf_buf_active_lock;
+static struct mtx sf_buf_freelist_lock;
 
 extern int _ucodesel, _udatasel;
 
@@ -743,8 +744,14 @@
 		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
 		TAILQ_INSERT_TAIL(&sf_buf_freelist, &sf_bufs[i], free_entry);
 	}
+	sf_buf_active_lock = malloc(sizeof(sf_buf_active_lock[0]) * (sf_buf_hashmask + 1),
+	    M_TEMP, M_WAITOK);
+	for (i = 0; i <= sf_buf_hashmask; i++) {
+		mtx_init(&sf_buf_active_lock[i], "sf_buf_active", NULL,
+		    MTX_DEF);
+	}
 	sf_buf_alloc_want = 0;
-	mtx_init(&sf_buf_lock, "sf_buf", NULL, MTX_DEF);
+	mtx_init(&sf_buf_freelist_lock, "sf_buf_freelist", NULL, MTX_DEF);
 }
 
 /*
@@ -758,10 +765,12 @@
 	struct sf_head *hash_list;
 	struct sf_buf *sf;
 	boolean_t ret;
+	int elem;
 
-	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
+	elem = SF_BUF_HASH(m);
+	hash_list = &sf_buf_active[elem];
 	ret = FALSE;
-	mtx_lock(&sf_buf_lock);
+	mtx_lock(&sf_buf_active_lock[elem]);
 	LIST_FOREACH(sf, hash_list, list_entry) {
 		if (sf->m == m) {
 			/*
@@ -776,7 +785,7 @@
 			break;
 		}
 	}
-	mtx_unlock(&sf_buf_lock);
+	mtx_unlock(&sf_buf_active_lock[elem]);
 	return (ret);
 }
 
@@ -792,20 +801,16 @@
 #ifdef SMP
 	cpumask_t cpumask, other_cpus;
 #endif
-	int error;
+	int elem, error, i, j;
 
 	KASSERT(curthread->td_pinned > 0 || (flags & SFB_CPUPRIVATE) == 0,
 	    ("sf_buf_alloc(SFB_CPUPRIVATE): curthread not pinned"));
-	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
-	mtx_lock(&sf_buf_lock);
+	elem = SF_BUF_HASH(m);
+	hash_list = &sf_buf_active[elem];
+	mtx_lock(&sf_buf_active_lock[elem]);
 	LIST_FOREACH(sf, hash_list, list_entry) {
 		if (sf->m == m) {
 			sf->ref_count++;
-			if (sf->ref_count == 1) {
-				TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
-				nsfbufsused++;
-				nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
-			}
 #ifdef SMP
 			goto shootdown;
 #else
@@ -813,29 +818,64 @@
 #endif
 		}
 	}
+	mtx_unlock(&sf_buf_active_lock[elem]);
+	mtx_lock(&sf_buf_freelist_lock);
+retry:
+	if (TAILQ_EMPTY(&sf_buf_freelist)) {
+		mtx_unlock(&sf_buf_freelist_lock);
+		for (i = 0, j = elem; i <= sf_buf_hashmask; i++, j++) {
+			if (j > sf_buf_hashmask)
+				j = 0;
+			hash_list = &sf_buf_active[j];
+			mtx_lock(&sf_buf_active_lock[j]);
+			LIST_FOREACH(sf, hash_list, list_entry) {
+				if (sf->ref_count == 0) {
+					if (j == elem)
+						goto samehash;
+					LIST_REMOVE(sf, list_entry);
+					break;
+				}
+			}
+			mtx_unlock(&sf_buf_active_lock[j]);
+			if (sf != NULL)
+				break;
+		}
+		mtx_lock(&sf_buf_freelist_lock);
+		if (sf != NULL) {
+			TAILQ_INSERT_HEAD(&sf_buf_freelist, sf, free_entry);
+			nsfbufsused--;
+		}
+	}
 	while ((sf = TAILQ_FIRST(&sf_buf_freelist)) == NULL) {
-		if (flags & SFB_NOWAIT)
-			goto done;
+		if (flags & SFB_NOWAIT) {
+			mtx_unlock(&sf_buf_freelist_lock);
+			return (NULL);
+		}
 		sf_buf_alloc_want++;
 		mbstat.sf_allocwait++;
-		error = msleep(&sf_buf_freelist, &sf_buf_lock,
-		    (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", 0);
+		error = msleep(&sf_buf_freelist, &sf_buf_freelist_lock,
+		    (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", hz / 10);
 		sf_buf_alloc_want--;
 
 		/*
 		 * If we got a signal, don't risk going back to sleep.
 		 */
-		if (error)
-			goto done;
+		if (error) {
+			mtx_unlock(&sf_buf_freelist_lock);
+			return (NULL);
+		}
+		goto retry;
 	}
 	TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
-	if (sf->m != NULL)
-		LIST_REMOVE(sf, list_entry);
+	nsfbufsused++;
+	nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
+	mtx_unlock(&sf_buf_freelist_lock);
+	KASSERT(sf->m == NULL, ("sf->m = %p", sf->m));
+	mtx_lock(&sf_buf_active_lock[elem]);
 	LIST_INSERT_HEAD(hash_list, sf, list_entry);
+samehash:
 	sf->ref_count = 1;
 	sf->m = m;
-	nsfbufsused++;
-	nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
 
 	/*
 	 * Update the sf_buf's virtual-to-physical mapping, flushing the
@@ -881,9 +921,9 @@
 #else
 	if ((opte & (PG_V | PG_A)) == (PG_V | PG_A))
 		pmap_invalidate_page(kernel_pmap, sf->kva);
+done:
 #endif
-done:
-	mtx_unlock(&sf_buf_lock);
+	mtx_unlock(&sf_buf_active_lock[elem]);
 	return (sf);
 }
 
@@ -896,24 +936,26 @@
 void
 sf_buf_free(struct sf_buf *sf)
 {
+	int elem;
 
-	mtx_lock(&sf_buf_lock);
+	elem = SF_BUF_HASH(sf->m);
+	mtx_lock(&sf_buf_active_lock[elem]);
 	sf->ref_count--;
-	if (sf->ref_count == 0) {
-		TAILQ_INSERT_TAIL(&sf_buf_freelist, sf, free_entry);
-		nsfbufsused--;
+	if (sf->ref_count > 0) {
+		mtx_unlock(&sf_buf_active_lock[elem]);
+		return;
+	}
 #ifdef XEN
 	/*
 	 * Xen doesn't like having dangling R/W mappings
 	 */
-		pmap_qremove(sf->kva, 1);
-		sf->m = NULL;
-		LIST_REMOVE(sf, list_entry);
+	pmap_qremove(sf->kva, 1);
+	sf->m = NULL;
+	LIST_REMOVE(sf, list_entry);
 #endif
-		if (sf_buf_alloc_want > 0)
-			wakeup_one(&sf_buf_freelist);
-	}
-	mtx_unlock(&sf_buf_lock);
+	mtx_unlock(&sf_buf_active_lock[elem]);
+	if (sf_buf_alloc_want > 0)
+		wakeup_one(&sf_buf_freelist);
 }
 
 /*
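
For illustration, here is roughly what the tail of sf_buf_init() could look like once the free list is gone, i.e. the suggestion above. This is only a sketch, not part of the patch: the round-robin bucket choice (i & sf_buf_hashmask) and reusing list_entry for the initial insert are my assumptions, and sf_buf_alloc()/sf_buf_free() would still need their remaining sf_buf_freelist references (and the msleep/wakeup_one channel) reworked on top of it.

	/*
	 * Sketch only: prepopulate every hash bucket with roughly
	 * nsfbufs / (sf_buf_hashmask + 1) unused sf_bufs (ref_count == 0,
	 * m == NULL) instead of queueing them on sf_buf_freelist, so
	 * sf_buf_alloc() always finds spare bufs via the ref_count == 0 scan.
	 */
	for (i = 0; i < nsfbufs; i++) {
		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
		LIST_INSERT_HEAD(&sf_buf_active[i & sf_buf_hashmask],
		    &sf_bufs[i], list_entry);
	}
	sf_buf_alloc_want = 0;
	sf_buf_active_lock = malloc(sizeof(sf_buf_active_lock[0]) *
	    (sf_buf_hashmask + 1), M_TEMP, M_WAITOK);
	for (i = 0; i <= sf_buf_hashmask; i++)
		mtx_init(&sf_buf_active_lock[i], "sf_buf_active", NULL, MTX_DEF);

With something like that, sf_buf_freelist, the free_entry field and sf_buf_freelist_lock could be removed entirely.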