Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c	(revision 295659)
+++ sys/kern/vfs_subr.c	(working copy)
@@ -145,51 +145,24 @@ int vttoif_tab[10] = {
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 
 /*
- * "Free" vnode target.  Free vnodes are rarely completely free, but are
- * just ones that are cheap to recycle.  Usually they are for files which
- * have been stat'd but not read; these usually have inode and namecache
- * data attached to them.  This target is the preferred minimum size of a
- * sub-cache consisting mostly of such files.  The system balances the size
- * of this sub-cache with its complement to try to prevent either from
- * thrashing while the other is relatively inactive.  The targets express
- * a preference for the best balance.
- *
- * "Above" this target there are 2 further targets (watermarks) related
- * to recyling of free vnodes.  In the best-operating case, the cache is
- * exactly full, the free list has size between vlowat and vhiwat above the
- * free target, and recycling from it and normal use maintains this state.
- * Sometimes the free list is below vlowat or even empty, but this state
- * is even better for immediate use provided the cache is not full.
- * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
- * ones) to reach one of these states.  The watermarks are currently hard-
- * coded as 4% and 9% of the available space higher.  These and the default
- * of 25% for wantfreevnodes are too large if the memory size is large.
- * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
- * whenever vnlru_proc() becomes active.
+ * Free vnode target.  Free vnodes may simply be files which have been stat'd
+ * but not read.  This is somewhat common, and a small cache of such files
+ * should be kept to avoid recreation costs.
  */
 static u_long wantfreevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
-    &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
+SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
 
+/* Number of vnodes in the free list. */
 static u_long freevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
-    &freevnodes, 0, "Number of \"free\" vnodes");
+SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
+    "Number of vnodes in the free list");
 
-/*
- * The vfs.vlru_allow_cache_src sysctl variable is no longer used but
- * the sysctl remains to provide ABI compatibility.  The new code frees
- * namecache sources as the last chance to satisfy the highest watermark,
- * instead of selecting the source vnodes randomly.  This provides good
- * enough behaviour to keep vn_fullpath() working in most situations.
- * The filesystem layout with deep trees, where the depricated knob was
- * required, is thus handled automatically.
- */
 static int vlru_allow_cache_src;
 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
-    &vlru_allow_cache_src, 0, "Placeholder for API compatibility (unused)");
+    &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
 
 static u_long recycles_count;
 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
-    "Number of vnodes recycled to meet vnode cache targets");
+    "Number of vnodes recycled to avoid exceeding kern.maxvnodes");
 
 /*
  * Various variables used for debugging the new implementation of
@@ -299,13 +272,14 @@ static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
-/* Target for maximum number of vnodes. */
+/*
+ * Number of vnodes we want to exist at any one time.  This is mostly used
+ * to size hash tables in vnode-related code.  It is normally not used in
+ * getnewvnode(), as wantfreevnodes is normally nonzero.
+ *
+ * XXX desiredvnodes is historical cruft and should not exist.
+ */
 int desiredvnodes;
-static int gapvnodes;		/* gap between wanted and desired */
-static int vhiwat;		/* enough extras after expansion */
-static int vlowat;		/* minimal extras before expansion */
-static int vstir;		/* nonzero to stir non-free vnodes */
-static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
 
 static int
 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
@@ -316,8 +290,6 @@ sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
 	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
 		return (error);
 	if (old_desiredvnodes != desiredvnodes) {
-		wantfreevnodes = desiredvnodes / 4;
-		/* XXX locking seems to be incomplete. */
 		vfs_hash_changesize(desiredvnodes);
 		cache_changesize(desiredvnodes);
 	}
@@ -326,9 +298,9 @@ sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
 
 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
-    sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
+    sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
-    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
+    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
@@ -359,10 +331,10 @@ PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc,
  *
  * Reevaluate the following cap on the number of vnodes after the physical
  * memory size exceeds 512GB.  In the limit, as the physical memory size
- * grows, the ratio of the memory size in KB to to vnodes approaches 64:1.
+ * grows, the ratio of physical pages to vnodes approaches sixteen to one.
  */
 #ifndef MAXVNODES_MAX
-#define	MAXVNODES_MAX	(512 * 1024 * 1024 / 64)	/* 8M */
+#define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
 #endif
 
 /*
@@ -433,16 +405,15 @@ vntblinit(void *dummy __unused)
 	/*
 	 * Desiredvnodes is a function of the physical memory size and the
	 * kernel's heap size.  Generally speaking, it scales with the
-	 * physical memory size.  The ratio of desiredvnodes to the physical
-	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
-	 * Thereafter, the
-	 * marginal ratio of desiredvnodes to the physical memory size is
-	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
+	 * physical memory size.  The ratio of desiredvnodes to physical pages
+	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
+	 * marginal ratio of desiredvnodes to physical pages is one to
+	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
 	 * size.  The memory required by desiredvnodes vnodes and vm objects
-	 * must not exceed 1/7th of the kernel's heap size.
+	 * may not exceed one seventh of the kernel's heap size.
 	 */
-	physvnodes = maxproc + pgtok(cnt.v_page_count) / 64 +
-	    3 * min(98304 * 16, pgtok(cnt.v_page_count)) / 64;
+	physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
+	    cnt.v_page_count) / 16;
 	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
 	    sizeof(struct vnode)));
 	desiredvnodes = min(physvnodes, virtvnodes);
@@ -831,41 +802,35 @@ vattr_null(struct vattr *vap)
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  */
 static int
-vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
+vlrureclaim(struct mount *mp)
 {
 	struct vnode *vp;
-	int count, done, target;
+	int done;
+	int trigger;
+	int usevnodes;
+	int count;
 
+	/*
+	 * Calculate the trigger point, don't allow user
+	 * screwups to blow us up.  This prevents us from
+	 * recycling vnodes with lots of resident pages.  We
+	 * aren't trying to free memory, we are trying to
+	 * free vnodes.
+	 */
+	usevnodes = desiredvnodes;
+	if (usevnodes <= 0)
+		usevnodes = 1;
+	trigger = cnt.v_page_count * 2 / usevnodes;
 	done = 0;
 	vn_start_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
-	count = mp->mnt_nvnodelistsize;
-	target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
-	target = target / 10 + 1;
-	while (count != 0 && done < target) {
+	count = mp->mnt_nvnodelistsize / 10 + 1;
+	while (count != 0) {
 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 		while (vp != NULL && vp->v_type == VMARKER)
 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
 		if (vp == NULL)
 			break;
-		/*
-		 * XXX LRU is completely broken for non-free vnodes.  First
-		 * by calling here in mountpoint order, then by moving
-		 * unselected vnodes to the end here, and most grossly by
-		 * removing the vlruvp() function that was supposed to
-		 * maintain the order.  (This function was born broken
-		 * since syncer problems prevented it doing anything.)  The
-		 * order is closer to LRC (C = Created).
-		 *
-		 * LRU reclaiming of vnodes seems to have last worked in
-		 * FreeBSD-3 where LRU wasn't mentioned under any spelling.
-		 * Then there was no hold count, and inactive vnodes were
-		 * simply put on the free list in LRU order.  The separate
-		 * lists also break LRU.  We prefer to reclaim from the
-		 * free list for technical reasons.  This tends to thrash
-		 * the free list to keep very unrecently used held vnodes.
-		 * The problem is mitigated by keeping the free list large.
-		 */
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		--count;
@@ -874,12 +839,10 @@ static int
 		/*
 		 * If it's been deconstructed already, it's still
 		 * referenced, or it exceeds the trigger, skip it.
-		 * Also skip free vnodes.  We are trying to make space
-		 * to expand the free list, not reduce it.
 		 */
 		if (vp->v_usecount ||
-		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
-		    ((vp->v_iflag & VI_FREE) != 0) ||
+		    (!vlru_allow_cache_src &&
+		    !LIST_EMPTY(&(vp)->v_cache_src)) ||
 		    (vp->v_iflag & VI_DOOMED) != 0 ||
 		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VI_UNLOCK(vp);
@@ -905,8 +868,8 @@ static int
 		 * vnode lock before our VOP_LOCK() call fails.
 		 */
 		if (vp->v_usecount ||
-		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
-		    (vp->v_iflag & VI_FREE) != 0 ||
+		    (!vlru_allow_cache_src &&
+		    !LIST_EMPTY(&(vp)->v_cache_src)) ||
 		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VOP_UNLOCK(vp, LK_INTERLOCK);
@@ -939,7 +902,7 @@ relock_mnt:
 }
 
 /*
- * Attempt to reduce the free list by the requested amount.
+ * Attempt to keep the free list at wantfreevnodes length.
  */
 static void
 vnlru_free(int count)
@@ -996,24 +959,6 @@ vnlru_free(int count)
 		mtx_lock(&vnode_free_list_mtx);
 	}
 }
-
-/* XXX some names and initialization are bad for limits and watermarks. */
-static int
-vspace(void)
-{
-	int space;
-
-	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
-	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
-	vlowat = vhiwat / 2;
-	if (numvnodes > desiredvnodes)
-		return (0);
-	space = desiredvnodes - numvnodes;
-	if (freevnodes > wantfreevnodes)
-		space += freevnodes - wantfreevnodes;
-	return (space);
-}
-
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrurecycle() from the bowels of filesystem code has some
@@ -1026,36 +971,18 @@ static void
 vnlru_proc(void)
 {
 	struct mount *mp, *nmp;
-	unsigned long ofreevnodes, onumvnodes;
-	int done, force, reclaim_nc_src, trigger, usevnodes;
+	int done;
+	struct proc *p = vnlruproc;
 
-	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
+	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
 	    SHUTDOWN_PRI_FIRST);
 
-	force = 0;
 	for (;;) {
-		kproc_suspend_check(vnlruproc);
+		kproc_suspend_check(p);
 		mtx_lock(&vnode_free_list_mtx);
-		/*
-		 * If numvnodes is too large (due to desiredvnodes being
-		 * adjusted using its sysctl, or emergency growth), first
-		 * try to reduce it by discarding from the free list.
-		 */
-		if (numvnodes > desiredvnodes && freevnodes > 0)
-			vnlru_free(ulmin(numvnodes - desiredvnodes,
-			    freevnodes));
-		/*
-		 * Sleep if the vnode cache is in a good state.  This is
-		 * when it is not over-full and has space for about a 4%
-		 * or 9% expansion (by growing its size or inexcessively
-		 * reducing its free list).  Otherwise, try to reclaim
-		 * space for a 10% expansion.
-		 */
-		if (vstir && force == 0) {
-			force = 1;
-			vstir = 0;
-		}
-		if (vspace() >= vlowat && force == 0) {
+		if (freevnodes > wantfreevnodes)
+			vnlru_free(freevnodes - wantfreevnodes);
+		if (numvnodes <= desiredvnodes * 9 / 10) {
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			msleep(vnlruproc, &vnode_free_list_mtx,
@@ -1064,33 +991,6 @@ vnlru_proc(void)
 		}
 		mtx_unlock(&vnode_free_list_mtx);
 		done = 0;
-		ofreevnodes = freevnodes;
-		onumvnodes = numvnodes;
-		/*
-		 * Calculate parameters for recycling.  These are the same
-		 * throughout the loop to give some semblance of fairness.
-		 * The trigger point is to avoid recycling vnodes with lots
-		 * of resident pages.  We aren't trying to free memory; we
-		 * are trying to recycle or at least free vnodes.
-		 */
-		if (numvnodes <= desiredvnodes)
-			usevnodes = numvnodes - freevnodes;
-		else
-			usevnodes = numvnodes;
-		if (usevnodes <= 0)
-			usevnodes = 1;
-		/*
-		 * The trigger value is is chosen to give a conservatively
-		 * large value to ensure that it alone doesn't prevent
-		 * making progress.  The value can easily be so large that
-		 * it is effectively infinite in some congested and
-		 * misconfigured cases, and this is necessary.  Normally
-		 * it is about 8 to 100 (pages), which is quite large.
-		 */
-		trigger = cnt.v_page_count * 2 / usevnodes;
-		if (force < 2)
-			trigger = vsmalltrigger;
-		reclaim_nc_src = force >= 3;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
@@ -1097,33 +997,24 @@ vnlru_proc(void)
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
-			done += vlrureclaim(mp, reclaim_nc_src, trigger);
+			done += vlrureclaim(mp);
 			mtx_lock(&mountlist_mtx);
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp);
 		}
 		mtx_unlock(&mountlist_mtx);
-		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
-			uma_reclaim();
 		if (done == 0) {
-			if (force == 0 || force == 1) {
-				force = 2;
-				continue;
-			}
-			if (force == 2) {
-				force = 3;
-				continue;
-			}
-			force = 0;
+#if 0
+			/* These messages are temporary debugging aids */
+			if (vnlru_nowhere < 5)
+				printf("vnlru process getting nowhere..\n");
+			else if (vnlru_nowhere == 5)
+				printf("vnlru process messages stopped.\n");
+#endif
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		} else
 			kern_yield(PRI_USER);
-		/*
-		 * After becoming active to expand above low water, keep
-		 * active until above high water.
-		 */
-		force = vspace() < vhiwat;
 	}
 }
 
@@ -1197,18 +1088,8 @@ vtryrecycle(struct vnode *vp)
 	return (0);
 }
 
-static void
-vcheckspace(void)
-{
-
-	if (vspace() < vlowat && vnlruproc_sig == 0) {
-		vnlruproc_sig = 1;
-		wakeup(vnlruproc);
-	}
-}
-
 /*
- * Wait if necessary for space for a new vnode.
+ * Wait for available vnodes.
  */
 static int
 getnewvnode_wait(int suspended)
@@ -1215,13 +1096,14 @@ getnewvnode_wait(int suspended)
 {
 
 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
-	if (numvnodes >= desiredvnodes) {
+	if (numvnodes > desiredvnodes) {
 		if (suspended) {
 			/*
-			 * The file system is being suspended.  We cannot
-			 * risk a deadlock here, so allow allocation of
-			 * another vnode even if this would give too many.
+			 * File system is being suspended; we cannot risk a
+			 * deadlock here, so allocate a new vnode anyway.
 			 */
+			if (freevnodes > wantfreevnodes)
+				vnlru_free(freevnodes - wantfreevnodes);
 			return (0);
 		}
 		if (vnlruproc_sig == 0) {
@@ -1231,34 +1113,18 @@ getnewvnode_wait(int suspended)
 			msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
 			    "vlruwk", hz);
 	}
-	/* Post-adjust like the pre-adjust in getnewvnode(). */
-	if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
-		vnlru_free(1);
-	return (numvnodes >= desiredvnodes ? ENFILE : 0);
+	return (numvnodes > desiredvnodes ? ENFILE : 0);
 }
 
-/*
- * This hack is fragile, and probably not needed any more now that the
- * watermark handling works.
- */
 void
 getnewvnode_reserve(u_int count)
 {
 	struct thread *td;
 
-	/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
-	/* XXX no longer so quick, but this part is not racy. */
-	mtx_lock(&vnode_free_list_mtx);
-	if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
-		vnlru_free(ulmin(numvnodes + count - desiredvnodes,
-		    freevnodes - wantfreevnodes));
-	mtx_unlock(&vnode_free_list_mtx);
-
 	td = curthread;
 	/* First try to be quick and racy. */
 	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
 		td->td_vp_reserv += count;
-		vcheckspace();	/* XXX no longer so quick, but more racy */
 		return;
 	} else
 		atomic_subtract_long(&numvnodes, count);
@@ -1271,18 +1137,9 @@ getnewvnode_reserve(u_int count)
 			atomic_add_long(&numvnodes, 1);
 		}
 	}
-	vcheckspace();
 	mtx_unlock(&vnode_free_list_mtx);
 }
 
-/*
- * This hack is fragile, especially if desiredvnodes or wantvnodes are
- * misconfgured or changed significantly.  Reducing desiredvnodes below
- * the reserved amount should cause bizarre behaviour like reducing it
- * below the number of active vnodes -- the system will try to reduce
- * numvnodes to match, but should fail, so the subtraction below should
- * not overflow.
- */
 void
 getnewvnode_drop_reserve(void)
 {
@@ -1303,7 +1160,6 @@ getnewvnode(const char *tag, struct mount *mp, str
 	struct vnode *vp;
 	struct thread *td;
 	struct lock_object *lo;
-	static int cyclecount;
 	int error;
 
 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
@@ -1314,37 +1170,19 @@ getnewvnode(const char *tag, struct mount *mp, str
 		goto alloc;
 	}
 	mtx_lock(&vnode_free_list_mtx);
-	if (numvnodes < desiredvnodes)
-		cyclecount = 0;
-	else if (cyclecount++ >= freevnodes) {
-		cyclecount = 0;
-		vstir = 1;
-	}
 	/*
-	 * Grow the vnode cache if it will not be above its target max
-	 * after growing.  Otherwise, if the free list is nonempty, try
-	 * to reclaim 1 item from it before growing the cache (possibly
-	 * above its target max if the reclamation failed or is delayed).
-	 * Otherwise, wait for some space.  In all cases, schedule
-	 * vnlru_proc() if we are getting short of space.  The watermarks
-	 * should be chosen so that we never wait or even reclaim from
-	 * the free list to below its target minimum.
+	 * Lend our context to reclaim vnodes if they've exceeded the max.
 	 */
-	if (numvnodes + 1 <= desiredvnodes)
-		;
-	else if (freevnodes > 0)
+	if (freevnodes > wantfreevnodes)
 		vnlru_free(1);
-	else {
-		error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
-		    MNTK_SUSPEND));
+	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
+	    MNTK_SUSPEND));
 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
-		if (error != 0) {
-			mtx_unlock(&vnode_free_list_mtx);
-			return (error);
-		}
+	if (error != 0) {
+		mtx_unlock(&vnode_free_list_mtx);
+		return (error);
+	}
#endif
-	}
-	vcheckspace();
 	atomic_add_long(&numvnodes, 1);
 	mtx_unlock(&vnode_free_list_mtx);
 alloc:

Index: .
===================================================================
--- .	(revision 295659)
+++ .	(working copy)

Property changes on: .
___________________________________________________________________
Modified: svn:mergeinfo
## -0,1 +0,0 ##
   Reverse-merged /head:r291244
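
The vntblinit() sizing arithmetic this merge restores can be checked with a
small userspace model.  A minimal sketch, not kernel code: the inputs (page
count, kernel heap size, structure sizes, maxproc) are invented stand-ins for
cnt.v_page_count, vm_kmem_size, and the real struct sizes, and MIN stands in
for the kernel's min().

#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	/* Assumed inputs: 4GB of 4KB pages, 1GB kernel heap. */
	long v_page_count = 1048576;
	long vm_kmem_size = 1073741824;
	long maxproc = 1044;
	long vnode_sz = 480, object_sz = 264;	/* illustrative sizes */

	/*
	 * One vnode per four pages until the 98304 * 4 page clamp is
	 * reached (i.e. until desiredvnodes passes 98,304), then one
	 * vnode per sixteen pages at the margin.
	 */
	long physvnodes = maxproc + v_page_count / 16 +
	    3 * MIN(98304 * 4, v_page_count) / 16;

	/* Vnode plus vm_object memory is capped at 1/7 of the heap. */
	long virtvnodes = vm_kmem_size / (7 * (vnode_sz + object_sz));

	printf("physvnodes %ld, virtvnodes %ld, desiredvnodes %ld\n",
	    physvnodes, virtvnodes, MIN(physvnodes, virtvnodes));
	return (0);
}

With these made-up inputs the physical-memory term gives 140,308 vnodes and
the heap term 206,172, so desiredvnodes would be the smaller 140,308.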
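The restored MAXVNODES_MAX expression can be sanity-checked the same way;
assuming a 4KB page size (PAGE_SIZE is machine-dependent in the kernel), it
evaluates to 8,388,608, which keeps the sixteen-pages-per-vnode ratio out to
the 512GB reevaluation point mentioned in the comment.

#include <stdio.h>

#define PAGE_SIZE	4096	/* assumed; machine-dependent in the kernel */
#define MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))

int
main(void)
{
	/* 512 * (1073741824 / 4096 / 16) = 512 * 16384 = 8388608 */
	printf("MAXVNODES_MAX = %d\n", MAXVNODES_MAX);
	return (0);
}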
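The trigger arithmetic restored in vlrureclaim() is likewise easy to model.
A sketch with names mirroring the kernel code but all values invented: a
vnode whose object holds more resident pages than the trigger is skipped, so
forced reclamation prefers to drop vnodes rather than cached file pages.

#include <stdio.h>

static long
compute_trigger(long v_page_count, long desiredvnodes)
{
	long usevnodes;

	/* Don't let a kern.maxvnodes of zero cause a division by zero. */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	return (v_page_count * 2 / usevnodes);
}

int
main(void)
{
	long trigger = compute_trigger(1048576, 140308);	/* ~14 */
	long resident_pages = 25;	/* hypothetical vnode's page count */

	printf("trigger %ld: vnode with %ld resident pages is %s\n",
	    trigger, resident_pages,
	    resident_pages > trigger ? "skipped" : "reclaimable");
	return (0);
}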
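Finally, the restored vnlru_proc() policy reduces to two comparisons: trim
the free list down to wantfreevnodes, then sleep whenever numvnodes is at or
below 90% of desiredvnodes.  A minimal sketch of just that threshold (no
locking or list handling, sample values invented):

#include <stdbool.h>
#include <stdio.h>

static bool
vnlru_should_sleep(long numvnodes, long desiredvnodes)
{
	return (numvnodes <= desiredvnodes * 9 / 10);
}

int
main(void)
{
	long desiredvnodes = 140308;	/* 90% threshold: 126277 */
	long samples[] = { 100000, 126277, 126278, 140308 };

	for (int i = 0; i < 4; i++)
		printf("numvnodes %ld -> %s\n", samples[i],
		    vnlru_should_sleep(samples[i], desiredvnodes) ?
		    "sleep" : "reclaim");
	return (0);
}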