Index: amd64/amd64/cpu_switch.S =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/cpu_switch.S,v retrieving revision 1.156 diff -u -p -r1.156 cpu_switch.S --- amd64/amd64/cpu_switch.S 30 Mar 2007 00:06:20 -0000 1.156 +++ amd64/amd64/cpu_switch.S 31 May 2007 23:15:57 -0000 @@ -73,19 +73,16 @@ ENTRY(cpu_throw) movq TD_PCB(%rsi),%rdx /* newtd->td_proc */ movq PCB_CR3(%rdx),%rdx movq %rdx,%cr3 /* new address space */ - /* set bit in new pm_active */ - movq TD_PROC(%rsi),%rdx - movq P_VMSPACE(%rdx), %rdx - LK btsl %eax, VM_PMAP+PM_ACTIVE(%rdx) /* set new */ - jmp sw1 + jmp swact /* - * cpu_switch(old, new) + * cpu_switch(old, new, mtx) * * Save the current thread state, then select the next thread to run * and load its state. * %rdi = oldtd * %rsi = newtd + * %rdx = mtx */ ENTRY(cpu_switch) /* Switch to new thread. First, save context. */ @@ -147,17 +144,33 @@ ENTRY(cpu_switch) movq TD_PCB(%rsi),%r8 /* switch address space */ - movq PCB_CR3(%r8),%rdx + movq PCB_CR3(%r8),%rcx movq %cr3,%rax - cmpq %rdx,%rax /* Same address space? */ - je sw1 - movq %rdx,%cr3 /* new address space */ - + cmpq %rcx,%rax /* Same address space? */ + jne swinact + movq %rdx, TD_LOCK(%rdi) /* Release the old thread */ + /* Wait for the new thread to become unblocked */ + movq $blocked_lock, %rdx +1: + movq TD_LOCK(%rsi),%rcx + cmpq %rcx, %rdx + je 1b + jmp sw1 +swinact: + movq %rcx,%cr3 /* new address space */ movl PCPU(CPUID), %eax /* Release bit from old pmap->pm_active */ - movq TD_PROC(%rdi), %rdx /* oldproc */ - movq P_VMSPACE(%rdx), %rdx - LK btrl %eax, VM_PMAP+PM_ACTIVE(%rdx) /* clear old */ + movq TD_PROC(%rdi), %rcx /* oldproc */ + movq P_VMSPACE(%rcx), %rcx + LK btrl %eax, VM_PMAP+PM_ACTIVE(%rcx) /* clear old */ + movq %rdx, TD_LOCK(%rdi) /* Release the old thread */ +swact: + /* Wait for the new thread to become unblocked */ + movq $blocked_lock, %rdx +1: + movq TD_LOCK(%rsi),%rcx + cmpq %rcx, %rdx + je 1b /* Set bit in new pmap->pm_active */ movq TD_PROC(%rsi),%rdx /* newproc */ Index: amd64/amd64/genassym.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/genassym.c,v retrieving revision 1.161 diff -u -p -r1.161 genassym.c --- amd64/amd64/genassym.c 30 Mar 2007 00:06:20 -0000 1.161 +++ amd64/amd64/genassym.c 18 May 2007 10:37:00 -0000 @@ -76,6 +76,7 @@ ASSYM(VM_PMAP, offsetof(struct vmspace, ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); +ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); Index: amd64/amd64/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/machdep.c,v retrieving revision 1.672 diff -u -p -r1.672 machdep.c --- amd64/amd64/machdep.c 31 May 2007 22:52:10 -0000 1.672 +++ amd64/amd64/machdep.c 31 May 2007 20:40:12 -0000 @@ -460,9 +460,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * #ifdef SMP /* Schedule ourselves on the indicated cpu. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu_id); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. 
*/ @@ -473,9 +473,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * intr_restore(reg); #ifdef SMP - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Index: amd64/amd64/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/mp_machdep.c,v retrieving revision 1.285 diff -u -p -r1.285 mp_machdep.c --- amd64/amd64/mp_machdep.c 19 May 2007 05:03:59 -0000 1.285 +++ amd64/amd64/mp_machdep.c 20 May 2007 11:40:23 -0000 @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD: src/sys/amd64/amd64/ #include #include #include +#include #include #include @@ -590,25 +591,7 @@ init_secondary(void) while (smp_started == 0) ia32_pause(); - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - - /* - * Correct spinlock nesting. The idle thread context that we are - * borrowing was created so that it would start out with a single - * spin lock (sched_lock) held in fork_trampoline(). Since we've - * explicitly acquired locks in this function, the nesting count - * is now 2 rather than 1. Since we are nested, calling - * spinlock_exit() will simply adjust the counts without allowing - * spin lock using code to interrupt us. - */ - spinlock_exit(); - KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); - - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - - cpu_throw(NULL, choosethread()); /* doesn't return */ + sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ @@ -988,12 +971,12 @@ ipi_bitmap_handler(struct trapframe fram if (ipi_bitmap & (1 << IPI_PREEMPT)) { struct thread *running_thread = curthread; - mtx_lock_spin(&sched_lock); + thread_lock(running_thread); if (running_thread->td_critnest > 1) running_thread->td_owepreempt = 1; else mi_switch(SW_INVOL | SW_PREEMPT, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(running_thread); } /* Nothing to do for AST */ @@ -1177,11 +1160,9 @@ release_aps(void *dummy __unused) if (mp_ncpus == 1) return; - mtx_lock_spin(&sched_lock); atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); - mtx_unlock_spin(&sched_lock); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); Index: amd64/amd64/mp_watchdog.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/mp_watchdog.c,v retrieving revision 1.4 diff -u -p -r1.4 mp_watchdog.c --- amd64/amd64/mp_watchdog.c 28 Feb 2005 08:55:53 -0000 1.4 +++ amd64/amd64/mp_watchdog.c 31 May 2007 21:22:05 -0000 @@ -105,9 +105,7 @@ watchdog_function(void *arg) * locks to make sure. Then reset the timer. */ mtx_lock(&Giant); - mtx_lock_spin(&sched_lock); watchdog_timer = WATCHDOG_THRESHOLD; - mtx_unlock_spin(&sched_lock); mtx_unlock(&Giant); callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL); } @@ -156,34 +154,6 @@ SYSCTL_PROC(_debug, OID_AUTO, watchdog, sysctl_watchdog, "I", ""); /* - * A badly behaved sysctl that leaks the sched lock when written to. Then - * spin holding it just to make matters worse. This can be used to test the - * effectiveness of the watchdog by generating a fairly hard and nast hang. - * Note that Giant is also held in the current world order when we get here. 
- */ -static int -sysctl_leak_schedlock(SYSCTL_HANDLER_ARGS) -{ - int error, temp; - - temp = 0; - error = sysctl_handle_int(oidp, &temp, 0, req); - if (error) - return (error); - - if (req->newptr != NULL) { - if (temp) { - printf("Leaking the sched lock...\n"); - mtx_lock_spin(&sched_lock); - while (1); - } - } - return (0); -} -SYSCTL_PROC(_debug, OID_AUTO, leak_schedlock, CTLTYPE_INT|CTLFLAG_RW, 0, 0, - sysctl_leak_schedlock, "IU", ""); - -/* * Drop into the debugger by sending an IPI NMI to the boot processor. */ static void Index: amd64/amd64/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/vm_machdep.c,v retrieving revision 1.254 diff -u -p -r1.254 vm_machdep.c --- amd64/amd64/vm_machdep.c 24 Apr 2007 21:17:45 -0000 1.254 +++ amd64/amd64/vm_machdep.c 31 May 2007 21:23:21 -0000 @@ -170,7 +170,7 @@ cpu_fork(td1, p2, td2, flags) * pcb2->pcb_[fg]sbase: cloned above */ - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; @@ -304,7 +304,7 @@ cpu_set_upcall(struct thread *td, struct * pcb2->pcb_[fg]sbase: cloned above */ - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = PSL_KERNEL | PSL_I; } Index: amd64/linux32/linux32_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/linux32/linux32_machdep.c,v retrieving revision 1.43 diff -u -p -r1.43 linux32_machdep.c --- amd64/linux32/linux32_machdep.c 11 May 2007 01:25:50 -0000 1.43 +++ amd64/linux32/linux32_machdep.c 18 May 2007 10:37:01 -0000 @@ -486,10 +486,10 @@ linux_fork(struct thread *td, struct lin /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); return (0); } @@ -529,10 +529,10 @@ linux_vfork(struct thread *td, struct li /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); /* wait for the children to exit, ie. emulate vfork */ PROC_LOCK(p2); @@ -715,10 +715,10 @@ linux_clone(struct thread *td, struct li /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; Index: arm/arm/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/arm/arm/vm_machdep.c,v retrieving revision 1.31 diff -u -p -r1.31 vm_machdep.c --- arm/arm/vm_machdep.c 23 May 2007 13:19:00 -0000 1.31 +++ arm/arm/vm_machdep.c 31 May 2007 21:23:48 -0000 @@ -143,7 +143,7 @@ cpu_fork(register struct thread *td1, re tf->tf_r1 = 0; pcb2->un_32.pcb32_sp = (u_int)sf; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). 
*/ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_cspr = 0; td2->td_md.md_tp = *(uint32_t **)ARM_TP_ADDRESS; @@ -288,7 +288,7 @@ cpu_set_upcall(struct thread *td, struct td->td_pcb->un_32.pcb32_sp = (u_int)sf; td->td_pcb->un_32.pcb32_und_sp = td->td_kstack + USPACE_UNDEF_STACK_TOP; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_cspr = 0; } Index: compat/linprocfs/linprocfs.c =================================================================== RCS file: /usr/home/ncvs/src/sys/compat/linprocfs/linprocfs.c,v retrieving revision 1.114 diff -u -p -r1.114 linprocfs.c --- compat/linprocfs/linprocfs.c 31 May 2007 22:52:11 -0000 1.114 +++ compat/linprocfs/linprocfs.c 31 May 2007 20:40:13 -0000 @@ -636,7 +636,7 @@ linprocfs_doprocstatus(PFS_FILL_ARGS) if (P_SHOULDSTOP(p)) { state = "T (stopped)"; } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); switch(p->p_state) { case PRS_NEW: state = "I (idle)"; @@ -666,7 +666,7 @@ linprocfs_doprocstatus(PFS_FILL_ARGS) state = "? (unknown)"; break; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } fill_kinfo_proc(p, &kp); Index: compat/ndis/subr_ntoskrnl.c =================================================================== RCS file: /usr/home/ncvs/src/sys/compat/ndis/subr_ntoskrnl.c,v retrieving revision 1.88 diff -u -p -r1.88 subr_ntoskrnl.c --- compat/ndis/subr_ntoskrnl.c 25 Dec 2006 17:04:41 -0000 1.88 +++ compat/ndis/subr_ntoskrnl.c 18 May 2007 10:37:01 -0000 @@ -3824,7 +3824,7 @@ ntoskrnl_dpc_thread(arg) * once scheduled by an ISR. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); #ifdef NTOSKRNL_MULTIPLE_DPCS #if __FreeBSD_version >= 502102 sched_bind(curthread, kq->kq_cpu); @@ -3834,7 +3834,7 @@ ntoskrnl_dpc_thread(arg) #if __FreeBSD_version < 600000 curthread->td_base_pri = PRI_MIN_KERN; #endif - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); while (1) { KeWaitForSingleObject(&kq->kq_proc, 0, 0, TRUE, NULL); Index: compat/svr4/svr4_misc.c =================================================================== RCS file: /usr/home/ncvs/src/sys/compat/svr4/svr4_misc.c,v retrieving revision 1.93 diff -u -p -r1.93 svr4_misc.c --- compat/svr4/svr4_misc.c 31 May 2007 22:52:11 -0000 1.93 +++ compat/svr4/svr4_misc.c 31 May 2007 20:40:13 -0000 @@ -1253,12 +1253,12 @@ loop: * See if we have a stopped or continued process. * XXX: This duplicates the same code in kern_wait(). 
*/ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount == p->p_numthreads) && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || uap->options & SVR4_WSTOPPED)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (((uap->options & SVR4_WNOWAIT)) == 0) p->p_flag |= P_WAITED; sx_sunlock(&proctree_lock); @@ -1278,7 +1278,7 @@ loop: DPRINTF(("jobcontrol %d\n", pid)); return (svr4_setinfo(pid, &ru, status, uap->info)); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (uap->options & SVR4_WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_sunlock(&proctree_lock); Index: conf/files =================================================================== RCS file: /usr/home/ncvs/src/sys/conf/files,v retrieving revision 1.1213 diff -u -p -r1.1213 files --- conf/files 31 May 2007 19:47:39 -0000 1.1213 +++ conf/files 31 May 2007 20:40:13 -0000 @@ -1430,6 +1430,7 @@ kern/posix4_mib.c standard kern/sched_4bsd.c optional sched_4bsd kern/sched_core.c optional sched_core kern/sched_ule.c optional sched_ule +kern/sched_smp.c optional sched_smp kern/serdev_if.m standard kern/subr_acl_posix1e.c standard kern/subr_autoconf.c standard Index: conf/options =================================================================== RCS file: /usr/home/ncvs/src/sys/conf/options,v retrieving revision 1.589 diff -u -p -r1.589 options --- conf/options 30 May 2007 17:39:44 -0000 1.589 +++ conf/options 31 May 2007 20:40:13 -0000 @@ -137,6 +137,7 @@ QUOTA SCHED_4BSD opt_sched.h SCHED_CORE opt_sched.h SCHED_ULE opt_sched.h +SCHED_SMP opt_sched.h SHOW_BUSYBUFS SLEEPQUEUE_PROFILING SLHCI_DEBUG opt_slhci.h Index: dev/hwpmc/hwpmc_mod.c =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/hwpmc/hwpmc_mod.c,v retrieving revision 1.28 diff -u -p -r1.28 hwpmc_mod.c --- dev/hwpmc/hwpmc_mod.c 19 Apr 2007 08:02:51 -0000 1.28 +++ dev/hwpmc/hwpmc_mod.c 18 May 2007 10:37:01 -0000 @@ -591,10 +591,10 @@ static void pmc_save_cpu_binding(struct pmc_binding *pb) { PMCDBG(CPU,BND,2, "%s", "save-cpu"); - mtx_lock_spin(&sched_lock); + thread_lock(curthread); pb->pb_bound = sched_is_bound(curthread); pb->pb_cpu = curthread->td_oncpu; - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); PMCDBG(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu); } @@ -607,12 +607,12 @@ pmc_restore_cpu_binding(struct pmc_bindi { PMCDBG(CPU,BND,2, "restore-cpu curcpu=%d restore=%d", curthread->td_oncpu, pb->pb_cpu); - mtx_lock_spin(&sched_lock); + thread_lock(curthread); if (pb->pb_bound) sched_bind(curthread, pb->pb_cpu); else sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); PMCDBG(CPU,BND,2, "%s", "restore-cpu done"); } @@ -631,9 +631,9 @@ pmc_select_cpu(int cpu) "disabled CPU %d", __LINE__, cpu)); PMCDBG(CPU,SEL,2, "select-cpu cpu=%d", cpu); - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); KASSERT(curthread->td_oncpu == cpu, ("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__, Index: dev/md/md.c =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/md/md.c,v retrieving revision 1.168 diff -u -p -r1.168 md.c --- dev/md/md.c 31 May 2007 11:51:49 -0000 1.168 +++ dev/md/md.c 31 May 2007 20:40:15 -0000 @@ -690,9 +690,9 @@ md_kthread(void *arg) int error; sc = arg; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + 
thread_unlock(curthread); if (sc->type == MD_VNODE) curthread->td_pflags |= TDP_NORUNNINGBUF; Index: dev/syscons/syscons.c =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/syscons/syscons.c,v retrieving revision 1.450 diff -u -p -r1.450 syscons.c --- dev/syscons/syscons.c 16 Nov 2006 12:27:51 -0000 1.450 +++ dev/syscons/syscons.c 31 May 2007 21:24:31 -0000 @@ -2326,8 +2326,8 @@ sc_switch_scr(sc_softc_t *sc, u_int next if (sc->new_scp == sc->old_scp) { sc->switch_in_progress = 0; /* - * XXX wakeup() calls mtx_lock(&sched_lock) which will hang if - * sched_lock is in an in-between state, e.g., when we stop at + * XXX wakeup() locks the scheduler lock which will hang if + * the lock is in an in-between state, e.g., when we stop at * a breakpoint at fork_exit. It has always been wrong to call * wakeup() when the debugger is active. In RELENG_4, wakeup() * is supposed to be locked by splhigh(), but the debugger may Index: dev/syscons/syscons.h =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/syscons/syscons.h,v retrieving revision 1.87 diff -u -p -r1.87 syscons.h --- dev/syscons/syscons.h 13 Sep 2006 15:48:15 -0000 1.87 +++ dev/syscons/syscons.h 18 May 2007 10:37:01 -0000 @@ -536,7 +536,7 @@ typedef struct { (*kbdsw[(kbd)->kb_index]->poll)((kbd), (on)) #define SC_VIDEO_LOCKINIT(sc) \ - mtx_init(&(sc)->video_mtx, "syscons video lock", NULL,MTX_SPIN); + mtx_init(&(sc)->video_mtx, "syscons video lock", NULL,MTX_QUIET|MTX_SPIN); #define SC_VIDEO_LOCK(sc) \ do { \ if (!cold) \ Index: fs/procfs/procfs_ctl.c =================================================================== RCS file: /usr/home/ncvs/src/sys/fs/procfs/procfs_ctl.c,v retrieving revision 1.55 diff -u -p -r1.55 procfs_ctl.c --- fs/procfs/procfs_ctl.c 22 Feb 2006 17:20:37 -0000 1.55 +++ fs/procfs/procfs_ctl.c 18 May 2007 10:37:01 -0000 @@ -286,9 +286,9 @@ out: panic("procfs_control"); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_unsuspend(p); /* If it can run, let it do so. 
*/ - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (0); } @@ -344,9 +344,9 @@ procfs_doprocctl(PFS_FILL_ARGS) #endif /* XXXKSE: */ p->p_flag &= ~P_STOPPED_SIG; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } else psignal(p, nm->nm_val); PROC_UNLOCK(p); Index: fs/procfs/procfs_ioctl.c =================================================================== RCS file: /usr/home/ncvs/src/sys/fs/procfs/procfs_ioctl.c,v retrieving revision 1.17 diff -u -p -r1.17 procfs_ioctl.c --- fs/procfs/procfs_ioctl.c 1 May 2007 12:59:20 -0000 1.17 +++ fs/procfs/procfs_ioctl.c 18 May 2007 10:37:01 -0000 @@ -185,9 +185,9 @@ procfs_ioctl(PFS_IOCTL_ARGS) if (P_SHOULDSTOP(p)) { p->p_xstat = sig; p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } else if (sig) psignal(p, sig); #else Index: fs/procfs/procfs_status.c =================================================================== RCS file: /usr/home/ncvs/src/sys/fs/procfs/procfs_status.c,v retrieving revision 1.59 diff -u -p -r1.59 procfs_status.c --- fs/procfs/procfs_status.c 6 Dec 2006 06:34:54 -0000 1.59 +++ fs/procfs/procfs_status.c 18 May 2007 10:37:01 -0000 @@ -112,7 +112,7 @@ procfs_doprocstatus(PFS_FILL_ARGS) sbuf_printf(sb, "noflags"); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); #ifdef KSE if (p->p_flag & P_SA) wmesg = "-kse- "; @@ -127,7 +127,7 @@ procfs_doprocstatus(PFS_FILL_ARGS) } else wmesg = "nochan"; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (p->p_sflag & PS_INMEM) { struct timeval start, ut, st; Index: geom/geom_kern.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/geom_kern.c,v retrieving revision 1.40 diff -u -p -r1.40 geom_kern.c --- geom/geom_kern.c 25 Nov 2005 10:09:30 -0000 1.40 +++ geom/geom_kern.c 18 May 2007 10:37:01 -0000 @@ -88,9 +88,9 @@ g_up_procbody(void) struct thread *tp = FIRST_THREAD_IN_PROC(p); mtx_assert(&Giant, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); + thread_lock(tp); sched_prio(tp, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(tp); for(;;) { g_io_schedule_up(tp); } @@ -111,9 +111,9 @@ g_down_procbody(void) struct thread *tp = FIRST_THREAD_IN_PROC(p); mtx_assert(&Giant, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); + thread_lock(tp); sched_prio(tp, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(tp); for(;;) { g_io_schedule_down(tp); } @@ -134,9 +134,9 @@ g_event_procbody(void) struct thread *tp = FIRST_THREAD_IN_PROC(p); mtx_assert(&Giant, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); + thread_lock(tp); sched_prio(tp, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(tp); for(;;) { g_run_events(); tsleep(&g_wait_event, PRIBIO, "-", hz/10); Index: geom/eli/g_eli.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/eli/g_eli.c,v retrieving revision 1.37 diff -u -p -r1.37 g_eli.c --- geom/eli/g_eli.c 8 Apr 2007 23:54:23 -0000 1.37 +++ geom/eli/g_eli.c 18 May 2007 10:37:01 -0000 @@ -332,11 +332,11 @@ g_eli_worker(void *arg) tsleep(wr, 0, "geli:smp", hz / 4); } #endif - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); if (sc->sc_crypto == G_ELI_CRYPTO_SW && g_eli_threads == 0) sched_bind(curthread, wr->w_number); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); G_ELI_DEBUG(1, "Thread %s started.", curthread->td_proc->p_comm); Index: 
geom/journal/g_journal.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/journal/g_journal.c,v retrieving revision 1.11 diff -u -p -r1.11 g_journal.c --- geom/journal/g_journal.c 6 Apr 2007 12:53:54 -0000 1.11 +++ geom/journal/g_journal.c 18 May 2007 10:37:01 -0000 @@ -2057,9 +2057,9 @@ g_journal_worker(void *arg) time_t last_write; int type; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); sc = arg; type = 0; /* gcc */ Index: geom/mirror/g_mirror.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/mirror/g_mirror.c,v retrieving revision 1.92 diff -u -p -r1.92 g_mirror.c --- geom/mirror/g_mirror.c 1 Nov 2006 22:51:49 -0000 1.92 +++ geom/mirror/g_mirror.c 18 May 2007 10:37:01 -0000 @@ -1768,9 +1768,9 @@ g_mirror_worker(void *arg) int timeout; sc = arg; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { Index: geom/raid3/g_raid3.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/raid3/g_raid3.c,v retrieving revision 1.80 diff -u -p -r1.80 g_raid3.c --- geom/raid3/g_raid3.c 1 Nov 2006 22:51:49 -0000 1.80 +++ geom/raid3/g_raid3.c 18 May 2007 10:37:01 -0000 @@ -2017,9 +2017,9 @@ g_raid3_worker(void *arg) int timeout; sc = arg; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { Index: i386/i386/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/i386/machdep.c,v retrieving revision 1.654 diff -u -p -r1.654 machdep.c --- i386/i386/machdep.c 31 May 2007 22:52:11 -0000 1.654 +++ i386/i386/machdep.c 31 May 2007 20:40:16 -0000 @@ -1058,9 +1058,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * #ifdef SMP /* Schedule ourselves on the indicated cpu. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu_id); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. */ @@ -1071,9 +1071,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * intr_restore(reg); #ifdef SMP - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Index: i386/i386/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/i386/mp_machdep.c,v retrieving revision 1.279 diff -u -p -r1.279 mp_machdep.c --- i386/i386/mp_machdep.c 20 May 2007 22:03:57 -0000 1.279 +++ i386/i386/mp_machdep.c 23 May 2007 15:29:15 -0000 @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD: src/sys/i386/i386/mp #include #include #include +#include #include #include @@ -642,25 +643,8 @@ init_secondary(void) while (smp_started == 0) ia32_pause(); - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - - /* - * Correct spinlock nesting. The idle thread context that we are - * borrowing was created so that it would start out with a single - * spin lock (sched_lock) held in fork_trampoline(). Since we've - * explicitly acquired locks in this function, the nesting count - * is now 2 rather than 1. 
Since we are nested, calling - * spinlock_exit() will simply adjust the counts without allowing - * spin lock using code to interrupt us. - */ - spinlock_exit(); - KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); - - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - - cpu_throw(NULL, choosethread()); /* doesn't return */ + /* enter the scheduler */ + sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ @@ -1194,12 +1178,12 @@ ipi_bitmap_handler(struct trapframe fram #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif - mtx_lock_spin(&sched_lock); + thread_lock(running_thread); if (running_thread->td_critnest > 1) running_thread->td_owepreempt = 1; else mi_switch(SW_INVOL | SW_PREEMPT, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(running_thread); } if (ipi_bitmap & (1 << IPI_AST)) { Index: i386/i386/mp_watchdog.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/i386/mp_watchdog.c,v retrieving revision 1.4 diff -u -p -r1.4 mp_watchdog.c --- i386/i386/mp_watchdog.c 27 Feb 2005 22:34:07 -0000 1.4 +++ i386/i386/mp_watchdog.c 31 May 2007 21:22:34 -0000 @@ -105,9 +105,7 @@ watchdog_function(void *arg) * locks to make sure. Then reset the timer. */ mtx_lock(&Giant); - mtx_lock_spin(&sched_lock); watchdog_timer = WATCHDOG_THRESHOLD; - mtx_unlock_spin(&sched_lock); mtx_unlock(&Giant); callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL); } @@ -156,34 +154,6 @@ SYSCTL_PROC(_debug, OID_AUTO, watchdog, sysctl_watchdog, "I", ""); /* - * A badly behaved sysctl that leaks the sched lock when written to. Then - * spin holding it just to make matters worse. This can be used to test the - * effectiveness of the watchdog by generating a fairly hard and nast hang. - * Note that Giant is also held in the current world order when we get here. - */ -static int -sysctl_leak_schedlock(SYSCTL_HANDLER_ARGS) -{ - int error, temp; - - temp = 0; - error = sysctl_handle_int(oidp, &temp, 0, req); - if (error) - return (error); - - if (req->newptr != NULL) { - if (temp) { - printf("Leaking the sched lock...\n"); - mtx_lock_spin(&sched_lock); - while (1); - } - } - return (0); -} -SYSCTL_PROC(_debug, OID_AUTO, leak_schedlock, CTLTYPE_INT|CTLFLAG_RW, 0, 0, - sysctl_leak_schedlock, "IU", ""); - -/* * Drop into the debugger by sending an IPI NMI to the boot processor. */ static void Index: i386/i386/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/i386/vm_machdep.c,v retrieving revision 1.281 diff -u -p -r1.281 vm_machdep.c --- i386/i386/vm_machdep.c 29 May 2007 18:55:41 -0000 1.281 +++ i386/i386/vm_machdep.c 31 May 2007 21:23:06 -0000 @@ -264,7 +264,7 @@ cpu_fork(td1, p2, td2, flags) } mtx_unlock_spin(&dt_lock); - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; @@ -438,7 +438,7 @@ cpu_set_upcall(struct thread *td, struct */ pcb2->pcb_ext = NULL; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). 
*/ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = PSL_KERNEL | PSL_I; } Index: i386/isa/npx.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/isa/npx.c,v retrieving revision 1.171 diff -u -p -r1.171 npx.c --- i386/isa/npx.c 23 Feb 2007 12:19:00 -0000 1.171 +++ i386/isa/npx.c 18 May 2007 10:37:01 -0000 @@ -230,9 +230,9 @@ npx_intr(dummy) td = PCPU_GET(fpcurthread); if (td != NULL) { td->td_pcb->pcb_flags |= PCB_NPXTRAP; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } return (FILTER_HANDLED); } Index: i386/linux/linux_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/linux/linux_machdep.c,v retrieving revision 1.75 diff -u -p -r1.75 linux_machdep.c --- i386/linux/linux_machdep.c 11 May 2007 01:25:51 -0000 1.75 +++ i386/linux/linux_machdep.c 18 May 2007 10:37:01 -0000 @@ -325,10 +325,10 @@ linux_fork(struct thread *td, struct lin /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); return (0); } @@ -368,10 +368,10 @@ linux_vfork(struct thread *td, struct li /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); /* wait for the children to exit, ie. emulate vfork */ PROC_LOCK(p2); @@ -569,10 +569,10 @@ linux_clone(struct thread *td, struct li /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; Index: ia64/ia64/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ia64/ia64/machdep.c,v retrieving revision 1.217 diff -u -p -r1.217 machdep.c --- ia64/ia64/machdep.c 31 May 2007 22:52:12 -0000 1.217 +++ ia64/ia64/machdep.c 31 May 2007 22:31:00 -0000 @@ -356,7 +356,7 @@ cpu_reset() } void -cpu_switch(struct thread *old, struct thread *new) +cpu_switch(struct thread *old, struct thread *new, struct mtx *mtx) { struct pcb *oldpcb, *newpcb; Index: ia64/ia64/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ia64/ia64/mp_machdep.c,v retrieving revision 1.62 diff -u -p -r1.62 mp_machdep.c --- ia64/ia64/mp_machdep.c 18 Nov 2006 21:52:26 -0000 1.62 +++ ia64/ia64/mp_machdep.c 23 May 2007 20:17:13 -0000 @@ -111,16 +111,6 @@ ia64_ap_startup(void) PCPU_SET(curthread, PCPU_GET(idlethread)); /* - * Correct spinlock nesting. The idle thread context that we are - * borrowing was created so that it would start out with a single - * spin lock (sched_lock) held in fork_trampoline(). Since we - * don't have any locks and explicitly acquire locks when we need - * to, the nesting count will be off by 1. - */ - curthread->td_md.md_spinlock_count = 0; - critical_exit(); - - /* * Get and save the CPU specific MCA records. Should we get the * MCA state for each processor, or just the CMC state? 
*/ @@ -133,17 +123,12 @@ ia64_ap_startup(void) CTR1(KTR_SMP, "SMP: cpu%d launched", PCPU_GET(cpuid)); - mtx_lock_spin(&sched_lock); - - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - ia64_set_tpr(0); /* kick off the clock on this AP */ pcpu_initclock(); - cpu_throw(NULL, choosethread()); + sched_throw(NULL); /* NOTREACHED */ } Index: ia64/ia64/pmap.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ia64/ia64/pmap.c,v retrieving revision 1.187 diff -u -p -r1.187 pmap.c --- ia64/ia64/pmap.c 31 May 2007 22:52:12 -0000 1.187 +++ ia64/ia64/pmap.c 31 May 2007 20:40:16 -0000 @@ -2235,8 +2235,7 @@ pmap_switch(pmap_t pm) pmap_t prevpm; int i; - mtx_assert(&sched_lock, MA_OWNED); - + THREAD_LOCK_ASSERT(curthread, MA_OWNED); prevpm = PCPU_GET(current_pmap); if (prevpm == pm) return (prevpm); @@ -2263,10 +2262,13 @@ static pmap_t pmap_install(pmap_t pm) { pmap_t prevpm; + struct thread *td; - mtx_lock_spin(&sched_lock); + td = curthread; + thread_lock(td); prevpm = pmap_switch(pm); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + return (prevpm); } Index: ia64/ia64/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ia64/ia64/vm_machdep.c,v retrieving revision 1.93 diff -u -p -r1.93 vm_machdep.c --- ia64/ia64/vm_machdep.c 16 May 2006 14:32:15 -0000 1.93 +++ ia64/ia64/vm_machdep.c 31 May 2007 21:24:52 -0000 @@ -159,7 +159,7 @@ cpu_set_upcall(struct thread *td, struct pcb->pcb_special.rp = FDESC_FUNC(fork_trampoline); cpu_set_fork_handler(td, (void (*)(void*))fork_return, td); - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release the spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_intr = 1; } @@ -284,7 +284,7 @@ cpu_fork(struct thread *td1, struct proc td2->td_pcb->pcb_special.rp = FDESC_FUNC(fork_trampoline); cpu_set_fork_handler(td2, (void (*)(void*))fork_return, td2); - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release the spin count in fork_exit(). 
*/ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_intr = 1; } Index: kern/init_main.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/init_main.c,v retrieving revision 1.277 diff -u -p -r1.277 init_main.c --- kern/init_main.c 1 Jun 2007 01:12:43 -0000 1.277 +++ kern/init_main.c 31 May 2007 20:40:16 -0000 @@ -713,9 +713,9 @@ create_init(const void *udata __unused) PROC_UNLOCK(initproc); crfree(oldcred); cred_update_thread(FIRST_THREAD_IN_PROC(initproc)); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(initproc); initproc->p_sflag |= PS_INMEM; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(initproc); cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) @@ -729,9 +729,9 @@ kick_init(const void *udata __unused) struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); - mtx_lock_spin(&sched_lock); + thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) Index: kern/kern_acct.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_acct.c,v retrieving revision 1.91 diff -u -p -r1.91 kern_acct.c --- kern/kern_acct.c 1 Jun 2007 01:12:43 -0000 1.91 +++ kern/kern_acct.c 31 May 2007 20:40:16 -0000 @@ -612,9 +612,9 @@ acct_thread(void *dummy) /* This is a low-priority kernel thread. */ pri = PRI_MAX_KERN; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); /* If another accounting kthread is already running, just die. */ sx_xlock(&acct_sx); Index: kern/kern_clock.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_clock.c,v retrieving revision 1.199 diff -u -p -r1.199 kern_clock.c --- kern/kern_clock.c 1 Jun 2007 01:12:43 -0000 1.199 +++ kern/kern_clock.c 31 May 2007 21:59:43 -0000 @@ -201,32 +201,35 @@ hardclock_cpu(int usermode) struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; + int ast; /* * Run current process's virtual and profile time, as needed. */ - mtx_lock_spin_flags(&sched_lock, MTX_QUIET); - sched_tick(); -#ifdef KSE -#if 0 /* for now do nothing */ - if (p->p_flag & P_SA) { - /* XXXKSE What to do? Should do more. 
*/ - } -#endif -#endif pstats = p->p_stats; + ast = 0; if (usermode && - timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && - itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { - p->p_sflag |= PS_ALRMPEND; - td->td_flags |= TDF_ASTPENDING; + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) { + PROC_SLOCK(p); + if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { + p->p_sflag |= PS_ALRMPEND; + ast = 1; + } + PROC_SUNLOCK(p); } - if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && - itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { - p->p_sflag |= PS_PROFPEND; - td->td_flags |= TDF_ASTPENDING; + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) { + PROC_SLOCK(p); + if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { + p->p_sflag |= PS_PROFPEND; + ast = 1; + } + PROC_SUNLOCK(p); } - mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); + thread_lock(td); + sched_tick(); + if (ast) + td->td_flags |= TDF_ASTPENDING; + thread_unlock(td); #ifdef HWPMC_HOOKS if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid))) @@ -272,8 +275,8 @@ hardclock(int usermode, uintfptr_t pc) mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); /* - * swi_sched acquires sched_lock, so we don't want to call it with - * callout_lock held; incorrect locking order. + * swi_sched acquires the thread lock, so we don't want to call it + * with callout_lock held; incorrect locking order. */ if (need_softclock) swi_sched(softclock_ih, 0); @@ -411,6 +414,7 @@ statclock(int usermode) td = curthread; p = td->td_proc; + thread_lock_flags(td, MTX_QUIET); if (usermode) { /* * Charge the time as appropriate. @@ -420,11 +424,10 @@ statclock(int usermode) thread_statclock(1); #endif td->td_uticks++; - mtx_lock_spin_flags(&time_lock, MTX_QUIET); if (p->p_nice > NZERO) - cp_time[CP_NICE]++; + atomic_add_long(&cp_time[CP_NICE], 1); else - cp_time[CP_USER]++; + atomic_add_long(&cp_time[CP_USER], 1); } else { /* * Came from kernel mode, so we were: @@ -441,8 +444,7 @@ statclock(int usermode) if ((td->td_pflags & TDP_ITHREAD) || td->td_intr_nesting_level >= 2) { td->td_iticks++; - mtx_lock_spin_flags(&time_lock, MTX_QUIET); - cp_time[CP_INTR]++; + atomic_add_long(&cp_time[CP_INTR], 1); } else { #ifdef KSE if (p->p_flag & P_SA) @@ -450,19 +452,12 @@ statclock(int usermode) #endif td->td_pticks++; td->td_sticks++; - mtx_lock_spin_flags(&time_lock, MTX_QUIET); if (!TD_IS_IDLETHREAD(td)) - cp_time[CP_SYS]++; + atomic_add_long(&cp_time[CP_SYS], 1); else - cp_time[CP_IDLE]++; + atomic_add_long(&cp_time[CP_IDLE], 1); } } - mtx_unlock_spin_flags(&time_lock, MTX_QUIET); - CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d", - td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz); - - mtx_lock_spin_flags(&sched_lock, MTX_QUIET); - sched_clock(td); /* Update resource usage integrals and maximums. 
*/ MPASS(p->p_vmspace != NULL); @@ -474,7 +469,10 @@ statclock(int usermode) rss = pgtok(vmspace_resident_count(vm)); if (ru->ru_maxrss < rss) ru->ru_maxrss = rss; - mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); + CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d", + td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz); + sched_clock(td); + thread_unlock(td); } void Index: kern/kern_condvar.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_condvar.c,v retrieving revision 1.61 diff -u -p -r1.61 kern_condvar.c --- kern/kern_condvar.c 8 May 2007 21:49:59 -0000 1.61 +++ kern/kern_condvar.c 18 May 2007 10:37:01 -0000 @@ -394,8 +394,8 @@ cv_signal(struct cv *cvp) if (cvp->cv_waiters > 0) { cvp->cv_waiters--; sleepq_signal(cvp, SLEEPQ_CONDVAR, -1, 0); - } else - sleepq_release(cvp); + } + sleepq_release(cvp); } /* Index: kern/kern_cpu.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_cpu.c,v retrieving revision 1.24 diff -u -p -r1.24 kern_cpu.c --- kern/kern_cpu.c 26 Mar 2007 18:03:29 -0000 1.24 +++ kern/kern_cpu.c 18 May 2007 10:37:01 -0000 @@ -298,17 +298,17 @@ cf_set_method(device_t dev, const struct cpu_id = PCPU_GET(cpuid); pc = cpu_get_pcpu(set->dev); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq, device_get_nameunit(set->dev), PCPU_GET(cpuid)); error = CPUFREQ_DRV_SET(set->dev, set); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } if (error) { goto out; @@ -327,17 +327,17 @@ cf_set_method(device_t dev, const struct cpu_id = PCPU_GET(cpuid); pc = cpu_get_pcpu(set->dev); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq, device_get_nameunit(set->dev), PCPU_GET(cpuid)); error = CPUFREQ_DRV_SET(set->dev, set); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } if (error) { /* XXX Back out any successful setting? */ Index: kern/kern_exit.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_exit.c,v retrieving revision 1.299 diff -u -p -r1.299 kern_exit.c --- kern/kern_exit.c 1 Jun 2007 01:12:43 -0000 1.299 +++ kern/kern_exit.c 31 May 2007 21:02:00 -0000 @@ -523,12 +523,13 @@ retry: * proc lock. */ wakeup(p->p_pptr); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p->p_pptr); + sched_exit(p->p_pptr, td); + PROC_SUNLOCK(p->p_pptr); + PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); - sched_exit(p->p_pptr, td); - /* * Hopefully no one will try to deliver a signal to the process this * late in the game. @@ -718,12 +719,13 @@ loop: * in thread_exit() after having dropped the process * lock via PROC_UNLOCK() but before it has completed * cpu_throw(). 
In that case, the other thread must - * still hold sched_lock, so simply by acquiring - * sched_lock once we will wait long enough for the + * still hold the proc slock, so simply by acquiring + * proc slock once we will wait long enough for the * thread to exit in that case. + * XXX This is questionable. */ - mtx_lock_spin(&sched_lock); - mtx_unlock_spin(&sched_lock); + PROC_SLOCK(p); + PROC_SUNLOCK(p); td->td_retval[0] = p->p_pid; if (status) @@ -820,12 +822,12 @@ loop: sx_xunlock(&allproc_lock); return (0); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount == p->p_numthreads) && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || options & WUNTRACED)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; @@ -839,7 +841,7 @@ loop: return (0); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; Index: kern/kern_fork.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_fork.c,v retrieving revision 1.274 diff -u -p -r1.274 kern_fork.c --- kern/kern_fork.c 1 Jun 2007 01:12:43 -0000 1.274 +++ kern/kern_fork.c 31 May 2007 21:44:45 -0000 @@ -407,8 +407,15 @@ again: lastpid = trypid; p2 = newproc; + td2 = FIRST_THREAD_IN_PROC(newproc); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; + /* + * Allow the scheduler to initialize the child. + */ + thread_lock(td); + sched_fork(td, td2); + thread_unlock(td); AUDIT_ARG(pid, p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); @@ -476,8 +483,6 @@ again: * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ - td2 = FIRST_THREAD_IN_PROC(p2); - /* Allocate and switch to an alternate kstack if specified. */ if (pages != 0) vm_thread_new_altkstack(td2, pages); @@ -501,15 +506,9 @@ again: p2->p_flag = 0; if (p1->p_flag & P_PROFIL) startprofclock(p2); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p2); p2->p_sflag = PS_INMEM; - /* - * Allow the scheduler to adjust the priority of the child and - * parent while we hold the sched_lock. - */ - sched_fork(td, td2); - - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p2); td2->td_ucred = crhold(p2->p_ucred); #ifdef AUDIT audit_proc_fork(p1, p2); @@ -693,18 +692,20 @@ again: * Set the child start time and mark the process as being complete. */ microuptime(&p2->p_stats->p_start); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; + PROC_SUNLOCK(p2); /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ if ((flags & RFSTOPPED) == 0) { + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); + thread_unlock(td2); } - mtx_unlock_spin(&sched_lock); /* * Now can be swapped. @@ -778,31 +779,14 @@ fork_exit(callout, arg, frame) struct proc *p; struct thread *td; - /* - * Finish setting up thread glue so that it begins execution in a - * non-nested critical section with sched_lock held but not recursed. 
- */ td = curthread; p = td->td_proc; - td->td_oncpu = PCPU_GET(cpuid); KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); - sched_lock.mtx_lock = (uintptr_t)td; - mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)", td, td->td_sched, p->p_pid, p->p_comm); - /* - * Processes normally resume in mi_switch() after being - * cpu_switch()'ed to, but when children start up they arrive here - * instead, so we must do much the same things as mi_switch() would. - */ - if ((td = PCPU_GET(deadthread))) { - PCPU_SET(deadthread, NULL); - thread_stash(td); - } - mtx_unlock_spin(&sched_lock); - + sched_fork_exit(td); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. Index: kern/kern_idle.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_idle.c,v retrieving revision 1.47 diff -u -p -r1.47 kern_idle.c --- kern/kern_idle.c 23 Jan 2007 08:46:50 -0000 1.47 +++ kern/kern_idle.c 18 May 2007 10:37:01 -0000 @@ -73,13 +73,13 @@ idle_setup(void *dummy) PROC_LOCK(p); p->p_flag |= P_NOLOAD; - mtx_lock_spin(&sched_lock); td = FIRST_THREAD_IN_PROC(p); + thread_lock(td); TD_SET_CAN_RUN(td); td->td_flags |= TDF_IDLETD; sched_class(td, PRI_IDLE); sched_prio(td, PRI_MAX_IDLE); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_UNLOCK(p); #ifdef SMP } Index: kern/kern_intr.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_intr.c,v retrieving revision 1.145 diff -u -p -r1.145 kern_intr.c --- kern/kern_intr.c 31 May 2007 19:25:33 -0000 1.145 +++ kern/kern_intr.c 31 May 2007 21:02:57 -0000 @@ -173,9 +173,9 @@ ithread_update(struct intr_thread *ithd) /* Update name and priority. */ strlcpy(td->td_proc->p_comm, ie->ie_fullname, sizeof(td->td_proc->p_comm)); - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -342,10 +342,10 @@ ithread_create(const char *name) if (error) panic("kthread_create() failed with %d", error); td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); @@ -367,10 +367,10 @@ ithread_create(const char *name, struct if (error) panic("kthread_create() failed with %d", error); td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); @@ -385,13 +385,13 @@ ithread_destroy(struct intr_thread *ithr CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name); td = ithread->it_thread; - mtx_lock_spin(&sched_lock); + thread_lock(td); ithread->it_flags |= IT_DEAD; if (TD_AWAITING_INTR(td)) { TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } #ifndef INTR_FILTER @@ -622,7 +622,7 @@ ok: * so we have to remove the handler here rather than letting the * thread do it. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(ie->ie_thread->it_thread); if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; @@ -634,7 +634,7 @@ ok: ie->ie_thread->it_need = 1; } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); - mtx_unlock_spin(&sched_lock); + thread_unlock(ie->ie_thread->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); intr_event_update(ie); @@ -699,11 +699,11 @@ intr_event_schedule_thread(struct intr_e /* * Set it_need to tell the thread to keep running if it is already - * running. Then, grab sched_lock and see if we actually need to - * put this thread on the runqueue. + * running. Then, lock the thread and see if we actually need to + * put it on the runqueue. */ it->it_need = 1; - mtx_lock_spin(&sched_lock); + thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, p->p_comm); @@ -713,7 +713,7 @@ intr_event_schedule_thread(struct intr_e CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, p->p_comm, it->it_need, td->td_state); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); return (0); } @@ -771,7 +771,7 @@ ok: * so we have to remove the handler here rather than letting the * thread do it. */ - mtx_lock_spin(&sched_lock); + thread_lock(it->it_thread); if (!TD_AWAITING_INTR(it->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; @@ -783,7 +783,7 @@ ok: it->it_need = 1; } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); - mtx_unlock_spin(&sched_lock); + thread_unlock(it->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); /* @@ -853,11 +853,11 @@ intr_event_schedule_thread(struct intr_e /* * Set it_need to tell the thread to keep running if it is already - * running. Then, grab sched_lock and see if we actually need to - * put this thread on the runqueue. + * running. Then, lock the thread and see if we actually need to + * put it on the runqueue. */ it->it_need = 1; - mtx_lock_spin(&sched_lock); + thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, p->p_comm); @@ -867,7 +867,7 @@ intr_event_schedule_thread(struct intr_e CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, p->p_comm, it->it_need, td->td_state); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); return (0); } @@ -1128,13 +1128,13 @@ ithread_loop(void *arg) * lock. This may take a while and it_need may get * set again, so we have to check it again. */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL, NULL); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } #else @@ -1202,13 +1202,13 @@ ithread_loop(void *arg) * lock. This may take a while and it_need may get * set again, so we have to check it again. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(td); if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL, NULL); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } Index: kern/kern_kse.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_kse.c,v retrieving revision 1.229 diff -u -p -r1.229 kern_kse.c --- kern/kern_kse.c 21 Mar 2007 21:20:50 -0000 1.229 +++ kern/kern_kse.c 20 May 2007 11:38:23 -0000 @@ -57,7 +57,7 @@ extern int thread_debug; extern int max_threads_per_proc; extern int max_groups_per_proc; extern int max_threads_hits; -extern struct mtx kse_zombie_lock; +extern struct mtx kse_lock; TAILQ_HEAD(, kse_upcall) zombie_upcalls = @@ -66,6 +66,9 @@ TAILQ_HEAD(, kse_upcall) zombie_upcalls static int thread_update_usr_ticks(struct thread *td); static void thread_alloc_spare(struct thread *td); +struct mtx kse_lock; +MTX_SYSINIT(kse_lock, &kse_lock, "kse lock", MTX_SPIN); + struct kse_upcall * upcall_alloc(void) { @@ -86,7 +89,7 @@ void upcall_link(struct kse_upcall *ku, struct proc *p) { - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); TAILQ_INSERT_TAIL(&p->p_upcalls, ku, ku_link); ku->ku_proc = p; } @@ -96,7 +99,7 @@ upcall_unlink(struct kse_upcall *ku) { struct proc *p = ku->ku_proc; - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); KASSERT(ku->ku_owner == NULL, ("%s: have owner", __func__)); TAILQ_REMOVE(&p->p_upcalls, ku, ku_link); upcall_stash(ku); @@ -106,7 +109,7 @@ void upcall_remove(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED); if (td->td_upcall != NULL) { /* * If we are not a bound thread then decrement the count of @@ -128,6 +131,16 @@ struct kse_switchin_args { }; #endif +#ifdef KSE +void +kse_unlink(struct thread *td) +{ + mtx_lock_spin(&kse_lock); + thread_unlink(td); + mtx_unlock_spin(&kse_lock); +} +#endif + int kse_switchin(struct thread *td, struct kse_switchin_args *uap) { @@ -160,11 +173,11 @@ kse_switchin(struct thread *td, struct k else ptrace_clear_single_step(td); if (tmbx.tm_dflags & TMDF_SUSPEND) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(td->td_proc); /* fuword can block, check again */ if (td->td_upcall) ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(td->td_proc); } _PRELE(td->td_proc); } @@ -208,23 +221,25 @@ kse_thr_interrupt(struct thread *td, str case KSE_INTR_INTERRUPT: case KSE_INTR_RESTART: PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_mailbox == uap->tmbx) break; } if (td2 == NULL) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (ESRCH); } + thread_lock(td2); + PROC_SUNLOCK(p); if (uap->cmd == KSE_INTR_SENDSIG) { if (uap->data > 0) { td2->td_flags &= ~TDF_INTERRUPT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); tdsignal(p, td2, (int)uap->data, NULL); } else { - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); } } else { td2->td_flags |= TDF_INTERRUPT | TDF_ASTPENDING; @@ -236,7 +251,7 @@ kse_thr_interrupt(struct thread *td, str td2->td_intrval = ERESTART; if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) sleepq_abort(td2, td2->td_intrval); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); } PROC_UNLOCK(p); break; @@ -261,12 +276,14 @@ kse_thr_interrupt(struct thread *td, str if (!(flags & TMDF_SUSPEND)) break; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_stopped(p); - 
thread_suspend_one(td); PROC_UNLOCK(p); + thread_lock(td); + thread_suspend_one(td); + PROC_SUNLOCK(p); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } return (0); @@ -331,18 +348,18 @@ kse_exit(struct thread *td, struct kse_e */ count = 0; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_UPCALL_IN_PROC(p, ku2) { if ((ku2->ku_flags & KUF_EXITING) == 0) count++; } if (count == 1 && (p->p_numthreads > 1)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (EDEADLK); } ku->ku_flags |= KUF_EXITING; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); /* @@ -358,7 +375,7 @@ kse_exit(struct thread *td, struct kse_e if (error) psignal(p, SIGSEGV); sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); upcall_remove(td); if (p->p_numthreads != 1) { thread_stopped(p); @@ -376,7 +393,7 @@ kse_exit(struct thread *td, struct kse_e * The other possibility would be to let the process exit. */ thread_unthread(td); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); #if 0 return (0); @@ -458,9 +475,9 @@ kse_release(struct thread *td, struct ks PROC_UNLOCK(p); } if (ku->ku_flags & KUF_DOUPCALL) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); ku->ku_flags &= ~KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } return (0); #else /* !KSE */ @@ -486,7 +503,7 @@ kse_wakeup(struct thread *td, struct kse if (!(p->p_flag & P_SA)) return (EINVAL); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->mbx) { FOREACH_UPCALL_IN_PROC(p, ku) { if (ku->ku_mailbox == uap->mbx) @@ -494,7 +511,7 @@ kse_wakeup(struct thread *td, struct kse } } else { if (p->p_upsleeps) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); wakeup(&p->p_completed); PROC_UNLOCK(p); return (0); @@ -502,15 +519,14 @@ kse_wakeup(struct thread *td, struct kse ku = TAILQ_FIRST(&p->p_upcalls); } if (ku == NULL) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (ESRCH); } if ((td2 = ku->ku_owner) == NULL) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); panic("%s: no owner", __func__); } else if (td2->td_kflags & (TDK_KSEREL | TDK_KSERELSIG)) { - mtx_unlock_spin(&sched_lock); if (!(td2->td_kflags & TDK_WAKEUP)) { td2->td_kflags |= TDK_WAKEUP; if (td2->td_kflags & TDK_KSEREL) @@ -520,8 +536,8 @@ kse_wakeup(struct thread *td, struct kse } } else { ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); } + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (0); #else /* !KSE */ @@ -621,7 +637,7 @@ kse_create(struct thread *td, struct kse if (td->td_standin == NULL) thread_alloc_spare(td); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * If we are the first time, and a normal thread, * then transfer all the signals back to the 'process'. @@ -648,6 +664,7 @@ kse_create(struct thread *td, struct kse * Each upcall structure has an owner thread, find which * one owns it. */ + thread_lock(td); if (uap->newgroup) { /* * The newgroup parameter now means @@ -674,7 +691,8 @@ kse_create(struct thread *td, struct kse newtd = thread_schedule_upcall(td, newku); } } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); /* * Let the UTS instance know its LWPID. @@ -699,9 +717,9 @@ kse_create(struct thread *td, struct kse * If we are starting a new thread, kick it off. 
*/ if (newtd != td) { - mtx_lock_spin(&sched_lock); + thread_lock(newtd); sched_add(newtd, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(newtd); } } else { newtd->td_pflags &= ~TDP_SA; @@ -734,9 +752,9 @@ kse_create(struct thread *td, struct kse _PRELE(p); } PROC_UNLOCK(p); - mtx_lock_spin(&sched_lock); + thread_lock(newtd); sched_add(newtd, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(newtd); } } return (0); @@ -764,9 +782,9 @@ kseinit(void) void upcall_stash(struct kse_upcall *ku) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&kse_lock); TAILQ_INSERT_HEAD(&zombie_upcalls, ku, ku_link); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&kse_lock); } /* @@ -782,11 +800,11 @@ kse_GC(void) * we really don't care about the next instant.. */ if (!TAILQ_EMPTY(&zombie_upcalls)) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&kse_lock); ku_first = TAILQ_FIRST(&zombie_upcalls); if (ku_first) TAILQ_INIT(&zombie_upcalls); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&kse_lock); while (ku_first) { ku_next = TAILQ_NEXT(ku_first, ku_link); upcall_free(ku_first); @@ -818,9 +836,9 @@ thread_export_context(struct thread *td, */ PROC_LOCK(p); if (td->td_flags & TDF_NEEDSIGCHK) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_NEEDSIGCHK; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) postsig(sig); @@ -921,9 +939,9 @@ thread_statclock(int user) return (0); if (user) { /* Current always do via ast() */ - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_uuticks++; } else if (td->td_mailbox != NULL) td->td_usticks++; @@ -966,7 +984,7 @@ error: /* * This function is intended to be used to initialize a spare thread - * for upcall. Initialize thread's large data area outside sched_lock + * for upcall. Initialize thread's large data area outside the thread lock * for thread_schedule_upcall(). The crhold is also here to get it out * from the schedlock as it has a mutex op itself. * XXX BUG.. we need to get the cr ref after the thread has @@ -996,7 +1014,7 @@ thread_schedule_upcall(struct thread *td { struct thread *td2; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Schedule an upcall thread on specified kse_upcall, @@ -1018,7 +1036,10 @@ thread_schedule_upcall(struct thread *td */ bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); + sched_fork_thread(td, td2); + mtx_lock_spin(&kse_lock); thread_link(td2, ku->ku_proc); + mtx_unlock_spin(&kse_lock); /* inherit parts of blocked thread's context as a good template */ cpu_set_upcall(td2, td); /* Let the new thread become owner of the upcall */ @@ -1030,7 +1051,6 @@ thread_schedule_upcall(struct thread *td td2->td_inhibitors = 0; SIGFILLSET(td2->td_sigmask); SIG_CANTMASK(td2->td_sigmask); - sched_fork_thread(td, td2); return (td2); /* bogus.. 
should be a void function */ } @@ -1069,7 +1089,7 @@ thread_switchout(struct thread *td, int struct kse_upcall *ku; struct thread *td2; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If the outgoing thread is in threaded group and has never @@ -1101,7 +1121,9 @@ thread_switchout(struct thread *td, int td->td_pflags &= ~TDP_CAN_UNBIND; td2 = thread_schedule_upcall(td, ku); if (flags & SW_INVOL || nextthread) { + thread_lock(td2); sched_add(td2, SRQ_YIELDING); + thread_unlock(td2); } else { /* Keep up with reality.. we have one extra thread * in the picture.. and it's 'running'. @@ -1171,11 +1193,11 @@ thread_user_enter(struct thread *td) if (__predict_false(p->p_flag & P_TRACED)) { flags = fuword32(&tmbx->tm_dflags); if (flags & TMDF_SUSPEND) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(td->td_proc); /* fuword can block, check again */ if (td->td_upcall) ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(td->td_proc); } } } @@ -1256,7 +1278,7 @@ thread_userret(struct thread *td, struct WITNESS_WARN(WARN_PANIC, &p->p_mtx.lock_object, "thread exiting in userret"); sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_stopped(p); thread_exit(); /* NOTREACHED */ @@ -1268,22 +1290,22 @@ thread_userret(struct thread *td, struct if (p->p_numthreads > max_threads_per_proc) { max_threads_hits++; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_maxthrwaits++; while (p->p_numthreads > max_threads_per_proc) { if (p->p_numupcalls >= max_threads_per_proc) break; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH, "maxthreads", hz/10) != EWOULDBLOCK) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); break; } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); } } p->p_maxthrwaits--; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); } @@ -1300,9 +1322,9 @@ thread_userret(struct thread *td, struct td->td_pflags &= ~TDP_UPCALLING; if (ku->ku_flags & KUF_DOUPCALL) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); ku->ku_flags &= ~KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } /* * Set user context to the UTS @@ -1390,9 +1412,9 @@ thread_continued(struct proc *p) td = TAILQ_FIRST(&p->p_threads); if (td && (td->td_pflags & TDP_SA)) { FOREACH_UPCALL_IN_PROC(p, ku) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); wakeup(&p->p_completed); } } Index: kern/kern_kthread.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_kthread.c,v retrieving revision 1.37 diff -u -p -r1.37 kern_kthread.c --- kern/kern_kthread.c 23 Jan 2007 08:46:50 -0000 1.37 +++ kern/kern_kthread.c 18 May 2007 10:37:01 -0000 @@ -113,9 +113,9 @@ kthread_create(void (*func)(void *), voi /* Delay putting it on the run queue until now. 
*/ if (!(flags & RFSTOPPED)) { - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_add(td, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } return 0; Index: kern/kern_lockf.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_lockf.c,v retrieving revision 1.54 diff -u -p -r1.54 kern_lockf.c --- kern/kern_lockf.c 29 Mar 2005 08:13:01 -0000 1.54 +++ kern/kern_lockf.c 18 May 2007 10:37:01 -0000 @@ -266,16 +266,19 @@ lf_setlock(lock) */ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) { - register struct proc *wproc; + struct proc *wproc; + struct proc *nproc; struct thread *td; - register struct lockf *waitblock; + struct lockf *waitblock; int i = 0; /* The block is waiting on something */ - /* XXXKSE this is not complete under threads */ wproc = (struct proc *)block->lf_id; - mtx_lock_spin(&sched_lock); +restart: + nproc = NULL; + PROC_SLOCK(wproc); FOREACH_THREAD_IN_PROC(wproc, td) { + thread_lock(td); while (td->td_wchan && (td->td_wmesg == lockstr) && (i++ < maxlockdepth)) { @@ -284,15 +287,20 @@ lf_setlock(lock) waitblock = waitblock->lf_next; if ((waitblock->lf_flags & F_POSIX) == 0) break; - wproc = (struct proc *)waitblock->lf_id; - if (wproc == (struct proc *)lock->lf_id) { - mtx_unlock_spin(&sched_lock); + nproc = (struct proc *)waitblock->lf_id; + if (nproc == (struct proc *)lock->lf_id) { + PROC_SUNLOCK(wproc); + thread_unlock(td); free(lock, M_LOCKF); return (EDEADLK); } } + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(wproc); + wproc = nproc; + if (wproc) + goto restart; } /* * For flock type locks, we must first remove Index: kern/kern_mutex.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_mutex.c,v retrieving revision 1.190 diff -u -p -r1.190 kern_mutex.c --- kern/kern_mutex.c 18 May 2007 15:04:59 -0000 1.190 +++ kern/kern_mutex.c 31 May 2007 22:07:57 -0000 @@ -127,6 +127,7 @@ struct lock_class lock_class_mtx_spin = /* * System-wide mutexes */ +struct mtx blocked_lock; struct mtx sched_lock; struct mtx Giant; @@ -305,6 +306,7 @@ void _mtx_lock_sleep(struct mtx *m, uintptr_t tid, int opts, const char *file, int line) { + struct turnstile *ts; #ifdef ADAPTIVE_MUTEXES volatile struct thread *owner; #endif @@ -334,7 +336,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t m->lock_object.lo_name, (void *)m->mtx_lock, file, line); while (!_obtain_lock(m, tid)) { - turnstile_lock(&m->lock_object); + ts = turnstile_trywait(&m->lock_object); v = m->mtx_lock; /* @@ -342,7 +344,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t * the turnstile chain lock. 
*/ if (v == MTX_UNOWNED) { - turnstile_release(&m->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -358,7 +360,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t */ if (v == MTX_CONTESTED) { m->mtx_lock = tid | MTX_CONTESTED; - turnstile_claim(&m->lock_object); + turnstile_claim(ts); break; } #endif @@ -370,7 +372,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t */ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) { - turnstile_release(&m->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -387,7 +389,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t if (m != &Giant && TD_IS_RUNNING(owner)) #endif { - turnstile_release(&m->lock_object); + turnstile_cancel(ts); while (mtx_owner(m) == owner && TD_IS_RUNNING(owner)) { cpu_spinwait(); } @@ -414,8 +416,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t /* * Block on the turnstile. */ - turnstile_wait(&m->lock_object, mtx_owner(m), - TS_EXCLUSIVE_QUEUE); + turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE); } #ifdef KTR if (cont_logged) { @@ -428,7 +429,25 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t waittime, (file), (line)); } +static void +_mtx_lock_spin_failed(struct mtx *m) +{ + struct thread *td; + + td = mtx_owner(m); + + /* If the mutex is unlocked, try again. */ + if (td == NULL) + return; #ifdef SMP + printf( "spin lock %p (%s) held by %p (tid %d) too long\n", + m, m->lock_object.lo_name, td, td->td_tid); +#ifdef WITNESS + witness_display_spinlock(&m->lock_object, td); +#endif + panic("spin lock held too long"); +} + /* * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. * @@ -440,7 +459,6 @@ _mtx_lock_spin(struct mtx *m, uintptr_t int line) { int i = 0, contested = 0; - struct thread *td; uint64_t waittime = 0; if (LOCK_LOG_TEST(&m->lock_object, opts)) @@ -458,20 +476,8 @@ _mtx_lock_spin(struct mtx *m, uintptr_t } if (i < 60000000 || kdb_active || panicstr != NULL) DELAY(1); - else { - td = mtx_owner(m); - - /* If the mutex is unlocked, try again. */ - if (td == NULL) - continue; - printf( - "spin lock %p (%s) held by %p (tid %d) too long\n", - m, m->lock_object.lo_name, td, td->td_tid); -#ifdef WITNESS - witness_display_spinlock(&m->lock_object, td); -#endif - panic("spin lock held too long"); - } + else + _mtx_lock_spin_failed(m); cpu_spinwait(); } spinlock_enter(); @@ -482,10 +488,87 @@ _mtx_lock_spin(struct mtx *m, uintptr_t lock_profile_obtain_lock_success(&m->lock_object, contested, waittime, (file), (line)); - } #endif /* SMP */ +void +_thread_lock_flags(struct thread *td, int opts, const char *file, int line) +{ + struct mtx *m; + uintptr_t tid; + int i; + + i = 0; + tid = (uintptr_t)curthread; + for (;;) { +retry: + spinlock_enter(); + m = __DEVOLATILE(struct mtx *, td->td_lock); + WITNESS_CHECKORDER(&m->lock_object, + opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line); + while (!_obtain_lock(m, tid)) { + if (m->mtx_lock == tid) { + m->mtx_recurse++; + break; + } + /* Give interrupts a chance while we spin. 
*/ + spinlock_exit(); + while (m->mtx_lock != MTX_UNOWNED) { + if (i++ < 10000000) + cpu_spinwait(); + else if (i < 60000000 || + kdb_active || panicstr != NULL) + DELAY(1); + else + _mtx_lock_spin_failed(m); + cpu_spinwait(); + if (m != td->td_lock) + goto retry; + } + spinlock_enter(); + } + if (m == td->td_lock) + break; + _rel_spin_lock(m); /* does spinlock_exit() */ + } + WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); +} + +struct mtx * +thread_lock_block(struct thread *td) +{ + struct mtx *lock; + + spinlock_enter(); + THREAD_LOCK_ASSERT(td, MA_OWNED); + lock = __DEVOLATILE(struct mtx *, td->td_lock); + td->td_lock = &blocked_lock; + mtx_unlock_spin(lock); + + return (lock); +} + +void +thread_lock_unblock(struct thread *td, struct mtx *new) +{ + mtx_assert(new, MA_OWNED); + MPASS(td->td_lock == &blocked_lock); + atomic_store_rel_ptr((void *)&td->td_lock, (uintptr_t)new); + spinlock_exit(); +} + +void +thread_lock_set(struct thread *td, struct mtx *new) +{ + struct mtx *lock; + + mtx_assert(new, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); + lock = __DEVOLATILE(struct mtx *, td->td_lock); + td->td_lock = new; + mtx_unlock_spin(lock); +} + /* * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * @@ -508,7 +591,11 @@ _mtx_unlock_sleep(struct mtx *m, int opt return; } - turnstile_lock(&m->lock_object); + /* + * We have to lock the chain before the turnstile so this turnstile + * can be removed from the hash list if it is empty. + */ + turnstile_chain_lock(&m->lock_object); ts = turnstile_lookup(&m->lock_object); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); @@ -518,7 +605,7 @@ _mtx_unlock_sleep(struct mtx *m, int opt _release_lock_quick(m); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m); - turnstile_release(&m->lock_object); + turnstile_chain_unlock(&m->lock_object); return; } #else @@ -543,7 +630,12 @@ _mtx_unlock_sleep(struct mtx *m, int opt m); } #endif + /* + * This turnstile is now no longer associated with the mutex. We can + * unlock the chain lock so a new turnstile may take it's place. + */ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); + turnstile_chain_unlock(&m->lock_object); #ifndef PREEMPTION /* @@ -557,7 +649,7 @@ _mtx_unlock_sleep(struct mtx *m, int opt if (td->td_critnest > 0 || td1->td_priority >= td->td_priority) return; - mtx_lock_spin(&sched_lock); + thread_lock(td1); if (!TD_IS_RUNNING(td1)) { #ifdef notyet if (td->td_ithd != NULL) { @@ -582,7 +674,7 @@ _mtx_unlock_sleep(struct mtx *m, int opt CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", m, (void *)m->mtx_lock); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td1); #endif } @@ -761,7 +853,10 @@ mutex_init(void) */ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); + mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN); + blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. 
*/ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE); mtx_init(&devmtx, "cdev", NULL, MTX_DEF); mtx_lock(&Giant); Index: kern/kern_poll.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_poll.c,v retrieving revision 1.28 diff -u -p -r1.28 kern_poll.c --- kern/kern_poll.c 6 Dec 2006 06:34:55 -0000 1.28 +++ kern/kern_poll.c 18 May 2007 10:37:01 -0000 @@ -580,17 +580,17 @@ poll_idle(void) rtp.prio = RTP_PRIO_MAX; /* lowest priority */ rtp.type = RTP_PRIO_IDLE; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(td->td_proc); rtp_to_pri(&rtp, td); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(td->td_proc); for (;;) { if (poll_in_idle_loop && poll_handlers > 0) { idlepoll_sleeping = 0; ether_poll(poll_each_burst); - mtx_lock_spin(&sched_lock); + thread_lock(td); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } else { idlepoll_sleeping = 1; tsleep(&idlepoll_sleeping, 0, "pollid", hz * 3); Index: kern/kern_proc.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_proc.c,v retrieving revision 1.248 diff -u -p -r1.248 kern_proc.c --- kern/kern_proc.c 1 Jun 2007 01:12:43 -0000 1.248 +++ kern/kern_proc.c 31 May 2007 20:40:17 -0000 @@ -177,6 +177,7 @@ proc_init(void *mem, int size, int flags td = thread_alloc(); bzero(&p->p_mtx, sizeof(struct mtx)); mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE); p->p_stats = pstats_alloc(); proc_linkup(p, td); sched_newproc(p, td); @@ -669,7 +670,7 @@ fill_kinfo_proc_only(struct proc *p, str kp->ki_sigcatch = ps->ps_sigcatch; mtx_unlock(&ps->ps_mtx); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_state != PRS_NEW && p->p_state != PRS_ZOMBIE && p->p_vmspace != NULL) { @@ -695,7 +696,7 @@ fill_kinfo_proc_only(struct proc *p, str kp->ki_nice = p->p_nice; rufetch(p, &kp->ki_rusage); kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if ((p->p_sflag & PS_INMEM) && p->p_stats != NULL) { kp->ki_start = p->p_stats->p_start; timevaladd(&kp->ki_start, &boottime); @@ -747,7 +748,7 @@ fill_kinfo_proc_only(struct proc *p, str /* * Fill in information that is thread specific. - * Must be called with sched_lock locked. + * Must be called with p_slock locked. 
*/ static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp) @@ -755,7 +756,9 @@ fill_kinfo_thread(struct thread *td, str struct proc *p; p = td->td_proc; + PROC_SLOCK_ASSERT(p, MA_OWNED); + thread_lock(td); if (td->td_wmesg != NULL) strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg)); else @@ -813,6 +816,7 @@ fill_kinfo_thread(struct thread *td, str SIGSETOR(kp->ki_siglist, td->td_siglist); kp->ki_sigmask = td->td_sigmask; + thread_unlock(td); } /* @@ -824,10 +828,10 @@ fill_kinfo_proc(struct proc *p, struct k { fill_kinfo_proc_only(p, kp); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } struct pstats * @@ -894,14 +898,14 @@ sysctl_out_proc(struct proc *p, struct s fill_kinfo_proc_only(p, &kinfo_proc); if (flags & KERN_PROC_NOTHREADS) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), &kinfo_proc); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) FOREACH_THREAD_IN_PROC(p, td) { fill_kinfo_thread(td, &kinfo_proc); @@ -913,7 +917,7 @@ sysctl_out_proc(struct proc *p, struct s else error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } PROC_UNLOCK(p); if (error) @@ -1003,12 +1007,12 @@ sysctl_kern_proc(SYSCTL_HANDLER_ARGS) /* * Skip embryonic processes. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_state == PRS_NEW) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); continue; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_LOCK(p); KASSERT(p->p_ucred != NULL, ("process credential is NULL for non-NEW proc")); Index: kern/kern_resource.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_resource.c,v retrieving revision 1.173 diff -u -p -r1.173 kern_resource.c --- kern/kern_resource.c 1 Jun 2007 01:20:11 -0000 1.173 +++ kern/kern_resource.c 1 Jun 2007 02:10:51 -0000 @@ -263,9 +263,9 @@ donice(struct thread *td, struct proc *p n = PRIO_MIN; if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0) return (EACCES); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); sched_nice(p, n); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (0); } @@ -306,7 +306,7 @@ rtprio_thread(struct thread *td, struct case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->lwpid == 0 || uap->lwpid == td->td_tid) td1 = td; else @@ -315,7 +315,7 @@ rtprio_thread(struct thread *td, struct pri_to_rtp(td1, &rtp); else error = ESRCH; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: @@ -342,7 +342,7 @@ rtprio_thread(struct thread *td, struct } } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->lwpid == 0 || uap->lwpid == td->td_tid) td1 = td; else @@ -351,7 +351,7 @@ rtprio_thread(struct thread *td, struct error = rtp_to_pri(&rtp, td1); else error = ESRCH; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); break; default: error = EINVAL; @@ -402,7 +402,7 @@ rtprio(td, uap) case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * Return OUR priority if no pid specified, * or if 
one is, report the highest priority @@ -430,7 +430,7 @@ rtprio(td, uap) } } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: @@ -468,7 +468,7 @@ rtprio(td, uap) * do all the threads on that process. If we * specify our own pid we do the latter. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->pid == 0) { error = rtp_to_pri(&rtp, td); } else { @@ -477,7 +477,7 @@ rtprio(td, uap) break; } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); break; default: error = EINVAL; @@ -492,9 +492,9 @@ rtp_to_pri(struct rtprio *rtp, struct th { u_char newpri; - mtx_assert(&sched_lock, MA_OWNED); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); + thread_lock(td); switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: newpri = PRI_MIN_REALTIME + rtp->prio; @@ -506,12 +506,14 @@ rtp_to_pri(struct rtprio *rtp, struct th newpri = PRI_MIN_IDLE + rtp->prio; break; default: + thread_unlock(td); return (EINVAL); } sched_class(td, rtp->type); /* XXX fix */ sched_user_prio(td, newpri); if (curthread == td) sched_prio(curthread, td->td_user_pri); /* XXX dubious */ + thread_unlock(td); return (0); } @@ -519,7 +521,7 @@ void pri_to_rtp(struct thread *td, struct rtprio *rtp) { - mtx_assert(&sched_lock, MA_OWNED); + thread_lock(td); switch (PRI_BASE(td->td_pri_class)) { case PRI_REALTIME: rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME; @@ -534,6 +536,7 @@ pri_to_rtp(struct thread *td, struct rtp break; } rtp->type = td->td_pri_class; + thread_unlock(td); } #if defined(COMPAT_43) @@ -634,10 +637,13 @@ lim_cb(void *arg) */ if (p->p_cpulimit == RLIM_INFINITY) return; - mtx_lock_spin(&sched_lock); - FOREACH_THREAD_IN_PROC(p, td) + PROC_SLOCK(p); + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); ruxagg(&p->p_rux, td); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + } + PROC_SUNLOCK(p); if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) { lim_rlimit(p, RLIMIT_CPU, &rlim); if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) { @@ -699,9 +705,9 @@ kern_setrlimit(td, which, limp) if (limp->rlim_cur != RLIM_INFINITY && p->p_cpulimit == RLIM_INFINITY) callout_reset(&p->p_limco, hz, lim_cb, p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_cpulimit = limp->rlim_cur; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) @@ -828,9 +834,7 @@ calcru(struct proc *p, struct timeval *u uint64_t u; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); - + PROC_SLOCK(p); /* * If we are getting stats for the current process, then add in the * stats that this thread has accumulated in its current time slice. 
@@ -843,9 +847,9 @@ calcru(struct proc *p, struct timeval *u p->p_rux.rux_runtime += u - PCPU_GET(switchtime); PCPU_SET(switchtime, u); } - /* Work on a copy of p_rux so we can let go of sched_lock */ + /* Work on a copy of p_rux so we can let go of p_slock */ rux = p->p_rux; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); calcru1(p, &rux, up, sp); /* Update the result from the p_rux copy */ p->p_rux.rux_uu = rux.rux_uu; @@ -1013,6 +1017,9 @@ ruadd(struct rusage *ru, struct rusage_e void ruxagg(struct rusage_ext *rux, struct thread *td) { + + THREAD_LOCK_ASSERT(td, MA_OWNED); + PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED); rux->rux_runtime += td->td_runtime; rux->rux_uticks += td->td_uticks; rux->rux_sticks += td->td_sticks; @@ -1033,17 +1040,19 @@ rufetch(struct proc *p, struct rusage *r struct thread *td; memset(ru, 0, sizeof(*ru)); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_ru == NULL) { KASSERT(p->p_numthreads > 0, ("rufetch: No threads or ru in proc %p", p)); FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); ruxagg(&p->p_rux, td); + thread_unlock(td); rucollect(ru, &td->td_ru); } } else *ru = *p->p_ru; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } /* Index: kern/kern_rwlock.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_rwlock.c,v retrieving revision 1.25 diff -u -p -r1.25 kern_rwlock.c --- kern/kern_rwlock.c 18 May 2007 15:04:59 -0000 1.25 +++ kern/kern_rwlock.c 20 May 2007 11:40:27 -0000 @@ -187,6 +187,7 @@ _rw_wunlock(struct rwlock *rw, const cha void _rw_rlock(struct rwlock *rw, const char *file, int line) { + struct turnstile *ts; #ifdef ADAPTIVE_RWLOCKS volatile struct thread *owner; #endif @@ -256,7 +257,7 @@ _rw_rlock(struct rwlock *rw, const char * has a write lock, so acquire the turnstile lock so we can * begin the process of blocking. */ - turnstile_lock(&rw->lock_object); + ts = turnstile_trywait(&rw->lock_object); /* * The lock might have been released while we spun, so @@ -265,7 +266,7 @@ _rw_rlock(struct rwlock *rw, const char */ x = rw->rw_lock; if (x & RW_LOCK_READ) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -279,7 +280,7 @@ _rw_rlock(struct rwlock *rw, const char if (!(x & RW_LOCK_READ_WAITERS)) { if (!atomic_cmpset_ptr(&rw->rw_lock, x, x | RW_LOCK_READ_WAITERS)) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -296,7 +297,7 @@ _rw_rlock(struct rwlock *rw, const char */ owner = (struct thread *)RW_OWNER(x); if (TD_IS_RUNNING(owner)) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); @@ -314,7 +315,7 @@ _rw_rlock(struct rwlock *rw, const char if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); - turnstile_wait(&rw->lock_object, rw_owner(rw), TS_SHARED_QUEUE); + turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); @@ -407,7 +408,7 @@ _rw_runlock(struct rwlock *rw, const cha * Ok, we know we have a waiting writer and we think we * are the last reader, so grab the turnstile lock. 
*/ - turnstile_lock(&rw->lock_object); + turnstile_chain_lock(&rw->lock_object); /* * Try to drop our lock leaving the lock in a unlocked @@ -427,7 +428,7 @@ _rw_runlock(struct rwlock *rw, const cha */ if (!atomic_cmpset_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | RW_LOCK_WRITE_WAITERS, RW_UNLOCKED)) { - turnstile_release(&rw->lock_object); + turnstile_chain_unlock(&rw->lock_object); continue; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) @@ -445,6 +446,7 @@ _rw_runlock(struct rwlock *rw, const cha MPASS(ts != NULL); turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE); turnstile_unpend(ts, TS_SHARED_LOCK); + turnstile_chain_unlock(&rw->lock_object); break; } lock_profile_release_lock(&rw->lock_object); @@ -458,6 +460,7 @@ _rw_runlock(struct rwlock *rw, const cha void _rw_wlock_hard(struct rwlock *rw, uintptr_t tid, const char *file, int line) { + struct turnstile *ts; #ifdef ADAPTIVE_RWLOCKS volatile struct thread *owner; #endif @@ -468,7 +471,7 @@ _rw_wlock_hard(struct rwlock *rw, uintpt rw->lock_object.lo_name, (void *)rw->rw_lock, file, line); while (!_rw_write_lock(rw, tid)) { - turnstile_lock(&rw->lock_object); + ts = turnstile_trywait(&rw->lock_object); v = rw->rw_lock; /* @@ -476,7 +479,7 @@ _rw_wlock_hard(struct rwlock *rw, uintpt * turnstile chain lock, try again. */ if (v == RW_UNLOCKED) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -495,12 +498,12 @@ _rw_wlock_hard(struct rwlock *rw, uintpt if (atomic_cmpset_acq_ptr(&rw->rw_lock, RW_UNLOCKED | RW_LOCK_WRITE_WAITERS, tid | RW_LOCK_WRITE_WAITERS)) { - turnstile_claim(&rw->lock_object); + turnstile_claim(ts); CTR2(KTR_LOCK, "%s: %p claimed by new writer", __func__, rw); break; } - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -513,7 +516,7 @@ _rw_wlock_hard(struct rwlock *rw, uintpt if (!(v & RW_LOCK_WRITE_WAITERS)) { if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_WRITE_WAITERS)) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -530,7 +533,7 @@ _rw_wlock_hard(struct rwlock *rw, uintpt */ owner = (struct thread *)RW_OWNER(v); if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); @@ -548,8 +551,7 @@ _rw_wlock_hard(struct rwlock *rw, uintpt if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); - turnstile_wait(&rw->lock_object, rw_owner(rw), - TS_EXCLUSIVE_QUEUE); + turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); @@ -574,7 +576,7 @@ _rw_wunlock_hard(struct rwlock *rw, uint if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, rw); - turnstile_lock(&rw->lock_object); + turnstile_chain_lock(&rw->lock_object); ts = turnstile_lookup(&rw->lock_object); #ifdef ADAPTIVE_RWLOCKS @@ -587,7 +589,7 @@ _rw_wunlock_hard(struct rwlock *rw, uint atomic_store_rel_ptr(&rw->rw_lock, RW_UNLOCKED); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p no sleepers", __func__, rw); - turnstile_release(&rw->lock_object); + turnstile_chain_unlock(&rw->lock_object); return; } #else @@ -640,6 +642,7 @@ _rw_wunlock_hard(struct rwlock *rw, uint CTR2(KTR_LOCK, "%s: %p no sleepers 2", __func__, rw); atomic_store_rel_ptr(&rw->rw_lock, v); turnstile_disown(ts); + 
turnstile_chain_unlock(&rw->lock_object); return; } #endif @@ -651,6 +654,7 @@ _rw_wunlock_hard(struct rwlock *rw, uint turnstile_broadcast(ts, queue); atomic_store_rel_ptr(&rw->rw_lock, v); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); + turnstile_chain_unlock(&rw->lock_object); } /* @@ -662,6 +666,7 @@ int _rw_try_upgrade(struct rwlock *rw, const char *file, int line) { uintptr_t v, tid; + struct turnstile *ts; int success; KASSERT(rw->rw_lock != RW_DESTROYED, @@ -686,7 +691,7 @@ _rw_try_upgrade(struct rwlock *rw, const * Ok, we think we have write waiters, so lock the * turnstile. */ - turnstile_lock(&rw->lock_object); + ts = turnstile_trywait(&rw->lock_object); /* * Try to switch from one reader to a writer again. This time @@ -705,9 +710,9 @@ _rw_try_upgrade(struct rwlock *rw, const #else if (success && v) #endif - turnstile_claim(&rw->lock_object); + turnstile_claim(ts); else - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); out: LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line); if (success) @@ -745,7 +750,7 @@ _rw_downgrade(struct rwlock *rw, const c * Ok, we think we have waiters, so lock the turnstile so we can * read the waiter flags without any races. */ - turnstile_lock(&rw->lock_object); + turnstile_chain_lock(&rw->lock_object); v = rw->rw_lock; MPASS(v & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)); @@ -779,12 +784,9 @@ _rw_downgrade(struct rwlock *rw, const c (v & RW_LOCK_WRITE_WAITERS)); if (v & RW_LOCK_READ_WAITERS) turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); -#ifdef ADAPTIVE_RWLOCKS - else if (ts == NULL) - turnstile_release(&rw->lock_object); -#endif - else + else if (ts) turnstile_disown(ts); + turnstile_chain_unlock(&rw->lock_object); out: LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line); } Index: kern/kern_shutdown.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_shutdown.c,v retrieving revision 1.181 diff -u -p -r1.181 kern_shutdown.c --- kern/kern_shutdown.c 4 Mar 2007 22:36:46 -0000 1.181 +++ kern/kern_shutdown.c 18 May 2007 10:37:02 -0000 @@ -267,9 +267,9 @@ boot(int howto) * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, 0); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0")); #endif /* We're in the process of rebooting. */ @@ -340,9 +340,9 @@ boot(int howto) */ DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); DELAY(1000); } PICKUP_GIANT(); @@ -555,9 +555,9 @@ panic(const char *fmt, ...) 
} #endif #endif - mtx_lock_spin(&sched_lock); + /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; - mtx_unlock_spin(&sched_lock); + /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; boot(bootopt); Index: kern/kern_sig.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_sig.c,v retrieving revision 1.345 diff -u -p -r1.345 kern_sig.c --- kern/kern_sig.c 1 Jun 2007 01:12:43 -0000 1.345 +++ kern/kern_sig.c 31 May 2007 20:40:17 -0000 @@ -511,10 +511,10 @@ sigqueue_delete_set_proc(struct proc *p, sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); sigqueue_flush(&worklist); } @@ -552,7 +552,7 @@ cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); - mtx_assert(&sched_lock, MA_NOTOWNED); + THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } @@ -588,9 +588,9 @@ signotify(struct thread *td) if (! SIGISEMPTY(set)) sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set); if (SIGPENDING(td)) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } #ifdef KSE if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { @@ -758,7 +758,9 @@ kern_sigaction(td, sig, act, oact, flags } #endif /* never to be seen again */ + PROC_SLOCK(p); sigqueue_delete_proc(p, sig); + PROC_SUNLOCK(p); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); @@ -954,7 +956,9 @@ execsigs(struct proc *p) if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); + PROC_SLOCK(p); sigqueue_delete_proc(p, sig); + PROC_SUNLOCK(p); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } @@ -1849,7 +1853,7 @@ trapsignal(struct thread *td, ksiginfo_t thread_user_enter(td); PROC_LOCK(p); SIGDELSET(td->td_sigmask, sig); - mtx_lock_spin(&sched_lock); + thread_lock(td); /* * Force scheduling an upcall, so UTS has chance to * process the signal before thread runs again in @@ -1857,7 +1861,7 @@ trapsignal(struct thread *td, ksiginfo_t */ if (td->td_upcall) td->td_upcall->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } else { PROC_LOCK(p); } @@ -1952,7 +1956,7 @@ sigtd(struct proc *p, int sig, int prop) if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; @@ -1961,7 +1965,7 @@ sigtd(struct proc *p, int sig, int prop) } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (signal_td); } @@ -2128,7 +2132,9 @@ do_tdsignal(struct proc *p, struct threa ksiginfo_tryfree(ksi); return (ret); } + PROC_SLOCK(p); sigqueue_delete_proc(p, SIGCONT); + PROC_SUNLOCK(p); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); @@ -2166,6 +2172,7 @@ do_tdsignal(struct proc *p, struct threa * waking up threads so that they can cross the user boundary. * We try do the per-process part here. */ + PROC_SLOCK(p); if (P_SHOULDSTOP(p)) { /* * The process is in stopped mode. 
All the threads should be @@ -2177,6 +2184,7 @@ do_tdsignal(struct proc *p, struct threa * so no further action is necessary. * No signal can restart us. */ + PROC_SUNLOCK(p); goto out; } @@ -2203,15 +2211,21 @@ do_tdsignal(struct proc *p, struct threa */ p->p_flag &= ~P_STOPPED_SIG; if (p->p_numthreads == p->p_suspcount) { + PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xstat = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); + PROC_SLOCK(p); } if (action == SIG_DFL) { + thread_unsuspend(p); + PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); - } else if (action == SIG_CATCH) { + goto out; + } + if (action == SIG_CATCH) { #ifdef KSE /* * The process wants to catch it so it needs @@ -2223,20 +2237,18 @@ do_tdsignal(struct proc *p, struct threa * single thread is runnable asap. * XXXKSE for now however, make them all run. */ -#else +#endif /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ -#endif goto runfast; } /* * The signal is not ignored or caught. */ - mtx_lock_spin(&sched_lock); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); goto out; } @@ -2246,6 +2258,7 @@ do_tdsignal(struct proc *p, struct threa * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ + PROC_SUNLOCK(p); p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; @@ -2259,10 +2272,11 @@ do_tdsignal(struct proc *p, struct threa * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) sleepq_abort(td, intrval); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. Threads waiting on them will @@ -2270,9 +2284,10 @@ do_tdsignal(struct proc *p, struct threa */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { - mtx_lock_spin(&sched_lock); + thread_lock(td); tdsigwakeup(td, sig, action, intrval); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); goto out; } @@ -2283,7 +2298,6 @@ do_tdsignal(struct proc *p, struct threa goto out; p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; - mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* @@ -2294,10 +2308,10 @@ do_tdsignal(struct proc *p, struct threa * should never be equal to p_suspcount. */ thread_stopped(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xstat); } else - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); goto out; } else @@ -2305,6 +2319,7 @@ do_tdsignal(struct proc *p, struct threa /* NOTREACHED */ } else { /* Not in "NORMAL" state. discard the signal. */ + PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } @@ -2315,13 +2330,14 @@ do_tdsignal(struct proc *p, struct threa */ runfast: - mtx_lock_spin(&sched_lock); + thread_lock(td); tdsigwakeup(td, sig, action, intrval); + thread_unlock(td); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); out: - /* If we jump here, sched_lock should not be owned. */ - mtx_assert(&sched_lock, MA_NOTOWNED); + /* If we jump here, proc slock should not be owned. 
*/ + PROC_SLOCK_ASSERT(p, MA_NOTOWNED); return (ret); } @@ -2337,7 +2353,8 @@ tdsigwakeup(struct thread *td, int sig, register int prop; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); prop = sigprop(sig); /* @@ -2366,14 +2383,16 @@ tdsigwakeup(struct thread *td, int sig, * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); + thread_lock(td); return; } @@ -2403,9 +2422,10 @@ sig_suspend_threads(struct thread *td, s struct thread *td2; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td2) { + thread_lock(td2); if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR) && !TD_IS_SUSPENDED(td2)) { @@ -2418,6 +2438,7 @@ sig_suspend_threads(struct thread *td, s forward_signal(td2); #endif } + thread_unlock(td2); } } @@ -2430,15 +2451,17 @@ ptracestop(struct thread *td, int sig) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_XSIG; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_xsig = sig; + PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) { if (p->p_flag & P_SINGLE_EXIT) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_XSIG; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); return (sig); } /* @@ -2448,26 +2471,19 @@ ptracestop(struct thread *td, int sig) p->p_xstat = sig; p->p_xthread = td; p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE); - mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 0); stopme: - thread_stopped(p); - thread_suspend_one(td); - PROC_UNLOCK(p); - DROP_GIANT(); - mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); - PICKUP_GIANT(); - PROC_LOCK(p); - if (!(p->p_flag & P_TRACED)) + thread_suspend_switch(td); + if (!(p->p_flag & P_TRACED)) { break; + } if (td->td_flags & TDF_DBSUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; - mtx_lock_spin(&sched_lock); goto stopme; } } + PROC_SUNLOCK(p); return (td->td_xsig); } @@ -2621,16 +2637,10 @@ issignal(td) &p->p_mtx.lock_object, "Catching SIGSTOP"); p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); sig_suspend_threads(td, p, 0); - thread_stopped(p); - thread_suspend_one(td); - PROC_UNLOCK(p); - DROP_GIANT(); - mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); - PICKUP_GIANT(); - PROC_LOCK(p); + thread_suspend_switch(td); + PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); break; } else if (prop & SA_IGNORE) { @@ -2672,18 +2682,18 @@ thread_stopped(struct proc *p) int n; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? 
CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); } } Index: kern/kern_subr.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_subr.c,v retrieving revision 1.102 diff -u -p -r1.102 kern_subr.c --- kern/kern_subr.c 16 Jan 2007 11:40:55 -0000 1.102 +++ kern/kern_subr.c 18 May 2007 10:37:02 -0000 @@ -453,11 +453,11 @@ uio_yield(void) struct thread *td; td = curthread; - mtx_lock_spin(&sched_lock); DROP_GIANT(); + thread_lock(td); sched_prio(td, td->td_user_pri); mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PICKUP_GIANT(); } Index: kern/kern_switch.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_switch.c,v retrieving revision 1.129 diff -u -p -r1.129 kern_switch.c --- kern/kern_switch.c 8 Feb 2007 01:52:25 -0000 1.129 +++ kern/kern_switch.c 31 May 2007 21:08:40 -0000 @@ -49,6 +49,8 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_sw #include #endif +#include + /* Uncomment this to enable logging of critical_enter/exit. */ #if 0 #define KTR_CRITICAL KTR_SCHED @@ -77,6 +79,49 @@ static int kern_sched_preemption = 0; SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD, &kern_sched_preemption, 0, "Kernel preemption enabled"); +#ifdef SCHED_STATS +long switch_preempt; +long switch_owepreempt; +long switch_turnstile; +long switch_sleepq; +long switch_sleepqtimo; +long switch_relinquish; +long switch_needresched; +static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats"); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, ""); +static int +sysctl_stats_reset(SYSCTL_HANDLER_ARGS) +{ + int error; + int val; + + val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (val == 0) + return (0); + switch_preempt = 0; + switch_owepreempt = 0; + switch_turnstile = 0; + switch_sleepq = 0; + switch_sleepqtimo = 0; + switch_relinquish = 0; + switch_needresched = 0; + + return (0); +} + +SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL, + 0, sysctl_stats_reset, "I", "Reset scheduler statistics"); +#endif + /************************************************************************ * Functions that manipulate runnability from a thread perspective. 
* ************************************************************************/ @@ -142,13 +187,13 @@ critical_exit(void) #ifdef PREEMPTION if (td->td_critnest == 1) { td->td_critnest = 0; - mtx_assert(&sched_lock, MA_NOTOWNED); if (td->td_owepreempt) { td->td_critnest = 1; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_critnest--; + SCHED_STAT_INC(switch_owepreempt); mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } else #endif @@ -173,7 +218,6 @@ maybe_preempt(struct thread *td) int cpri, pri; #endif - mtx_assert(&sched_lock, MA_OWNED); #ifdef PREEMPTION /* * The new thread should not preempt the current thread if any of the @@ -199,6 +243,7 @@ maybe_preempt(struct thread *td) * to the new thread. */ ctd = curthread; + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT ((ctd->td_sched != NULL && ctd->td_sched->ts_thread == ctd), ("thread has no (or wrong) sched-private part.")); KASSERT((td->td_inhibitors == 0), @@ -219,15 +264,25 @@ maybe_preempt(struct thread *td) ctd->td_owepreempt = 1; return (0); } - /* * Thread is runnable but not yet put on system run queue. */ + MPASS(ctd->td_lock == &sched_lock); + MPASS(td->td_lock == &sched_lock); MPASS(TD_ON_RUNQ(td)); TD_SET_RUNNING(td); CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, td->td_proc->p_pid, td->td_proc->p_comm); + SCHED_STAT_INC(switch_preempt); mi_switch(SW_INVOL|SW_PREEMPT, td); + /* + * td's lock pointer may have changed. We have to return with it + * locked. + */ + spinlock_enter(); + thread_unlock(ctd); + thread_lock(td); + spinlock_exit(); return (1); #else return (0); @@ -442,7 +497,6 @@ runq_choose(struct runq *rq) struct td_sched *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; #if defined(SMP) && defined(SCHED_4BSD) @@ -484,7 +538,6 @@ runq_choose_from(struct runq *rq, u_char struct td_sched *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); if ((pri = runq_findbit_from(rq, idx)) != -1) { rqh = &rq->rq_queues[pri]; ts = TAILQ_FIRST(rqh); @@ -519,9 +572,20 @@ runq_remove_idx(struct runq *rq, struct KASSERT(ts->ts_thread->td_proc->p_sflag & PS_INMEM, ("runq_remove_idx: process swapped out")); pri = ts->ts_rqindex; + KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri)); rqh = &rq->rq_queues[pri]; CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p", ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh); + { + struct td_sched *nts; + + TAILQ_FOREACH(nts, rqh, ts_procq) + if (nts == ts) + break; + if (ts != nts) + panic("runq_remove_idx: ts %p not on rqindex %d", + ts, pri); + } TAILQ_REMOVE(rqh, ts, ts_procq); if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_remove_idx: empty"); @@ -589,18 +653,4 @@ sched_set_concurrency(struct proc *p, in { } -/* - * Called from thread_exit() for all exiting thread - * - * Not to be confused with sched_exit_thread() - * that is only called from thread_exit() for threads exiting - * without the rest of the process exiting because it is also called from - * sched_exit() and we wouldn't want to call it twice. - * XXX This can probably be fixed. 
- */ -void -sched_thread_exit(struct thread *td) -{ -} - #endif /* KERN_SWITCH_INCLUDE */ Index: kern/kern_synch.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_synch.c,v retrieving revision 1.298 diff -u -p -r1.298 kern_synch.c --- kern/kern_synch.c 1 Jun 2007 01:20:11 -0000 1.298 +++ kern/kern_synch.c 31 May 2007 21:03:27 -0000 @@ -213,9 +213,9 @@ _sleep(ident, lock, priority, wmesg, tim */ pri = priority & PRIMASK; if (pri != 0 && pri != td->td_priority) { - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } if (timo && catch) @@ -362,6 +362,7 @@ wakeup_one(ident) sleepq_lock(ident); sleepq_signal(ident, SLEEPQ_SLEEP, -1, 0); + sleepq_release(ident); } /* @@ -374,8 +375,8 @@ mi_switch(int flags, struct thread *newt struct thread *td; struct proc *p; - mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); td = curthread; /* XXX */ + THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS @@ -394,12 +395,15 @@ mi_switch(int flags, struct thread *newt * Don't perform context switches from the debugger. */ if (kdb_active) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } + /* + * XXX Need proc lock for stats! + */ if (flags & SW_VOL) td->td_ru.ru_nvcsw++; else @@ -466,7 +470,7 @@ setrunnable(struct thread *td) struct proc *p; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); switch (p->p_state) { case PRS_ZOMBIE: panic("setrunnable(1)"); @@ -495,7 +499,7 @@ setrunnable(struct thread *td) if ((p->p_sflag & PS_SWAPPINGIN) == 0) { p->p_sflag |= PS_SWAPINREQ; /* - * due to a LOR between sched_lock and + * due to a LOR between the thread lock and * the sleepqueue chain locks, use * lower level scheduling functions. */ Index: kern/kern_thr.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_thr.c,v retrieving revision 1.59 diff -u -p -r1.59 kern_thr.c --- kern/kern_thr.c 23 Jan 2007 08:46:50 -0000 1.59 +++ kern/kern_thr.c 18 May 2007 10:37:02 -0000 @@ -226,12 +226,15 @@ create_thread(struct thread *td, mcontex PROC_LOCK(td->td_proc); td->td_proc->p_flag |= P_HADTHREADS; newtd->td_sigmask = td->td_sigmask; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_link(newtd, p); - PROC_UNLOCK(p); - + thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); + thread_unlock(td); + PROC_SUNLOCK(p); + PROC_UNLOCK(p); + thread_lock(newtd); if (rtp != NULL) { if (!(td->td_pri_class == PRI_TIMESHARE && rtp->type == RTP_PRIO_NORMAL)) { @@ -242,7 +245,7 @@ create_thread(struct thread *td, mcontex TD_SET_CAN_RUN(newtd); /* if ((flags & THR_SUSPENDED) == 0) */ sched_add(newtd, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(newtd); return (error); } @@ -275,7 +278,7 @@ thr_exit(struct thread *td, struct thr_e PROC_LOCK(p); sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * Shutting down last thread in the proc. 
This will actually @@ -286,7 +289,7 @@ thr_exit(struct thread *td, struct thr_e thread_exit(); /* NOTREACHED */ } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (0); } @@ -379,9 +382,9 @@ kern_thr_suspend(struct thread *td, stru error = msleep((void *)td, &td->td_proc->p_mtx, PCATCH, "lthr", hz); if (td->td_flags & TDF_THRWAKEUP) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_THRWAKEUP; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_UNLOCK(td->td_proc); return (0); } @@ -414,9 +417,9 @@ thr_wake(struct thread *td, struct thr_w PROC_UNLOCK(p); return (ESRCH); } - mtx_lock_spin(&sched_lock); + thread_lock(ttd); ttd->td_flags |= TDF_THRWAKEUP; - mtx_unlock_spin(&sched_lock); + thread_unlock(ttd); wakeup((void *)ttd); PROC_UNLOCK(p); return (0); Index: kern/kern_thread.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_thread.c,v retrieving revision 1.245 diff -u -p -r1.245 kern_thread.c --- kern/kern_thread.c 1 Jun 2007 01:12:43 -0000 1.245 +++ kern/kern_thread.c 1 Jun 2007 02:09:25 -0000 @@ -70,8 +70,8 @@ int virtual_cpu; #endif TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); -struct mtx kse_zombie_lock; -MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN); +struct mtx zombie_lock; +MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); #ifdef KSE static int @@ -121,14 +121,7 @@ thread_ctor(void *mem, int size, void *a /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving - * end of a context switch. A context switch must occur inside a - * critical section, and in fact, includes hand-off of the sched_lock. - * After a context switch to a newly created thread, it will release - * sched_lock for the first time, and its td_critnest will hit 0 for - * the first time. This happens on the far end of a context switch, - * and when it context switches away from itself, it will in fact go - * back into a critical section, and hand off the sched lock to the - * next thread. + * end of a context switch. */ td->td_critnest = 1; @@ -222,6 +215,7 @@ thread_fini(void *mem, int size) void proc_linkup(struct proc *p, struct thread *td) { + TAILQ_INIT(&p->p_threads); /* all threads in proc */ TAILQ_INIT(&p->p_upcalls); /* upcall list */ sigqueue_init(&p->p_sigqueue, p); @@ -260,9 +254,9 @@ threadinit(void) void thread_stash(struct thread *td) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&zombie_lock); } /* @@ -278,11 +272,11 @@ thread_reap(void) * we really don't care about the next instant.. 
*/ if (!TAILQ_EMPTY(&zombie_threads)) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); if (td_first->td_ucred) @@ -358,8 +352,9 @@ thread_exit(void) td = curthread; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); + PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, @@ -390,28 +385,13 @@ thread_exit(void) */ cpu_thread_exit(td); /* XXXSMP */ -#ifdef KSE - /* - * The thread is exiting. scheduler can release its stuff - * and collect stats etc. - * XXX this is not very right, since PROC_UNLOCK may still - * need scheduler stuff. - */ - sched_thread_exit(td); -#endif - /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime)); PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); cnt.v_swtch++; - /* - * Aggregate this thread's tick stats in the parent so they are not - * lost. Also add the child usage to our own when the final thread - * exits. - */ - ruxagg(&p->p_rux, td); + /* Add the child usage to our own when the final thread exits. */ if (p->p_numthreads == 1) ruadd(p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); /* @@ -424,7 +404,13 @@ thread_exit(void) */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { + thread_lock(td); +#ifdef KSE + kse_unlink(td); +#else thread_unlink(td); +#endif + thread_unlock(td); /* Impart our resource usage on another thread */ td2 = FIRST_THREAD_IN_PROC(p); rucollect(&td2->td_ru, &td->td_ru); @@ -437,7 +423,9 @@ thread_exit(void) */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { + thread_lock(p->p_singlethread); thread_unsuspend_one(p->p_singlethread); + thread_unlock(p->p_singlethread); } } @@ -454,8 +442,6 @@ thread_exit(void) */ upcall_remove(td); #endif - - PROC_UNLOCK(p); PCPU_SET(deadthread, td); } else { /* @@ -473,17 +459,15 @@ thread_exit(void) */ panic ("thread_exit: Last thread exiting on its own"); } - } else { - /* - * non threaded process comes here. - * This includes an EX threaded process that is coming - * here via exit1(). (exit1 dethreads the proc first). - */ - PROC_UNLOCK(p); - } + } + PROC_UNLOCK(p); + thread_lock(td); + /* Aggregate our tick statistics into our parents rux. */ + ruxagg(&p->p_rux, td); + PROC_SUNLOCK(p); td->td_state = TDS_INACTIVE; CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); - cpu_throw(td, choosethread()); + sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } @@ -532,6 +516,11 @@ void thread_link(struct thread *td, struct proc *p) { + /* + * XXX This can't be enabled because it's called for proc0 before + * it's spinlock has been created. 
+ * PROC_SLOCK_ASSERT(p, MA_OWNED); + */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = 0; @@ -579,7 +568,7 @@ thread_unlink(struct thread *td) { struct proc *p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ @@ -631,7 +620,7 @@ thread_single(int mode) p->p_flag &= ~P_SINGLE_BOUNDARY; } p->p_flag |= P_STOPPED_SINGLE; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_singlethread = td; if (mode == SINGLE_EXIT) remaining = p->p_numthreads; @@ -645,6 +634,7 @@ thread_single(int mode) FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; + thread_lock(td2); td2->td_flags |= TDF_ASTPENDING; if (TD_IS_INHIBITED(td2)) { switch (mode) { @@ -666,8 +656,10 @@ thread_single(int mode) sleepq_abort(td2, ERESTART); break; default: - if (TD_IS_SUSPENDED(td2)) + if (TD_IS_SUSPENDED(td2)) { + thread_unlock(td2); continue; + } /* * maybe other inhibited states too? */ @@ -683,6 +675,7 @@ thread_single(int mode) forward_signal(td2); } #endif + thread_unlock(td2); } if (mode == SINGLE_EXIT) remaining = p->p_numthreads; @@ -702,13 +695,7 @@ stopme: * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ - thread_stopped(p); - thread_suspend_one(td); - PROC_UNLOCK(p); - mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); - PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + thread_suspend_switch(td); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) @@ -727,7 +714,7 @@ stopme: p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT); thread_unthread(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (0); } @@ -800,7 +787,7 @@ thread_suspend_check(int return_instead) if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_stopped(p); /* * If the process is waiting for us to exit, @@ -809,7 +796,15 @@ thread_suspend_check(int return_instead) */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) thread_exit(); - + if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { + if (p->p_numthreads == p->p_suspcount + 1) { + thread_lock(p->p_singlethread); + thread_unsuspend_one(p->p_singlethread); + thread_unlock(p->p_singlethread); + } + } + PROC_UNLOCK(p); + thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. @@ -819,29 +814,52 @@ thread_suspend_check(int return_instead) p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } - if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { - if (p->p_numthreads == p->p_suspcount) - thread_unsuspend_one(p->p_singlethread); - } - PROC_UNLOCK(p); + PROC_SUNLOCK(p); mi_switch(SW_INVOL, NULL); - if (return_instead == 0) { - p->p_boundary_count--; + if (return_instead == 0) td->td_flags &= ~TDF_BOUNDARY; - } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_LOCK(p); + if (return_instead == 0) + p->p_boundary_count--; } return (0); } void +thread_suspend_switch(struct thread *td) +{ + struct proc *p; + + p = td->td_proc; + KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); + PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + /* + * We implement thread_suspend_one in stages here to avoid + * dropping the proc lock while the thread lock is owned. 
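/*
 * A single-threaded trace of the lock staging described for
 * thread_suspend_switch(): the proc (sleep) lock is dropped before the
 * per-thread spin lock is taken, and re-taken only after the thread lock
 * has been released again.  The routines below are printf/assert stubs that
 * only record what is held; they are not the kernel primitives, and Giant
 * and mi_switch() are elided.
 */
#include <assert.h>
#include <stdio.h>

static int proc_held, proc_sheld, td_held;

static void xproc_lock(void)    { assert(!td_held); proc_held = 1; puts("PROC_LOCK"); }
static void xproc_unlock(void)  { assert(!td_held); proc_held = 0; puts("PROC_UNLOCK"); }
static void xproc_slock(void)   { proc_sheld = 1; puts("PROC_SLOCK"); }
static void xproc_sunlock(void) { proc_sheld = 0; puts("PROC_SUNLOCK"); }
static void xthread_lock(void)  { td_held = 1; puts("thread_lock"); }
static void xthread_unlock(void){ td_held = 0; puts("thread_unlock"); }

int
main(void)
{
	xproc_lock();		/* thread_suspend_switch() is entered with */
	xproc_slock();		/* both proc locks held (see its asserts) */
	/* thread_stopped(p); p->p_suspcount++; */
	xproc_unlock();		/* drop the sleep lock first... */
	xthread_lock();		/* ...then take the per-thread spin lock */
	/* TD_SET_SUSPENDED(td); */
	xproc_sunlock();
	/* DROP_GIANT(); mi_switch(SW_VOL, NULL); */
	xthread_unlock();
	/* PICKUP_GIANT(); */
	xproc_lock();		/* legal again: no spin lock is held */
	xproc_slock();
	return (0);
}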
+ */ + thread_stopped(p); + p->p_suspcount++; + PROC_UNLOCK(p); + thread_lock(td); + TD_SET_SUSPENDED(td); + PROC_SUNLOCK(p); + DROP_GIANT(); + mi_switch(SW_VOL, NULL); + thread_unlock(td); + PICKUP_GIANT(); + PROC_LOCK(p); + PROC_SLOCK(p); +} + +void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); - PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; TD_SET_SUSPENDED(td); @@ -852,8 +870,8 @@ thread_unsuspend_one(struct thread *td) { struct proc *p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); - PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); p->p_suspcount--; @@ -868,13 +886,15 @@ thread_unsuspend(struct proc *p) { struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (TD_IS_SUSPENDED(td)) { thread_unsuspend_one(td); } + thread_unlock(td); } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { @@ -883,7 +903,9 @@ thread_unsuspend(struct proc *p) * threading request. Now we've downgraded to single-threaded, * let it continue. */ + thread_lock(p->p_singlethread); thread_unsuspend_one(p->p_singlethread); + thread_unlock(p->p_singlethread); } } @@ -900,7 +922,7 @@ thread_single_end(void) p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_singlethread = NULL; /* * If there are other threads they mey now run, @@ -910,12 +932,14 @@ thread_single_end(void) */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (TD_IS_SUSPENDED(td)) { thread_unsuspend_one(td); } + thread_unlock(td); } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } struct thread * @@ -924,11 +948,11 @@ thread_find(struct proc *p, lwpid_t tid) struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (td); } Index: kern/kern_time.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_time.c,v retrieving revision 1.140 diff -u -p -r1.140 kern_time.c --- kern/kern_time.c 22 Apr 2007 15:31:21 -0000 1.140 +++ kern/kern_time.c 31 May 2007 21:04:07 -0000 @@ -552,9 +552,9 @@ kern_getitimer(struct thread *td, u_int timevalsub(&aitv->it_value, &ctv); } } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); *aitv = p->p_stats->p_timer[which]; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } return (0); } @@ -623,10 +623,10 @@ kern_setitimer(struct thread *td, u_int timevalsub(&oitv->it_value, &ctv); } } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); *oitv = p->p_stats->p_timer[which]; p->p_stats->p_timer[which] = *aitv; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } return (0); } Index: kern/kern_umtx.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_umtx.c,v retrieving revision 1.59 diff -u -p -r1.59 kern_umtx.c --- kern/kern_umtx.c 5 Mar 2007 13:10:57 -0000 1.59 +++ 
kern/kern_umtx.c 18 May 2007 10:37:02 -0000 @@ -124,8 +124,8 @@ struct umtx_q { /* * Blocked on PI mutex. read can use chain lock - * or sched_lock, write must have both chain lock and - * sched_lock being hold. + * or umtx_lock, write must have both chain lock and + * umtx_lock being hold. */ struct umtx_pi *uq_pi_blocked; @@ -225,6 +225,8 @@ static void umtx_exec_hook(void *arg __u struct image_params *imgp __unused); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); +static struct mtx umtx_lock; + static void umtxq_sysinit(void *arg __unused) { @@ -240,6 +242,7 @@ umtxq_sysinit(void *arg __unused) umtxq_chains[i].uc_busy = 0; umtxq_chains[i].uc_waiters = 0; } + mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN); EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL, EVENTHANDLER_PRI_ANY); } @@ -1270,7 +1273,7 @@ umtx_pi_adjust_thread(struct umtx_pi *pi struct umtx_q *uq, *uq1, *uq2; struct thread *td1; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); @@ -1316,7 +1319,7 @@ umtx_propagate_priority(struct thread *t struct umtx_pi *pi; int pri; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; @@ -1334,7 +1337,9 @@ umtx_propagate_priority(struct thread *t if (UPRI(td) <= pri) return; + thread_lock(td); sched_lend_user_prio(td, pri); + thread_unlock(td); /* * Pick up the lock that td is blocked on. @@ -1358,7 +1363,7 @@ umtx_unpropagate_priority(struct umtx_pi struct umtx_pi *pi2; int pri; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; @@ -1374,7 +1379,9 @@ umtx_unpropagate_priority(struct umtx_pi if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; + thread_lock(pi->pi_owner); sched_unlend_user_prio(pi->pi_owner, pri); + thread_unlock(pi->pi_owner); pi = uq_owner->uq_pi_blocked; } } @@ -1388,7 +1395,7 @@ umtx_pi_setowner(struct umtx_pi *pi, str struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); if (pi->pi_owner != NULL) panic("pi_ower != NULL"); pi->pi_owner = owner; @@ -1404,9 +1411,9 @@ umtx_pi_claim(struct umtx_pi *pi, struct struct umtx_q *uq, *uq_owner; uq_owner = owner->td_umtxq; - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (pi->pi_owner == owner) { - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); return (0); } @@ -1414,7 +1421,7 @@ umtx_pi_claim(struct umtx_pi *pi, struct /* * userland may have already messed the mutex, sigh. 
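/*
 * A userland sketch of the chain walk umtx_propagate_priority() performs
 * above (lower numbers are better priorities, as in the kernel): a waiter
 * lends its priority to the owner of the PI mutex it blocks on, and keeps
 * walking while owners are themselves blocked.  The structures here are
 * illustrative; the real code walks struct umtx_pi / struct umtx_q under
 * umtx_lock and calls sched_lend_user_prio() with the owner's thread lock
 * held.
 */
#include <stdio.h>

struct xpi;				/* a PI-aware userland mutex */
struct xtd {				/* a thread */
	int		pri;		/* current (possibly lent) priority */
	struct xpi	*blocked_on;	/* PI mutex this thread sleeps on */
};
struct xpi {
	struct xtd	*owner;
};

static void
propagate(struct xtd *waiter)
{
	int pri = waiter->pri;
	struct xpi *pi = waiter->blocked_on;

	while (pi != NULL && pi->owner != NULL) {
		struct xtd *owner = pi->owner;

		if (owner->pri <= pri)	/* owner already runs at least this well */
			break;
		owner->pri = pri;	/* sched_lend_user_prio(owner, pri) */
		pi = owner->blocked_on;	/* keep going if the owner is blocked too */
	}
}

int
main(void)
{
	struct xtd c = { 200, NULL };	/* owns pi1, not blocked */
	struct xpi pi1 = { &c };
	struct xtd b = { 180, &pi1 };	/* owns pi0, blocked on pi1 */
	struct xpi pi0 = { &b };
	struct xtd a = { 140, &pi0 };	/* high-priority waiter */

	propagate(&a);
	printf("b.pri=%d c.pri=%d\n", b.pri, c.pri);	/* both lent 140 */
	return (0);
}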
*/ - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); return (EPERM); } umtx_pi_setowner(pi, owner); @@ -1423,10 +1430,12 @@ umtx_pi_claim(struct umtx_pi *pi, struct int pri; pri = UPRI(uq->uq_thread); + thread_lock(owner); if (pri < UPRI(owner)) sched_lend_user_prio(owner, pri); + thread_unlock(owner); } - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); return (0); } @@ -1442,7 +1451,7 @@ umtx_pi_adjust(struct thread *td, u_char uq = td->td_umtxq; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); MPASS(TD_ON_UPILOCK(td)); /* @@ -1493,14 +1502,14 @@ umtxq_sleep_pi(struct umtx_q *uq, struct */ PROC_LOCK(curproc); td1 = thread_find(curproc, owner); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (td1 != NULL && pi->pi_owner == NULL) { uq1 = td1->td_umtxq; umtx_pi_setowner(pi, td1); } PROC_UNLOCK(curproc); } else { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); } TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { @@ -1516,12 +1525,12 @@ umtxq_sleep_pi(struct umtx_q *uq, struct uq->uq_pi_blocked = pi; td->td_flags |= TDF_UPIBLOCKED; - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); umtxq_unlock(&uq->uq_key); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); umtx_propagate_priority(td); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); umtxq_lock(&uq->uq_key); if (uq->uq_flags & UQF_UMTXQ) { @@ -1536,12 +1545,12 @@ umtxq_sleep_pi(struct umtx_q *uq, struct } umtxq_unlock(&uq->uq_key); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_pi_blocked = NULL; td->td_flags &= ~TDF_UPIBLOCKED; TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); umtx_unpropagate_priority(pi); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); umtxq_lock(&uq->uq_key); @@ -1575,7 +1584,7 @@ umtx_pi_unref(struct umtx_pi *pi) UMTXQ_LOCKED_ASSERT(uc); KASSERT(pi->pi_refcount > 0, ("invalid reference count")); if (--pi->pi_refcount == 0) { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (pi->pi_owner != NULL) { TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); @@ -1583,7 +1592,7 @@ umtx_pi_unref(struct umtx_pi *pi) } KASSERT(TAILQ_EMPTY(&pi->pi_blocked), ("blocked queue not empty")); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink); free = 1; } @@ -1822,7 +1831,7 @@ do_unlock_pi(struct thread *td, struct u return (EPERM); } uq_me = curthread->td_umtxq; - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); pi->pi_owner = NULL; TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link); uq_first = TAILQ_FIRST(&pi->pi_blocked); @@ -1834,8 +1843,10 @@ do_unlock_pi(struct thread *td, struct u pri = UPRI(uq_first2->uq_thread); } } + thread_lock(curthread); sched_unlend_user_prio(curthread, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); + mtx_unlock_spin(&umtx_lock); } umtxq_unlock(&key); @@ -1891,18 +1902,20 @@ _do_lock_pp(struct thread *td, struct um goto out; } - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; + thread_lock(td); if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, 
id | UMUTEX_CONTESTED); @@ -1943,7 +1956,7 @@ _do_lock_pp(struct thread *td, struct um umtxq_remove(uq); umtxq_unlock(&uq->uq_key); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { @@ -1955,12 +1968,14 @@ _do_lock_pp(struct thread *td, struct um } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; + thread_lock(td); sched_unlend_user_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + mtx_unlock_spin(&umtx_lock); } if (error != 0) { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { @@ -1972,8 +1987,10 @@ _do_lock_pp(struct thread *td, struct um } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; + thread_lock(td); sched_unlend_user_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + mtx_unlock_spin(&umtx_lock); } out: @@ -2048,7 +2065,7 @@ do_unlock_pp(struct thread *td, struct u if (error == -1) error = EFAULT; else { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; @@ -2061,8 +2078,10 @@ do_unlock_pp(struct thread *td, struct u } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; + thread_lock(td); sched_unlend_user_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + mtx_unlock_spin(&umtx_lock); } umtx_key_release(&key); return (error); @@ -2748,12 +2767,12 @@ umtx_thread_cleanup(struct thread *td) if ((uq = td->td_umtxq) == NULL) return; - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } td->td_flags &= ~TDF_UBORROWING; - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); } Index: kern/ksched.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/ksched.c,v retrieving revision 1.35 diff -u -p -r1.35 ksched.c --- kern/ksched.c 6 Dec 2006 06:34:55 -0000 1.35 +++ kern/ksched.c 18 May 2007 10:37:02 -0000 @@ -104,9 +104,7 @@ getscheduler(struct ksched *ksched, stru struct rtprio rtp; int e = 0; - mtx_lock_spin(&sched_lock); pri_to_rtp(td, &rtp); - mtx_unlock_spin(&sched_lock); switch (rtp.type) { case RTP_PRIO_FIFO: @@ -151,9 +149,7 @@ ksched_getparam(struct ksched *ksched, { struct rtprio rtp; - mtx_lock_spin(&sched_lock); pri_to_rtp(td, &rtp); - mtx_unlock_spin(&sched_lock); if (RTP_PRIO_IS_REALTIME(rtp.type)) param->sched_priority = rtpprio_to_p4prio(rtp.prio); @@ -186,9 +182,7 @@ ksched_setscheduler(struct ksched *ksche rtp.type = (policy == SCHED_FIFO) ? 
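/*
 * A sketch of how _do_lock_pp()/do_unlock_pp() above recompute the priority
 * to "unlend" back to: start from PRI_MAX, take the best (numerically
 * lowest) priority among the first waiter of every priority-protected mutex
 * the thread still holds, and never drop below the inherited ceiling
 * (PRI_MIN_REALTIME + ceiling).  The constants and flat arrays are
 * illustrative, not the kernel's data structures.
 */
#include <stdio.h>

#define PRI_MAX			255
#define PRI_MIN_REALTIME	128	/* illustrative value */

static int
recompute_unlend(const int *first_waiter_pri, int nheld, int inherited_pri)
{
	int pri = PRI_MAX;
	int i;

	for (i = 0; i < nheld; i++)	/* TAILQ_FOREACH over uq_pi_contested */
		if (first_waiter_pri[i] < pri)
			pri = first_waiter_pri[i];
	if (pri > inherited_pri)	/* never below the ceiling we took */
		pri = inherited_pri;
	return (pri);
}

int
main(void)
{
	int waiters[2] = { 190, 170 };	/* best waiter on each held PP mutex */
	int ceiling = 40;
	int inherited = PRI_MIN_REALTIME + ceiling;	/* as in _do_lock_pp() */

	printf("unlend to %d\n", recompute_unlend(waiters, 2, inherited));
	return (0);
}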
RTP_PRIO_FIFO : RTP_PRIO_REALTIME; - mtx_lock_spin(&sched_lock); rtp_to_pri(&rtp, td); - mtx_unlock_spin(&sched_lock); } else e = EPERM; @@ -200,9 +194,7 @@ ksched_setscheduler(struct ksched *ksche { rtp.type = RTP_PRIO_NORMAL; rtp.prio = p4prio_to_rtpprio(param->sched_priority); - mtx_lock_spin(&sched_lock); rtp_to_pri(&rtp, td); - mtx_unlock_spin(&sched_lock); } break; Index: kern/sched_4bsd.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sched_4bsd.c,v retrieving revision 1.97 diff -u -p -r1.97 sched_4bsd.c --- kern/sched_4bsd.c 27 Feb 2007 17:23:27 -0000 1.97 +++ kern/sched_4bsd.c 31 May 2007 22:17:55 -0000 @@ -248,7 +248,7 @@ static void maybe_resched(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } @@ -377,10 +377,7 @@ schedcpu(void) realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { - /* - * Prevent state changes and protect run queue. - */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * Increment time in/out of memory. We ignore overflow; with * 16-bit int's (remember them?) overflow takes 45 days. @@ -388,6 +385,7 @@ schedcpu(void) p->p_swtime++; FOREACH_THREAD_IN_PROC(p, td) { awake = 0; + thread_lock(td); ts = td->td_sched; /* * Increment sleep time (if sleeping). We @@ -456,13 +454,16 @@ XXX this is broken td->td_slptime = 0; } else td->td_slptime++; - if (td->td_slptime > 1) + if (td->td_slptime > 1) { + thread_unlock(td); continue; + } td->td_estcpu = decay_cpu(loadfac, td->td_estcpu); resetpriority(td); resetpriority_thread(td); + thread_unlock(td); } /* end of thread loop */ - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } /* end of process loop */ sx_sunlock(&allproc_lock); } @@ -575,6 +576,7 @@ schedinit(void) */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; + thread0.td_lock = &sched_lock; td_sched0.ts_thread = &thread0; } @@ -615,7 +617,7 @@ sched_clock(struct thread *td) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; ts->ts_cpticks++; @@ -635,22 +637,23 @@ sched_exit(struct proc *p, struct thread CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", td, td->td_proc->p_comm, td->td_priority); - + PROC_SLOCK_ASSERT(p, MA_OWNED); sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); } void sched_exit_thread(struct thread *td, struct thread *child) { - struct proc *childproc = child->td_proc; CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", - child, childproc->p_comm, child->td_priority); + child, child->td_proc->p_comm, child->td_priority); + thread_lock(td); td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu); - childproc->p_estcpu = ESTCPULIM(childproc->p_estcpu + - child->td_estcpu); + thread_unlock(td); + mtx_lock_spin(&sched_lock); if ((child->td_proc->p_flag & P_NOLOAD) == 0) sched_load_rem(); + mtx_unlock_spin(&sched_lock); } void @@ -663,6 +666,7 @@ void sched_fork_thread(struct thread *td, struct thread *childtd) { childtd->td_estcpu = td->td_estcpu; + childtd->td_lock = &sched_lock; sched_newthread(childtd); } @@ -672,18 +676,20 @@ sched_nice(struct proc *p, int nice) struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); resetpriority(td); resetpriority_thread(td); + thread_unlock(td); } } void sched_class(struct 
thread *td, int class) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_pri_class = class; } @@ -697,7 +703,7 @@ sched_priority(struct thread *td, u_char td, td->td_proc->p_comm, td->td_priority, prio, curthread, curthread->td_proc->p_comm); - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; td->td_priority = prio; @@ -818,7 +824,7 @@ void sched_sleep(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptime = 0; } @@ -831,26 +837,18 @@ sched_switch(struct thread *td, struct t ts = td->td_sched; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); + /* + * Switch to the sched lock to fix things up and pick + * a new thread. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_unlock(td); + } if ((p->p_flag & P_NOLOAD) == 0) sched_load_rem(); -#if 0 - /* - * We are volunteering to switch out so we get to nominate - * a successor for the rest of our quantum - * First try another thread in our process - * - * this is too expensive to do without per process run queues - * so skip it for now. - * XXX keep this comment as a marker. - */ - if (sched_followon && - (p->p_flag & P_HADTHREADS) && - (flags & SW_VOL) && - newtd == NULL) - newtd = mumble(); -#endif if (newtd) newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED); @@ -896,6 +894,7 @@ sched_switch(struct thread *td, struct t } else { newtd = choosethread(); } + MPASS(newtd->td_lock == &sched_lock); if (td != newtd) { #ifdef HWPMC_HOOKS @@ -904,7 +903,7 @@ sched_switch(struct thread *td, struct t #endif /* I feel sleepy */ - cpu_switch(td, newtd); + cpu_switch(td, newtd, td->td_lock); /* * Where am I? What year is it? * We are in the same thread that went to sleep above, @@ -932,12 +931,13 @@ sched_switch(struct thread *td, struct t #endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); + MPASS(td->td_lock == &sched_lock); } void sched_wakeup(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_slptime > 1) { updatepri(td); resetpriority(td); @@ -1079,7 +1079,7 @@ sched_add(struct thread *td, int flags) int single_cpu = 0; ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), @@ -1089,6 +1089,14 @@ sched_add(struct thread *td, int flags) CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. 
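/*
 * A userland sketch of the td_lock hand-off done in sched_add() above: the
 * caller already holds the thread's current lock, acquires the scheduler
 * lock, points td_lock at it, and releases the old lock.  pthread mutexes
 * stand in for spin mutexes, and lock_set() mimics what thread_lock_set()
 * is assumed to do here; it is not the kernel implementation.
 */
#include <pthread.h>
#include <stdio.h>

struct xthread {
	pthread_mutex_t	*lock;	/* td_lock: may point at different mutexes */
};

static pthread_mutex_t sleepq_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t sched_lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller holds both *td->lock and *nlock; afterwards only *nlock is held. */
static void
lock_set(struct xthread *td, pthread_mutex_t *nlock)
{
	pthread_mutex_t *old = td->lock;

	td->lock = nlock;
	pthread_mutex_unlock(old);
}

int
main(void)
{
	struct xthread td = { &sleepq_lock };

	pthread_mutex_lock(td.lock);	/* thread_lock(td): some other lock */
	if (td.lock != &sched_lock) {
		pthread_mutex_lock(&sched_lock);	/* mtx_lock_spin(&sched_lock) */
		lock_set(&td, &sched_lock);		/* thread_lock_set(td, &sched_lock) */
	}
	/* ...put the thread on the run queue... */
	pthread_mutex_unlock(td.lock);	/* thread_unlock(td): drops sched_lock */
	printf("td.lock now %s\n", td.lock == &sched_lock ? "sched_lock" : "other");
	return (0);
}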
+ */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } TD_SET_RUNQ(td); if (td->td_pinned != 0) { @@ -1140,7 +1148,7 @@ sched_add(struct thread *td, int flags) { struct td_sched *ts; ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), @@ -1150,6 +1158,14 @@ sched_add(struct thread *td, int flags) CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } TD_SET_RUNQ(td); CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td); ts->ts_runq = &runq; @@ -1207,6 +1223,7 @@ sched_choose(void) struct td_sched *ts; struct runq *rq; + mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP struct td_sched *kecpu; @@ -1256,10 +1273,10 @@ sched_userret(struct thread *td) KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } @@ -1268,7 +1285,7 @@ sched_bind(struct thread *td, int cpu) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("sched_bind: cannot bind non-running thread")); @@ -1287,25 +1304,26 @@ sched_bind(struct thread *td, int cpu) void sched_unbind(struct thread* td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_sched->ts_flags &= ~TSF_BOUND; } int sched_is_bound(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_sched->ts_flags & TSF_BOUND); } void sched_relinquish(struct thread *td) { - mtx_lock_spin(&sched_lock); + thread_lock(td); if (td->td_pri_class == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); + SCHED_STAT_INC(switch_relinquish); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } int @@ -1363,5 +1381,57 @@ sched_idletd(void *dummy) } } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. 
+ */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. + */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + #define KERN_SWITCH_INCLUDE 1 #include "kern/kern_switch.c" Index: kern/sched_core.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sched_core.c,v retrieving revision 1.13 diff -u -p -r1.13 sched_core.c --- kern/sched_core.c 8 Mar 2007 06:44:33 -0000 1.13 +++ kern/sched_core.c 31 May 2007 23:38:26 -0000 @@ -784,6 +784,7 @@ schedinit(void) */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &kse0; + thread0.td_lock = &sched_lock; kse0.ts_thread = &thread0; kse0.ts_slice = 100; } @@ -1018,7 +1019,7 @@ sched_switch(struct thread *td, struct t if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif - cpu_switch(td, newtd); + cpu_switch(td, newtd, td->td_lock); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); @@ -1110,6 +1111,7 @@ sched_fork_thread(struct thread *td, str ts = td->td_sched; ts2 = child->td_sched; + child->td_lock = td->td_lock; ts2->ts_slptime = ts2->ts_slptime * CHILD_WEIGHT / 100; if (child->td_pri_class == PRI_TIMESHARE) sched_user_prio(child, sched_calc_pri(ts2)); @@ -1142,7 +1144,8 @@ sched_class(struct thread *td, int class void sched_exit(struct proc *p, struct thread *childtd) { - mtx_assert(&sched_lock, MA_OWNED); + + PROC_SLOCK_ASSERT(p, MA_OWNED); sched_exit_thread(FIRST_THREAD_IN_PROC(p), childtd); } @@ -1747,5 +1750,57 @@ sched_idletd(void *dummy) } } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. 
+ */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. + */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + #define KERN_SWITCH_INCLUDE 1 #include "kern/kern_switch.c" Index: kern/sched_smp.c =================================================================== RCS file: kern/sched_smp.c diff -N kern/sched_smp.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ kern/sched_smp.c 31 May 2007 22:18:36 -0000 @@ -0,0 +1,2320 @@ +/*- + * Copyright (c) 2002-2007, Jeffrey Roberson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.192 2007/04/20 05:45:46 kmacy Exp $"); + +#include "opt_hwpmc_hooks.h" +#include "opt_sched.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#include +#endif + +#ifdef HWPMC_HOOKS +#include +#endif + +#include +#include + +#ifndef PREEMPTION +#error "SCHED_ULE requires options PREEMPTION" +#endif + +/* + * TODO: + * Pick idle from affinity group or self group first. + * Implement pick_score. + */ + +#define KTR_ULE KTR_SCHED /* Enable for pickpri debugging. */ + +/* + * Thread scheduler specific section. + */ +struct td_sched { + TAILQ_ENTRY(td_sched) ts_procq; /* (j/z) Run queue. */ + int ts_flags; /* (j) TSF_* flags. 
*/ + struct thread *ts_thread; /* (*) Active associated thread. */ + u_char ts_rqindex; /* (j) Run queue index. */ + int ts_slptime; + int ts_slice; + struct runq *ts_runq; + u_char ts_cpu; /* CPU that we have affinity for. */ + /* The following variables are only used for pctcpu calculation */ + int ts_ltick; /* Last tick that we were running on */ + int ts_ftick; /* First tick that we were running on */ + int ts_ticks; /* Tick count */ +#ifdef SMP + int ts_rltick; /* Real last tick, for affinity. */ +#endif + + /* originally from kg_sched */ + u_int skg_slptime; /* Number of ticks we vol. slept */ + u_int skg_runtime; /* Number of ticks we were running */ +}; +/* flags kept in ts_flags */ +#define TSF_BOUND 0x0001 /* Thread can not migrate. */ +#define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ + +static struct td_sched td_sched0; + +/* + * Cpu percentage computation macros and defines. + * + * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. + * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. + * SCHED_TICK_MAX: Maximum number of ticks before scaling back. + * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. + * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. + * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. + */ +#define SCHED_TICK_SECS 10 +#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) +#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) +#define SCHED_TICK_SHIFT 10 +#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) +#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) + +/* + * These macros determine priorities for non-interactive threads. They are + * assigned a priority based on their recent cpu utilization as expressed + * by the ratio of ticks to the tick total. NHALF priorities at the start + * and end of the MIN to MAX timeshare range are only reachable with negative + * or positive nice respectively. + * + * PRI_RANGE: Priority range for utilization dependent priorities. + * PRI_NRESV: Number of nice values. + * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. + * PRI_NICE: Determines the part of the priority inherited from nice. + */ +#define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) +#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) +#define SCHED_PRI_MIN (PRI_MIN_TIMESHARE + SCHED_PRI_NHALF) +#define SCHED_PRI_MAX (PRI_MAX_TIMESHARE - SCHED_PRI_NHALF) +#define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) +#define SCHED_PRI_TICKS(ts) \ + (SCHED_TICK_HZ((ts)) / \ + (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) +#define SCHED_PRI_NICE(nice) (nice) + +/* + * These determine the interactivity of a process. Interactivity differs from + * cpu utilization in that it expresses the voluntary time slept vs time ran + * while cpu utilization includes all time not running. This more accurately + * models the intent of the thread. + * + * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate + * before throttling back. + * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. + * INTERACT_MAX: Maximum interactivity value. Smaller is better. + * INTERACT_THRESH: Threshhold for placement on the current runq. 
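/*
 * A standalone walk-through of the SCHED_PRI_TICKS arithmetic defined above:
 * recent cpu use (ticks run vs. total wall ticks) is scaled into the
 * timeshare priority range and added to SCHED_PRI_MIN plus the nice value.
 * The PRI_*_TIMESHARE and PRIO_* numbers below are illustrative stand-ins
 * for the <sys/priority.h> and <sys/resource.h> values.
 */
#include <stdio.h>

#define roundup(x, y)		((((x) + ((y) - 1)) / (y)) * (y))
#define max(a, b)		((a) > (b) ? (a) : (b))

#define PRIO_MIN		-20
#define PRIO_MAX		20
#define PRI_MIN_TIMESHARE	160	/* illustrative */
#define PRI_MAX_TIMESHARE	223	/* illustrative */

#define SCHED_TICK_SHIFT	10
#define SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
#define SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
#define SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
#define SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)

int
main(void)
{
	int hz = 1000;
	int ticks = 250 << SCHED_TICK_SHIFT;	/* ts_ticks: ran 250 of the last... */
	int total = max(1000, hz);		/* ...1000 hz ticks (ts_ltick - ts_ftick) */
	int p_nice = 0;
	int tick_hz = ticks >> SCHED_TICK_SHIFT;	/* SCHED_TICK_HZ() */
	int pri_ticks = tick_hz /			/* SCHED_PRI_TICKS() */
	    (roundup(total, SCHED_PRI_RANGE) / SCHED_PRI_RANGE);
	int pri = SCHED_PRI_MIN + pri_ticks + p_nice;	/* as in sched_priority() */

	printf("25%% cpu -> priority %d (range %d..%d)\n",
	    pri, SCHED_PRI_MIN, SCHED_PRI_MAX);
	return (0);
}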
+ */ +#define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) +#define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) +#define SCHED_INTERACT_MAX (100) +#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) +#define SCHED_INTERACT_THRESH (30) + +/* + * tickincr: Converts a stathz tick into a hz domain scaled by + * the shift factor. Without the shift the error rate + * due to rounding would be unacceptably high. + * realstathz: stathz is sometimes 0 and run off of hz. + * sched_slice: Runtime of each thread before rescheduling. + */ +static int sched_interact = SCHED_INTERACT_THRESH; +static int realstathz; +static int tickincr; +static int sched_slice; + +/* + * tdq - per processor runqs and statistics. + */ +struct tdq { + struct mtx tdq_lock; + struct runq tdq_idle; /* Queue of IDLE threads. */ + struct runq tdq_timeshare; /* timeshare run queue. */ + struct runq tdq_realtime; /* real-time run queue. */ + u_char tdq_idx; /* Current insert index. */ + u_char tdq_ridx; /* Current removal index. */ + short tdq_flags; /* Thread queue flags */ + int tdq_load; /* Aggregate load. */ +#ifdef SMP + int tdq_transferable; + LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */ + struct tdq_group *tdq_group; /* Our processor group. */ +#else + int tdq_sysload; /* For loadavg, !ITHD load. */ +#endif + char tdq_name[16]; /* lock name */ +}; + +#define TDQF_BUSY 0x0001 /* Queue is marked as busy */ + +#ifdef SMP +/* + * tdq groups are groups of processors which can cheaply share threads. When + * one processor in the group goes idle it will check the runqs of the other + * processors in its group prior to halting and waiting for an interrupt. + * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. + * In a numa environment we'd want an idle bitmap per group and a two tiered + * load balancer. + */ +struct tdq_group { + int tdg_cpus; /* Count of CPUs in this tdq group. */ + cpumask_t tdg_cpumask; /* Mask of cpus in this group. */ + cpumask_t tdg_idlemask; /* Idle cpus in this group. */ + cpumask_t tdg_mask; /* Bit mask for first cpu. */ + int tdg_load; /* Total load of this group. */ + int tdg_transferable; /* Transferable load of this group. */ + LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */ +}; + +#define SCHED_AFFINITY_DEFAULT (hz / 100) +#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity) + +/* + * Run-time tunables. + */ +static int rebalance = 0; +static int pick_pri = 0; +static int affinity; +static int tryself = 1; +static int tryselfidle = 1; +static int ipi_ast = 0; +static int ipi_preempt = 1; +static int ipi_thresh = PRI_MIN_KERN; +static int steal_htt = 0; +static int steal_busy = 0; +static int busy_thresh = 4; +static int topology = 0; + +/* + * One thread queue per processor. 
+ */ +static volatile cpumask_t tdq_idle; +static volatile cpumask_t tdq_busy; +static int tdg_maxid; +static struct tdq tdq_cpu[MAXCPU]; +static struct tdq_group tdq_groups[MAXCPU]; +static int bal_tick; +static int gbal_tick; +static int balance_groups; + +#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) +#define TDQ_CPU(x) (&tdq_cpu[(x)]) +#define TDQ_ID(x) ((x) - tdq_cpu) +#define TDQ_GROUP(x) (&tdq_groups[(x)]) +#else /* !SMP */ +static struct tdq tdq_cpu; + +#define TDQ_SELF() (&tdq_cpu) +#define TDQ_CPU(x) (&tdq_cpu) +#endif + +#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) +#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) +#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) +#if 0 +#define TDQ_LOCKPTR(t) (&sched_lock) +#else +#define TDQ_LOCKPTR(t) (&(t)->tdq_lock) +#endif + +static void sched_priority(struct thread *); +static void sched_thread_priority(struct thread *, u_char); +static int sched_interact_score(struct thread *); +static void sched_interact_update(struct thread *); +static void sched_interact_fork(struct thread *); +static void sched_pctcpu_update(struct td_sched *); +static inline void sched_pin_td(struct thread *td); +static inline void sched_unpin_td(struct thread *td); + +/* Operations on per processor queues */ +static struct td_sched * tdq_choose(struct tdq *); +static void tdq_setup(struct tdq *); +static void tdq_load_add(struct tdq *, struct td_sched *); +static void tdq_load_rem(struct tdq *, struct td_sched *); +static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int); +static __inline void tdq_runq_rem(struct tdq *, struct td_sched *); +void tdq_print(int cpu); +static void runq_print(struct runq *rq); +#ifdef SMP +static struct tdq *tdq_pickidle(struct td_sched *); +static struct tdq *tdq_pickpri(struct td_sched *, int); +static struct td_sched *runq_steal(struct runq *); +static void sched_balance(void); +static void sched_balance_groups(void); +static void sched_balance_group(struct tdq_group *); +static void sched_balance_pair(struct tdq *, struct tdq *); +static void sched_smp_tick(struct thread *); +static void tdq_move(struct tdq *, int); +static int tdq_idled(struct tdq *); +static void tdq_notify(struct td_sched *); +static struct td_sched *tdq_steal(struct tdq *, int); + +#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) +#endif + +static void sched_setup(void *dummy); +SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) + +static void sched_initticks(void *dummy); +SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL) + +static inline void +sched_pin_td(struct thread *td) +{ + td->td_pinned++; +} + +static inline void +sched_unpin_td(struct thread *td) +{ + td->td_pinned--; +} + +static void +runq_print(struct runq *rq) +{ + struct rqhead *rqh; + struct td_sched *ts; + int pri; + int j; + int i; + + for (i = 0; i < RQB_LEN; i++) { + printf("\t\trunq bits %d 0x%zx\n", + i, rq->rq_status.rqb_bits[i]); + for (j = 0; j < RQB_BPW; j++) + if (rq->rq_status.rqb_bits[i] & (1ul << j)) { + pri = j + (i << RQB_L2BPW); + rqh = &rq->rq_queues[pri]; + TAILQ_FOREACH(ts, rqh, ts_procq) { + printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", + ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri); + } + } + } +} + +void +tdq_print(int cpu) +{ + struct tdq *tdq; + + tdq = TDQ_CPU(cpu); + + printf("tdq:\n"); + printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq)); + printf("\tlock name %s\n", tdq->tdq_name); + printf("\tload: %d\n", 
tdq->tdq_load); + printf("\ttimeshare idx: %d\n", tdq->tdq_idx); + printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); + printf("\trealtime runq:\n"); + runq_print(&tdq->tdq_realtime); + printf("\ttimeshare runq:\n"); + runq_print(&tdq->tdq_timeshare); + printf("\tidle runq:\n"); + runq_print(&tdq->tdq_idle); +#ifdef SMP + printf("\tload transferable: %d\n", tdq->tdq_transferable); +#endif +} + +static __inline void +tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) +{ + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); +#ifdef SMP + if (THREAD_CAN_MIGRATE(ts->ts_thread)) { + tdq->tdq_transferable++; + tdq->tdq_group->tdg_transferable++; + ts->ts_flags |= TSF_XFERABLE; + if (tdq->tdq_transferable >= busy_thresh && + (tdq->tdq_flags & TDQF_BUSY) == 0) { + tdq->tdq_flags |= TDQF_BUSY; + atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq)); + } + } +#endif + if (ts->ts_runq == &tdq->tdq_timeshare) { + u_char pri; + + pri = ts->ts_thread->td_priority; + KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE, + ("Invalid priority %d on timeshare runq", pri)); + /* + * This queue contains only priorities between MIN and MAX + * realtime. Use the whole queue to represent these values. + */ +#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) + if ((flags & SRQ_BORROWING) == 0) { + pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ; + pri = (pri + tdq->tdq_idx) % RQ_NQS; + /* + * This effectively shortens the queue by one so we + * can have a one slot difference between idx and + * ridx while we wait for threads to drain. + */ + if (tdq->tdq_ridx != tdq->tdq_idx && + pri == tdq->tdq_ridx) + pri = (unsigned char)(pri - 1) % RQ_NQS; + } else + pri = tdq->tdq_ridx; + runq_add_pri(ts->ts_runq, ts, pri, flags); + } else + runq_add(ts->ts_runq, ts, flags); +} + +static __inline void +tdq_runq_rem(struct tdq *tdq, struct td_sched *ts) +{ + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + KASSERT(ts->ts_runq != NULL, + ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread)); +#ifdef SMP + if (ts->ts_flags & TSF_XFERABLE) { + tdq->tdq_transferable--; + tdq->tdq_group->tdg_transferable--; + ts->ts_flags &= ~TSF_XFERABLE; + if (tdq->tdq_transferable < busy_thresh && + (tdq->tdq_flags & TDQF_BUSY)) { + atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq)); + tdq->tdq_flags &= ~TDQF_BUSY; + } + } +#endif + if (ts->ts_runq == &tdq->tdq_timeshare) { + if (tdq->tdq_idx != tdq->tdq_ridx) + runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx); + else + runq_remove_idx(ts->ts_runq, ts, NULL); + /* + * For timeshare threads we update the priority here so + * the priority reflects the time we've been sleeping. 
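/*
 * A small demonstration of the calendar-style index computed in
 * tdq_runq_add() above for timeshare threads: the priority is turned into an
 * offset from the current insertion index (tdq_idx), taken modulo the number
 * of run queues, and nudged off the current removal index (tdq_ridx) so a
 * draining queue is not refilled.  RQ_NQS and the priority bounds are
 * illustrative stand-ins for the <sys/runq.h> and <sys/priority.h> values.
 */
#include <stdio.h>

#define RQ_NQS			64	/* illustrative */
#define PRI_MIN_TIMESHARE	160	/* illustrative */
#define PRI_MAX_TIMESHARE	223	/* illustrative */
#define TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)

static unsigned char
ts_slot(unsigned char pri, unsigned char idx, unsigned char ridx)
{
	pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
	pri = (pri + idx) % RQ_NQS;
	/* Keep one slot of separation between the insert and remove indices. */
	if (ridx != idx && pri == ridx)
		pri = (unsigned char)(pri - 1) % RQ_NQS;
	return (pri);
}

int
main(void)
{
	/* Priority 185 inserted while idx=10, ridx=5: lands in slot 35. */
	printf("slot %u\n", ts_slot(185, 10, 5));
	/* The same priority when slot 35 would collide with ridx: backs off to 34. */
	printf("slot %u\n", ts_slot(185, 10, 35));
	return (0);
}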
+ */ + ts->ts_ltick = ticks; + sched_pctcpu_update(ts); + sched_priority(ts->ts_thread); + } else + runq_remove(ts->ts_runq, ts); +} + +static void +tdq_load_add(struct tdq *tdq, struct td_sched *ts) +{ + int class; + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); + class = PRI_BASE(ts->ts_thread->td_pri_class); + tdq->tdq_load++; + CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load); + if (class != PRI_ITHD && + (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) +#ifdef SMP + tdq->tdq_group->tdg_load++; +#else + tdq->tdq_sysload++; +#endif +} + +static void +tdq_load_rem(struct tdq *tdq, struct td_sched *ts) +{ + int class; + + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + class = PRI_BASE(ts->ts_thread->td_pri_class); + if (class != PRI_ITHD && + (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) +#ifdef SMP + tdq->tdq_group->tdg_load--; +#else + tdq->tdq_sysload--; +#endif + tdq->tdq_load--; + CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); + ts->ts_runq = NULL; +} + +#ifdef SMP +static void +sched_smp_tick(struct thread *td) +{ + struct tdq *tdq; + + tdq = TDQ_SELF(); + if (rebalance) { + if (ticks >= bal_tick) + sched_balance(); + if (ticks >= gbal_tick && balance_groups) + sched_balance_groups(); + } +} + +/* + * sched_balance is a simple CPU load balancing algorithm. It operates by + * finding the least loaded and most loaded cpu and equalizing their load + * by migrating some processes. + * + * Dealing only with two CPUs at a time has two advantages. Firstly, most + * installations will only have 2 cpus. Secondly, load balancing too much at + * once can have an unpleasant effect on the system. The scheduler rarely has + * enough information to make perfect decisions. So this algorithm chooses + * algorithm simplicity and more gradual effects on load in larger systems. + * + * It could be improved by considering the priorities and slices assigned to + * each task prior to balancing them. There are many pathological cases with + * any approach and so the semi random algorithm below may work as well as any. + * + */ +static void +sched_balance(void) +{ + struct tdq_group *high; + struct tdq_group *low; + struct tdq_group *tdg; + int cnt; + int i; + + bal_tick = ticks + (random() % (hz * 2)); + if (smp_started == 0) + return; + low = high = NULL; + i = random() % (tdg_maxid + 1); + for (cnt = 0; cnt <= tdg_maxid; cnt++) { + tdg = TDQ_GROUP(i); + /* + * Find the CPU with the highest load that has some + * threads to transfer. 
+ */ + if ((high == NULL || tdg->tdg_load > high->tdg_load) + && tdg->tdg_transferable) + high = tdg; + if (low == NULL || tdg->tdg_load < low->tdg_load) + low = tdg; + if (++i > tdg_maxid) + i = 0; + } + if (low != NULL && high != NULL && high != low) + sched_balance_pair(LIST_FIRST(&high->tdg_members), + LIST_FIRST(&low->tdg_members)); +} + +static void +sched_balance_groups(void) +{ + int i; + + gbal_tick = ticks + (random() % (hz * 2)); + if (smp_started) + for (i = 0; i <= tdg_maxid; i++) + sched_balance_group(TDQ_GROUP(i)); +} + +static void +sched_balance_group(struct tdq_group *tdg) +{ + struct tdq *tdq; + struct tdq *high; + struct tdq *low; + int load; + + if (tdg->tdg_transferable == 0) + return; + low = NULL; + high = NULL; + LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { + load = tdq->tdq_load; + if (high == NULL || load > high->tdq_load) + high = tdq; + if (low == NULL || load < low->tdq_load) + low = tdq; + } + if (high != NULL && low != NULL && high != low) + sched_balance_pair(high, low); +} + +static void +sched_balance_pair(struct tdq *high, struct tdq *low) +{ + int transferable; + int high_load; + int low_load; + int move; + int diff; + int i; + + /* + * If we're transfering within a group we have to use this specific + * tdq's transferable count, otherwise we can steal from other members + * of the group. + */ + if (high->tdq_group == low->tdq_group) { + transferable = high->tdq_transferable; + high_load = high->tdq_load; + low_load = low->tdq_load; + } else { + transferable = high->tdq_group->tdg_transferable; + high_load = high->tdq_group->tdg_load; + low_load = low->tdq_group->tdg_load; + } + if (transferable == 0) + return; + /* + * Determine what the imbalance is and then adjust that to how many + * threads we actually have to give up (transferable). + */ + diff = high_load - low_load; + move = diff / 2; + if (diff & 0x1) + move++; + move = min(move, transferable); + for (i = 0; i < move; i++) + tdq_move(high, TDQ_ID(low)); + return; +} + +static void +tdq_move(struct tdq *from, int cpu) +{ + struct tdq *tdq; + struct tdq *to; + struct td_sched *ts; + + tdq = from; + to = TDQ_CPU(cpu); + ts = tdq_steal(tdq, 1); + if (ts == NULL) { + struct tdq_group *tdg; + + tdg = tdq->tdq_group; + LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { + if (tdq == from || tdq->tdq_transferable == 0) + continue; + ts = tdq_steal(tdq, 1); + break; + } + if (ts == NULL) + panic("tdq_move: No threads available with a " + "transferable count of %d\n", + tdg->tdg_transferable); + } + if (tdq == to) + return; + sched_rem(ts->ts_thread); + ts->ts_cpu = cpu; + sched_pin_td(ts->ts_thread); + sched_add(ts->ts_thread, SRQ_YIELDING); + sched_unpin_td(ts->ts_thread); +} + +static int +tdq_idled(struct tdq *tdq) +{ + struct tdq_group *tdg; + struct tdq *steal; + struct td_sched *ts; + + spinlock_enter(); + tdg = tdq->tdq_group; + /* + * If we're in a cpu group, try and steal threads from another cpu in + * the group before idling. 
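/*
 * The arithmetic sched_balance_pair() above uses to decide how many threads
 * to migrate: half the load imbalance, rounded up, capped by how many
 * threads are actually transferable.  Standalone and illustrative only.
 */
#include <stdio.h>

#define min(a, b)	((a) < (b) ? (a) : (b))

static int
balance_move(int high_load, int low_load, int transferable)
{
	int diff = high_load - low_load;
	int move = diff / 2;

	if (diff & 0x1)		/* odd imbalance: round up */
		move++;
	return (min(move, transferable));
}

int
main(void)
{
	printf("%d\n", balance_move(7, 2, 4));	/* imbalance 5 -> move 3 */
	printf("%d\n", balance_move(7, 2, 2));	/* but only 2 are transferable */
	return (0);
}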
+ */ + if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) { + TDQ_UNLOCK(tdq); + LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) { + if (steal == tdq || steal->tdq_transferable == 0) + continue; + TDQ_LOCK(steal); + ts = tdq_steal(steal, 0); + if (ts) + goto steal; + TDQ_UNLOCK(steal); + } + TDQ_LOCK(tdq); + } + if (steal_busy && tdq_busy) { + TDQ_UNLOCK(tdq); + while (tdq_busy) { + int cpu; + + cpu = ffs(tdq_busy); + if (cpu == 0) + break; + cpu--; + steal = TDQ_CPU(cpu); + TDQ_LOCK(steal); + if (steal->tdq_transferable == 0) { + TDQ_UNLOCK(steal); + continue; + } + ts = tdq_steal(steal, 1); + if (ts == NULL) { + TDQ_UNLOCK(steal); + continue; + } + CTR5(KTR_ULE, + "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X", + ts->ts_thread, ts->ts_thread->td_proc->p_comm, + ts->ts_thread->td_priority, cpu, tdq_busy); + goto steal; + } + TDQ_LOCK(tdq); + } + spinlock_exit(); + /* + * We only set the idled bit when all of the cpus in the group are + * idle. Otherwise we could get into a situation where a thread bounces + * back and forth between two idle cores on seperate physical CPUs. + */ + tdg->tdg_idlemask |= PCPU_GET(cpumask); + if (tdg->tdg_idlemask == tdg->tdg_cpumask) + atomic_set_int(&tdq_idle, tdg->tdg_mask); + return (1); +steal: + sched_rem(ts->ts_thread); + thread_lock_block(ts->ts_thread); + TDQ_LOCK(tdq); + thread_lock_unblock(ts->ts_thread, TDQ_LOCKPTR(tdq)); + spinlock_exit(); + ts->ts_cpu = PCPU_GET(cpuid); + sched_pin_td(ts->ts_thread); + sched_add(ts->ts_thread, SRQ_YIELDING); + sched_unpin_td(ts->ts_thread); + + return (0); +} + +static void +tdq_notify(struct td_sched *ts) +{ + struct thread *ctd; + struct pcpu *pcpu; + int cpri; + int pri; + int cpu; + + cpu = ts->ts_cpu; + pri = ts->ts_thread->td_priority; + pcpu = pcpu_find(cpu); + ctd = pcpu->pc_curthread; + cpri = ctd->td_priority; + + /* + * If our priority is not better than the current priority there is + * nothing to do. + */ + if (pri > cpri) + return; + /* + * Always IPI idle. + */ + if (cpri > PRI_MIN_IDLE) + goto sendipi; + /* + * If we're realtime or better and there is timeshare or worse running + * send an IPI. + */ + if (pri < PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME) + goto sendipi; + /* + * Otherwise only IPI if we exceed the threshold. + */ + if (pri > ipi_thresh) + return; +sendipi: + ctd->td_flags |= TDF_NEEDRESCHED; + if (cpri < PRI_MIN_IDLE) { + if (ipi_ast) + ipi_selected(1 << cpu, IPI_AST); + else if (ipi_preempt) + ipi_selected(1 << cpu, IPI_PREEMPT); + } else + ipi_selected(1 << cpu, IPI_PREEMPT); +} + +static struct td_sched * +runq_steal(struct runq *rq) +{ + struct rqhead *rqh; + struct rqbits *rqb; + struct td_sched *ts; + int word; + int bit; + + rqb = &rq->rq_status; + for (word = 0; word < RQB_LEN; word++) { + if (rqb->rqb_bits[word] == 0) + continue; + for (bit = 0; bit < RQB_BPW; bit++) { + if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) + continue; + rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; + TAILQ_FOREACH(ts, rqh, ts_procq) { + if (THREAD_CAN_MIGRATE(ts->ts_thread)) + return (ts); + } + } + } + return (NULL); +} + +static struct td_sched * +tdq_steal(struct tdq *tdq, int stealidle) +{ + struct td_sched *ts; + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + /* + * Steal from next first to try to get a non-interactive task that + * may not have run for a while. + * XXX Need to effect steal order for timeshare threads. 
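/*
 * A sketch of the status-bitmap scan that runq_steal() above performs: each
 * set bit marks a non-empty run queue, and walking words then bits visits
 * candidates from the lowest (best) priority index upward.  The RQB_* sizes
 * here are illustrative stand-ins for <sys/runq.h>.
 */
#include <stdio.h>

#define RQB_LEN		2	/* illustrative: 2 * 64 = 128 queues */
#define RQB_BPW		64
#define RQB_L2BPW	6

static int
first_nonempty(const unsigned long long *bits)
{
	int word, bit;

	for (word = 0; word < RQB_LEN; word++) {
		if (bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++)
			if (bits[word] & (1ULL << bit))
				return (bit + (word << RQB_L2BPW));
	}
	return (-1);		/* nothing queued */
}

int
main(void)
{
	unsigned long long bits[RQB_LEN] = { 0, 0 };

	bits[0] |= 1ULL << 35;	/* a thread queued at priority index 35 */
	bits[1] |= 1ULL << 3;	/* and another at index 67 */
	printf("best queue index: %d\n", first_nonempty(bits));
	return (0);
}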
+ */ + if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL) + return (ts); + if ((ts = runq_steal(&tdq->tdq_timeshare)) != NULL) + return (ts); + if (stealidle) + return (runq_steal(&tdq->tdq_idle)); + return (NULL); +} + +static struct tdq * +tdq_pickidle(struct td_sched *ts) +{ + struct tdq_group *tdg; + struct tdq *tdq; + int self; + int cpu; + + self = PCPU_GET(cpuid); + tdq = TDQ_CPU(self); + if (smp_started == 0) + goto self; + /* + * If we're bound to a particular cpu, schedule here. + */ + if (!THREAD_CAN_MIGRATE(ts->ts_thread)) { + CTR1(KTR_ULE, "bound to %d", ts->ts_cpu); + tdq = TDQ_CPU(ts->ts_cpu); + TDQ_LOCK(tdq); + return (tdq); + } + /* + * If the current CPU has idled, just run it here. + */ + if ((tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) { + CTR1(KTR_ULE, "self idle %X", tdq->tdq_group->tdg_idlemask); + goto self; + } + /* + * Try the last group we ran on. + */ + tdg = TDQ_CPU(ts->ts_cpu)->tdq_group; + CTR1(KTR_ULE, "tdg_idlemask %X", tdg->tdg_idlemask); + cpu = ffs(tdg->tdg_idlemask); + if (cpu) + goto pick; + /* + * Search for an idle group. + */ + CTR1(KTR_ULE, "tdq_idle %X", tdq_idle); + cpu = ffs(tdq_idle); + if (cpu) + goto pick; + /* + * XXX If there are no idle groups, check for an idle core. + */ + /* + * No idle CPUs? + */ + CTR1(KTR_ULE, "none idle %X", tdq_idle); +self: + TDQ_LOCK(tdq); + ts->ts_cpu = self; + return (tdq); + +pick: + cpu--; + tdq = TDQ_CPU(cpu); + TDQ_LOCK(tdq); + ts->ts_cpu = cpu; + + return (tdq); +} + +static struct tdq * +tdq_pickpri(struct td_sched *ts, int flags) +{ + struct pcpu *pcpu; + struct tdq *tdq; + int lowpri; + int lowcpu; + int lowload; + int load; + int self; + int pri; + int cpu; + + self = PCPU_GET(cpuid); + if (smp_started == 0) + goto self; + /* + * If we're bound to a particular cpu, schedule here. + */ + if (!THREAD_CAN_MIGRATE(ts->ts_thread)) { + tdq = TDQ_CPU(ts->ts_cpu); + TDQ_LOCK(tdq); + return (tdq); + } + pri = ts->ts_thread->td_priority; + /* + * Regardless of affinity, if the last cpu is idle send it there. + */ + tdq = TDQ_CPU(ts->ts_cpu); + TDQ_LOCK(tdq); + pcpu = pcpu_find(ts->ts_cpu); + if (pcpu->pc_curthread->td_priority > PRI_MIN_IDLE) { + CTR5(KTR_ULE, + "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d", + ts->ts_cpu, ts->ts_rltick, ticks, pri, + pcpu->pc_curthread->td_priority); + return (tdq); + } + /* + * If we have affinity, try to place it on the cpu we last ran on. + */ + if (SCHED_AFFINITY(ts) && pcpu->pc_curthread->td_priority > pri) { + CTR5(KTR_ULE, + "affinity for %d, ltick %d ticks %d pri %d curthread %d", + ts->ts_cpu, ts->ts_rltick, ticks, pri, + pcpu->pc_curthread->td_priority); + return (tdq); + } + TDQ_UNLOCK(tdq); + /* + * Try ourself first; If we're running something lower priority this + * may have some locality with the waking thread and execute faster + * here. + */ + if (tryself) { + /* + * If we're being awoken by an interrupt thread or the waker + * is going right to sleep run here as well. + */ + if ((TDQ_SELF()->tdq_load == 1) && (flags & SRQ_YIELDING || + curthread->td_pri_class == PRI_ITHD)) { + CTR2(KTR_ULE, "tryself load %d flags %d", + TDQ_SELF()->tdq_load, flags); + goto self; + } + } + /* + * Look for an idle group. 
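/*
 * The affinity test used by tdq_pickpri() above, rendered standalone: a
 * thread is still considered "warm" on the CPU it last ran on if its last
 * run tick falls within the affinity window (SCHED_AFFINITY_DEFAULT is
 * hz / 100, about 10ms of ticks at hz = 1000).  The tick values are sample
 * numbers, not the kernel's counters.
 */
#include <stdio.h>

int
main(void)
{
	int hz = 1000;
	int affinity = hz / 100;	/* SCHED_AFFINITY_DEFAULT */
	int ticks = 50000;		/* pretend current tick count */
	int ts_rltick = 49995;		/* last ran 5 ticks ago */

	printf("affinity holds: %s\n",
	    ts_rltick > ticks - affinity ? "yes" : "no");	/* SCHED_AFFINITY() */
	ts_rltick = 49950;		/* last ran 50 ticks ago */
	printf("affinity holds: %s\n",
	    ts_rltick > ticks - affinity ? "yes" : "no");
	return (0);
}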
+ */ + CTR1(KTR_ULE, "tdq_idle %X", tdq_idle); + cpu = ffs(tdq_idle); + if (cpu) { + cpu--; + tdq = TDQ_CPU(cpu); + TDQ_LOCK(tdq); + ts->ts_cpu = cpu; + return (tdq); + } + if (tryselfidle && pri < curthread->td_priority) { + CTR1(KTR_ULE, "tryself %d", + curthread->td_priority); + goto self; + } + /* + * Now search for the cpu running the lowest priority thread with + * the least load. + */ + lowload = 0; + lowpri = lowcpu = 0; + for (cpu = 0; cpu <= mp_maxid; cpu++) { + if (CPU_ABSENT(cpu)) + continue; + tdq = TDQ_CPU(cpu); + pcpu = pcpu_find(cpu); + pri = pcpu->pc_curthread->td_priority; + CTR4(KTR_ULE, + "cpu %d pri %d lowcpu %d lowpri %d", + cpu, pri, lowcpu, lowpri); + if (pri < lowpri) + continue; + load = TDQ_CPU(cpu)->tdq_load; + if (lowpri && lowpri == pri && load > lowload) + continue; + lowpri = pri; + lowcpu = cpu; + lowload = load; + } + tdq = TDQ_CPU(lowcpu); + TDQ_LOCK(tdq); + ts->ts_cpu = lowcpu; + return (tdq); +self: + tdq = TDQ_CPU(self); + TDQ_LOCK(tdq); + ts->ts_cpu = self; + return (tdq); + +} + +#endif /* SMP */ + +/* + * Pick the highest priority task we have and return it. + */ + +static struct td_sched * +tdq_choose(struct tdq *tdq) +{ + struct td_sched *ts; + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + ts = runq_choose(&tdq->tdq_realtime); + if (ts != NULL) { + KASSERT(ts->ts_thread->td_priority <= PRI_MAX_REALTIME, + ("tdq_choose: Invalid priority on realtime queue %d", + ts->ts_thread->td_priority)); + return (ts); + } + ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); + if (ts != NULL) { + KASSERT(ts->ts_thread->td_priority <= PRI_MAX_TIMESHARE && + ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE, + ("tdq_choose: Invalid priority on timeshare queue %d", + ts->ts_thread->td_priority)); + return (ts); + } + + ts = runq_choose(&tdq->tdq_idle); + if (ts != NULL) { + KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE, + ("tdq_choose: Invalid priority on idle queue %d", + ts->ts_thread->td_priority)); + return (ts); + } + + return (NULL); +} + +static void +tdq_setup(struct tdq *tdq) +{ + + snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), + "sched lock %d", (int)TDQ_ID(tdq)); + mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", + MTX_SPIN | MTX_RECURSE); + runq_init(&tdq->tdq_realtime); + runq_init(&tdq->tdq_timeshare); + runq_init(&tdq->tdq_idle); + tdq->tdq_load = 0; +} + +static void +sched_setup(void *dummy) +{ + struct tdq *tdq; +#ifdef SMP + int i; +#endif + + /* + * To avoid divide-by-zero, we set realstathz a dummy value + * in case which sched_clock() called before sched_initticks(). + */ + realstathz = hz; + sched_slice = (realstathz/10); /* ~100ms */ + tickincr = 1 << SCHED_TICK_SHIFT; + +#ifdef SMP + balance_groups = 0; + /* + * Initialize the tdqs. + */ + for (i = 0; i < MAXCPU; i++) { + tdq = &tdq_cpu[i]; + tdq_setup(&tdq_cpu[i]); + } + if (smp_topology == NULL) { + struct tdq_group *tdg; + int cpus; + + for (cpus = 0, i = 0; i < MAXCPU; i++) { + if (CPU_ABSENT(i)) + continue; + tdq = &tdq_cpu[i]; + tdg = &tdq_groups[cpus]; + /* + * Setup a tdq group with one member. 
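/*
 * A standalone rendering of the fallback search at the end of tdq_pickpri()
 * above: pick the CPU whose current thread has the weakest priority
 * (numerically largest), breaking ties by the lighter run-queue load.  The
 * flat arrays stand in for walking pcpu_find()/tdq_cpu[]; CPU_ABSENT
 * handling is omitted.
 */
#include <stdio.h>

static int
pick_cpu(const int *curpri, const int *load, int ncpu)
{
	int lowpri = 0, lowload = 0, lowcpu = 0, cpu;

	for (cpu = 0; cpu < ncpu; cpu++) {
		if (curpri[cpu] < lowpri)
			continue;	/* a more important thread runs here */
		if (lowpri && lowpri == curpri[cpu] && load[cpu] > lowload)
			continue;	/* same priority but busier: skip */
		lowpri = curpri[cpu];
		lowcpu = cpu;
		lowload = load[cpu];
	}
	return (lowcpu);
}

int
main(void)
{
	int curpri[4] = { 120, 200, 200, 180 };	/* priority of each CPU's curthread */
	int load[4]   = { 3, 2, 1, 0 };		/* each CPU's tdq_load */

	/* CPUs 1 and 2 run equally unimportant threads; CPU 2 is less loaded. */
	printf("picked cpu %d\n", pick_cpu(curpri, load, 4));
	return (0);
}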
+ */ + tdq->tdq_transferable = 0; + tdq->tdq_group = tdg; + tdg->tdg_cpus = 1; + tdg->tdg_idlemask = 0; + tdg->tdg_cpumask = tdg->tdg_mask = 1 << i; + tdg->tdg_load = 0; + tdg->tdg_transferable = 0; + LIST_INIT(&tdg->tdg_members); + LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings); + cpus++; + } + tdg_maxid = cpus - 1; + } else { + struct tdq_group *tdg; + struct cpu_group *cg; + int j; + + topology = 1; + for (i = 0; i < smp_topology->ct_count; i++) { + cg = &smp_topology->ct_group[i]; + tdg = &tdq_groups[i]; + /* + * Initialize the group. + */ + tdg->tdg_idlemask = 0; + tdg->tdg_load = 0; + tdg->tdg_transferable = 0; + tdg->tdg_cpus = cg->cg_count; + tdg->tdg_cpumask = cg->cg_mask; + LIST_INIT(&tdg->tdg_members); + /* + * Find all of the group members and add them. + */ + for (j = 0; j < MAXCPU; j++) { + if ((cg->cg_mask & (1 << j)) != 0) { + if (tdg->tdg_mask == 0) + tdg->tdg_mask = 1 << j; + tdq_cpu[j].tdq_transferable = 0; + tdq_cpu[j].tdq_group = tdg; + LIST_INSERT_HEAD(&tdg->tdg_members, + &tdq_cpu[j], tdq_siblings); + } + } + if (tdg->tdg_cpus > 1) + balance_groups = 1; + } + tdg_maxid = smp_topology->ct_count - 1; + } + /* + * Stagger the group and global load balancer so they do not + * interfere with each other. + */ + bal_tick = ticks + hz; + if (balance_groups) + gbal_tick = ticks + (hz / 2); +#else + tdq_setup(TDQ_SELF()); +#endif + tdq = TDQ_SELF(); + TDQ_LOCK(tdq); + tdq_load_add(tdq, &td_sched0); + TDQ_UNLOCK(tdq); +} + +/* ARGSUSED */ +static void +sched_initticks(void *dummy) +{ + int incr; + + realstathz = stathz ? stathz : hz; + sched_slice = (realstathz/10); /* ~100ms */ + + /* + * tickincr is shifted out by 10 to avoid rounding errors due to + * hz not being evenly divisible by stathz on all platforms. + */ + incr = (hz << SCHED_TICK_SHIFT) / realstathz; + /* + * This does not work for values of stathz that are more than + * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. + */ + if (incr == 0) + incr = 1; + tickincr = incr; +#ifdef SMP + affinity = SCHED_AFFINITY_DEFAULT; +#endif +} + + +/* + * Scale the scheduling priority according to the "interactivity" of this + * process. + */ +static void +sched_priority(struct thread *td) +{ + int score; + int pri; + + if (td->td_pri_class != PRI_TIMESHARE) + return; + /* + * If the score is interactive we place the thread in the realtime + * queue with a priority that is less than kernel and interrupt + * priorities. These threads are not subject to nice restrictions. + * + * Scores greater than this are placed on the normal realtime queue + * where the priority is partially decided by the most recent cpu + * utilization and the rest is decided by nice value. 
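The fixed-point setup in sched_initticks() above is easy to check stand-alone. The numbers below (hz = 1000, stathz = 128, a shift of 10) are assumptions for the example rather than values fixed by the patch:

	#include <stdio.h>

	#define	EX_TICK_SHIFT	10	/* assumed stand-in for SCHED_TICK_SHIFT */

	int
	main(void)
	{
		int hz = 1000, realstathz = 128, incr;

		incr = (hz << EX_TICK_SHIFT) / realstathz;	/* 8000 */
		if (incr == 0)					/* guard, as in the patch */
			incr = 1;
		printf("tickincr = %d (= %d.%03d hz ticks per stat tick)\n", incr,
		    incr >> EX_TICK_SHIFT,
		    ((incr & ((1 << EX_TICK_SHIFT) - 1)) * 1000) >> EX_TICK_SHIFT);
		return (0);
	}

So each stathz tick charges 8000/1024, roughly 7.8 hz ticks of run time, matching 1000/128.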
+ */ + score = sched_interact_score(td); + if (score < sched_interact) { + pri = PRI_MIN_REALTIME; + pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact) + * score; + KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME, + ("sched_priority: invalid interactive priority %d score %d", + pri, score)); + } else { + pri = SCHED_PRI_MIN; + if (td->td_sched->ts_ticks) + pri += SCHED_PRI_TICKS(td->td_sched); + pri += SCHED_PRI_NICE(td->td_proc->p_nice); + if (!(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE)) { + static int once = 1; + if (once) { + printf("sched_priority: invalid priority %d", + pri); + printf("nice %d, ticks %d ftick %d ltick %d tick pri %d\n", + td->td_proc->p_nice, + td->td_sched->ts_ticks, + td->td_sched->ts_ftick, + td->td_sched->ts_ltick, + SCHED_PRI_TICKS(td->td_sched)); + once = 0; + } + pri = min(max(pri, PRI_MIN_TIMESHARE), + PRI_MAX_TIMESHARE); + } + } + sched_user_prio(td, pri); + + return; +} + +/* + * This routine enforces a maximum limit on the amount of scheduling history + * kept. It is called after either the slptime or runtime is adjusted. + */ +static void +sched_interact_update(struct thread *td) +{ + struct td_sched *ts; + u_int sum; + + ts = td->td_sched; + sum = ts->skg_runtime + ts->skg_slptime; + if (sum < SCHED_SLP_RUN_MAX) + return; + /* + * This only happens from two places: + * 1) We have added an unusual amount of run time from fork_exit. + * 2) We have added an unusual amount of sleep time from sched_sleep(). + */ + if (sum > SCHED_SLP_RUN_MAX * 2) { + if (ts->skg_runtime > ts->skg_slptime) { + ts->skg_runtime = SCHED_SLP_RUN_MAX; + ts->skg_slptime = 1; + } else { + ts->skg_slptime = SCHED_SLP_RUN_MAX; + ts->skg_runtime = 1; + } + return; + } + /* + * If we have exceeded by more than 1/5th then the algorithm below + * will not bring us back into range. Dividing by two here forces + * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] + */ + if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { + ts->skg_runtime /= 2; + ts->skg_slptime /= 2; + return; + } + ts->skg_runtime = (ts->skg_runtime / 5) * 4; + ts->skg_slptime = (ts->skg_slptime / 5) * 4; +} + +static void +sched_interact_fork(struct thread *td) +{ + int ratio; + int sum; + + sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime; + if (sum > SCHED_SLP_RUN_FORK) { + ratio = sum / SCHED_SLP_RUN_FORK; + td->td_sched->skg_runtime /= ratio; + td->td_sched->skg_slptime /= ratio; + } +} + +static int +sched_interact_score(struct thread *td) +{ + int div; + + if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) { + div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF); + return (SCHED_INTERACT_HALF + + (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div))); + } + if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) { + div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF); + return (td->td_sched->skg_runtime / div); + } + /* runtime == slptime */ + if (td->td_sched->skg_runtime) + return (SCHED_INTERACT_HALF); + + /* + * This can happen if slptime and runtime are 0. + */ + return (0); + +} + +/* + * Called from proc0_init() to bootstrap the scheduler. + */ +void +schedinit(void) +{ + + /* + * Set up the scheduler specific parts of proc0. 
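sched_interact_score() above is small enough to lift into user space for experimentation. SCHED_INTERACT_HALF is assumed to be 50 here (half of a 0..100 scale); the real constant comes from the scheduler headers:

	#define	EX_INTERACT_HALF	50	/* assumed stand-in */

	static int
	interact_score(unsigned int runtime, unsigned int slptime)
	{
		unsigned int div;

		if (runtime > slptime) {
			div = runtime / EX_INTERACT_HALF;
			if (div < 1)
				div = 1;
			return (EX_INTERACT_HALF +
			    (EX_INTERACT_HALF - slptime / div));
		}
		if (slptime > runtime) {
			div = slptime / EX_INTERACT_HALF;
			if (div < 1)
				div = 1;
			return (runtime / div);
		}
		return (runtime ? EX_INTERACT_HALF : 0);
	}

With these numbers a thread that slept 10000 units and ran 100 scores 0 (fully interactive), while the inverse scores 100 (pure CPU hog).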
+ */ + proc0.p_sched = NULL; /* XXX */ + thread0.td_sched = &td_sched0; + thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); + td_sched0.ts_ltick = ticks; + td_sched0.ts_ftick = ticks; + td_sched0.ts_thread = &thread0; +} + +/* + * This is only somewhat accurate since given many processes of the same + * priority they will switch when their slices run out, which will be + * at most sched_slice stathz ticks. + */ +int +sched_rr_interval(void) +{ + + /* Convert sched_slice to hz */ + return (hz/(realstathz/sched_slice)); +} + +static void +sched_pctcpu_update(struct td_sched *ts) +{ + + if (ts->ts_ticks == 0) + return; + if (ticks - (hz / 10) < ts->ts_ltick && + SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX) + return; + /* + * Adjust counters and watermark for pctcpu calc. + */ + if (ts->ts_ltick > ticks - SCHED_TICK_TARG) + ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) * + SCHED_TICK_TARG; + else + ts->ts_ticks = 0; + ts->ts_ltick = ticks; + ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG; +} + +static void +sched_thread_priority(struct thread *td, u_char prio) +{ + struct td_sched *ts; + + CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)", + td, td->td_proc->p_comm, td->td_priority, prio, curthread, + curthread->td_proc->p_comm); + ts = td->td_sched; + THREAD_LOCK_ASSERT(td, MA_OWNED); + if (td->td_priority == prio) + return; + + if (TD_ON_RUNQ(td) && prio < td->td_priority) { + /* + * If the priority has been elevated due to priority + * propagation, we may have to move ourselves to a new + * queue. This could be optimized to not re-add in some + * cases. + */ + sched_rem(td); + td->td_priority = prio; + sched_add(td, SRQ_BORROWING|SRQ_OURSELF); + } else + td->td_priority = prio; +} + +/* + * Update a thread's priority when it is lent another thread's + * priority. + */ +void +sched_lend_prio(struct thread *td, u_char prio) +{ + + td->td_flags |= TDF_BORROWING; + sched_thread_priority(td, prio); +} + +/* + * Restore a thread's priority when priority propagation is + * over. The prio argument is the minimum priority the thread + * needs to have to satisfy other possible priority lending + * requests. If the thread's regular priority is less + * important than prio, the thread will keep a priority boost + * of prio. + */ +void +sched_unlend_prio(struct thread *td, u_char prio) +{ + u_char base_pri; + + if (td->td_base_pri >= PRI_MIN_TIMESHARE && + td->td_base_pri <= PRI_MAX_TIMESHARE) + base_pri = td->td_user_pri; + else + base_pri = td->td_base_pri; + if (prio >= base_pri) { + td->td_flags &= ~TDF_BORROWING; + sched_thread_priority(td, base_pri); + } else + sched_lend_prio(td, prio); +} + +void +sched_prio(struct thread *td, u_char prio) +{ + u_char oldprio; + + /* First, update the base priority. */ + td->td_base_pri = prio; + + /* + * If the thread is borrowing another thread's priority, don't + * ever lower the priority. + */ + if (td->td_flags & TDF_BORROWING && td->td_priority < prio) + return; + + /* Change the real priority. */ + oldprio = td->td_priority; + sched_thread_priority(td, prio); + + /* + * If the thread is on a turnstile, then let the turnstile update + * its state. 
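The round-robin interval above is plain integer arithmetic; with the assumed values hz = 1000 and stathz = 128 (so sched_slice = 128 / 10 = 12) it evaluates to 1000 / (128 / 12) = 100 ticks, i.e. roughly the ~100ms slice noted in sched_setup(). As a one-liner for reference, values illustrative:

	static int
	example_rr_interval(int hz, int realstathz, int sched_slice)
	{
		/* e.g. 1000 / (128 / 12) == 100 */
		return (hz / (realstathz / sched_slice));
	}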
+ */ + if (TD_ON_LOCK(td) && oldprio != prio) + turnstile_adjust(td, oldprio); +} + +void +sched_user_prio(struct thread *td, u_char prio) +{ + u_char oldprio; + + td->td_base_user_pri = prio; + if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio) + return; + oldprio = td->td_user_pri; + td->td_user_pri = prio; + + if (TD_ON_UPILOCK(td) && oldprio != prio) + umtx_pi_adjust(td, oldprio); +} + +void +sched_lend_user_prio(struct thread *td, u_char prio) +{ + u_char oldprio; + + td->td_flags |= TDF_UBORROWING; + + oldprio = td->td_user_pri; + td->td_user_pri = prio; + + if (TD_ON_UPILOCK(td) && oldprio != prio) + umtx_pi_adjust(td, oldprio); +} + +void +sched_unlend_user_prio(struct thread *td, u_char prio) +{ + u_char base_pri; + + base_pri = td->td_base_user_pri; + if (prio >= base_pri) { + td->td_flags &= ~TDF_UBORROWING; + sched_user_prio(td, base_pri); + } else + sched_lend_user_prio(td, prio); +} + +static inline struct mtx *thread_block_switch(struct thread *td); + +static inline struct mtx * +thread_block_switch(struct thread *td) +{ + struct mtx *lock; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + lock = __DEVOLATILE(struct mtx *, td->td_lock); + td->td_lock = &blocked_lock; + mtx_unlock_spin(lock); + + return (lock); +} + +void +sched_switch(struct thread *td, struct thread *newtd, int flags) +{ + struct tdq *tdq; + struct td_sched *ts; + struct mtx *mtx; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + + tdq = TDQ_SELF(); + mtx = TDQ_LOCKPTR(tdq); + ts = td->td_sched; +#ifdef SMP + ts->ts_rltick = ticks; +#endif + td->td_lastcpu = td->td_oncpu; + td->td_oncpu = NOCPU; + td->td_flags &= ~TDF_NEEDRESCHED; + td->td_owepreempt = 0; + if (TD_IS_IDLETHREAD(td)) { + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + TD_SET_CAN_RUN(td); + goto choose; + } + if (TD_IS_RUNNING(td)) { + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + tdq_load_rem(tdq, ts); + sched_add(td, (flags & SW_PREEMPT) ? + SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : + SRQ_OURSELF|SRQ_YIELDING); + if (td->td_lock != TDQ_LOCKPTR(tdq)) { + spinlock_enter(); + mtx = thread_block_switch(td); + TDQ_LOCK(tdq); + spinlock_exit(); + } + } else { + if (td->td_lock != TDQ_LOCKPTR(tdq)) { + TDQ_LOCK(tdq); + mtx = thread_block_switch(td); + } + /* Remove the load for inhibited tds */ + tdq_load_rem(tdq, ts); + } +choose: + TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); + if (newtd != NULL) { + /* + * If we bring in a thread account for it as if it had been + * added to the run queue and then chosen. 
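thread_block_switch() above is the heart of the per-thread lock scheme: a thread in transit points td_lock at a sentinel, and whoever picks the thread up on another CPU spins until the pointer moves off that sentinel before touching it. The sketch below restates the idea with placeholders (lock_t, spin_unlock(), cpu_spin_wait() are stand-ins, not kernel interfaces):

	typedef struct lock lock_t;

	extern lock_t blocked_lock;		/* sentinel; never actually acquired */
	extern void spin_unlock(lock_t *);
	extern void cpu_spin_wait(void);

	struct xthread {
		lock_t * volatile td_lock;	/* whichever lock covers this thread */
	};

	/* Blocking side: park the thread on the sentinel, release the old lock. */
	static lock_t *
	example_block_switch(struct xthread *td)
	{
		lock_t *old;

		old = td->td_lock;
		td->td_lock = &blocked_lock;
		spin_unlock(old);
		return (old);
	}

	/* Pickup side: wait until the blocking CPU is finished with the thread. */
	static void
	example_wait_unblocked(struct xthread *td)
	{

		while (td->td_lock == &blocked_lock)
			cpu_spin_wait();
	}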
+ */ + MPASS(newtd->td_lock == TDQ_LOCKPTR(tdq)); + TD_SET_RUNNING(newtd); + tdq_load_add(TDQ_SELF(), newtd->td_sched); + } else + newtd = choosethread(); + if (td != newtd) { +#ifdef HWPMC_HOOKS + if (PMC_PROC_IS_USING_PMCS(td->td_proc)) + PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); +#endif + cpu_switch(td, newtd, mtx); + tdq = TDQ_SELF(); /* We may return on a different cpu */ + TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)td; +#ifdef HWPMC_HOOKS + if (PMC_PROC_IS_USING_PMCS(td->td_proc)) + PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); +#endif + } + TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + td->td_oncpu = PCPU_GET(cpuid); +} + +void +sched_nice(struct proc *p, int nice) +{ + struct thread *td; + + PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + + p->p_nice = nice; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + sched_priority(td); + sched_prio(td, td->td_base_user_pri); + thread_unlock(td); + } +} + +void +sched_sleep(struct thread *td) +{ + + THREAD_LOCK_ASSERT(td, MA_OWNED); + + td->td_sched->ts_slptime = ticks; +} + +void +sched_wakeup(struct thread *td) +{ + struct td_sched *ts; + int slptime; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + /* + * If we slept for more than a tick update our interactivity and + * priority. + */ + slptime = ts->ts_slptime; + ts->ts_slptime = 0; + if (slptime && slptime != ticks) { + u_int hzticks; + + hzticks = (ticks - slptime) << SCHED_TICK_SHIFT; + ts->skg_slptime += hzticks; + sched_interact_update(td); + sched_pctcpu_update(ts); + sched_priority(td); + } + /* Reset the slice value after we sleep. */ + ts->ts_slice = sched_slice; + sched_add(td, SRQ_BORING); +} + +/* + * Penalize the parent for creating a new child and initialize the child's + * priority. + */ +void +sched_fork(struct thread *td, struct thread *child) +{ + THREAD_LOCK_ASSERT(td, MA_OWNED); + sched_fork_thread(td, child); + /* + * Penalize the parent and child for forking. + */ + sched_interact_fork(child); + sched_priority(child); + td->td_sched->skg_runtime += tickincr; + sched_interact_update(td); + sched_priority(td); +} + +void +sched_fork_thread(struct thread *td, struct thread *child) +{ + struct td_sched *ts; + struct td_sched *ts2; + + /* + * Initialize child. + */ + THREAD_LOCK_ASSERT(td, MA_OWNED); + sched_newthread(child); + child->td_lock = TDQ_LOCKPTR(TDQ_SELF()); + ts = td->td_sched; + ts2 = child->td_sched; + ts2->ts_cpu = ts->ts_cpu; + ts2->ts_runq = NULL; + /* + * Grab our parents cpu estimation information and priority. + */ + ts2->ts_ticks = ts->ts_ticks; + ts2->ts_ltick = ts->ts_ltick; + ts2->ts_ftick = ts->ts_ftick; + child->td_user_pri = td->td_user_pri; + child->td_base_user_pri = td->td_base_user_pri; + /* + * And update interactivity score. + */ + ts2->skg_slptime = ts->skg_slptime; + ts2->skg_runtime = ts->skg_runtime; + ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */ +} + +void +sched_class(struct thread *td, int class) +{ + + THREAD_LOCK_ASSERT(td, MA_OWNED); + if (td->td_pri_class == class) + return; + +#ifdef SMP + /* + * On SMP if we're on the RUNQ we must adjust the transferable + * count because could be changing to or from an interrupt + * class. 
+ */ + if (TD_ON_RUNQ(td)) { + struct tdq *tdq; + + tdq = TDQ_CPU(td->td_sched->ts_cpu); + if (THREAD_CAN_MIGRATE(td)) { + tdq->tdq_transferable--; + tdq->tdq_group->tdg_transferable--; + } + td->td_pri_class = class; + if (THREAD_CAN_MIGRATE(td)) { + tdq->tdq_transferable++; + tdq->tdq_group->tdg_transferable++; + } + } +#endif + td->td_pri_class = class; +} + +/* + * Return some of the child's priority and interactivity to the parent. + */ +void +sched_exit(struct proc *p, struct thread *child) +{ + struct thread *td; + + CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", + child, child->td_proc->p_comm, child->td_priority); + + PROC_SLOCK_ASSERT(p, MA_OWNED); + td = FIRST_THREAD_IN_PROC(p); + sched_exit_thread(td, child); +} + +void +sched_exit_thread(struct thread *td, struct thread *child) +{ + struct tdq *tdq; + + CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", + child, child->td_proc->p_comm, child->td_priority); + + tdq = TDQ_CPU(child->td_sched->ts_cpu); + TDQ_LOCK(tdq); + tdq_load_rem(tdq, child->td_sched); + TDQ_UNLOCK(tdq); +#ifdef KSE + /* + * KSE forks and exits so often that this penalty causes short-lived + * threads to always be non-interactive. This causes mozilla to + * crawl under load. + */ + if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc) + return; +#endif + /* + * Give the child's runtime to the parent without returning the + * sleep time as a penalty to the parent. This causes shells that + * launch expensive things to mark their children as expensive. + */ + thread_lock(td); + td->td_sched->skg_runtime += child->td_sched->skg_runtime; + sched_interact_update(td); + sched_priority(td); + thread_unlock(td); +} + +void +sched_userret(struct thread *td) +{ + /* + * XXX we cheat slightly on the locking here to avoid locking in + * the usual case. Setting td_priority here is essentially an + * incomplete workaround for not setting it properly elsewhere. + * Now that some interrupt handlers are threads, not setting it + * properly elsewhere can clobber it in the window between setting + * it here and returning to user mode, so don't waste time setting + * it perfectly here. + */ + KASSERT((td->td_flags & TDF_BORROWING) == 0, + ("thread with borrowed priority returning to userland")); + if (td->td_priority != td->td_user_pri) { + thread_lock(td); + td->td_priority = td->td_user_pri; + td->td_base_pri = td->td_user_pri; + thread_unlock(td); + } +} + +void +sched_clock(struct thread *td) +{ + struct tdq *tdq; + struct td_sched *ts; + + THREAD_LOCK_ASSERT(td, MA_OWNED); +#ifdef SMP + sched_smp_tick(td); +#endif + tdq = TDQ_SELF(); + /* + * Advance the insert index once for each tick to ensure that all + * threads get a chance to run. + */ + if (tdq->tdq_idx == tdq->tdq_ridx) { + tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; + if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) + tdq->tdq_ridx = tdq->tdq_idx; + } + ts = td->td_sched; + /* + * We only do slicing code for TIMESHARE threads. + */ + if (td->td_pri_class != PRI_TIMESHARE) + return; + /* + * We used a tick; charge it to the thread so that we can compute our + * interactivity. + */ + td->td_sched->skg_runtime += tickincr; + sched_interact_update(td); + /* + * We used up one time slice. + */ + if (--ts->ts_slice > 0) + return; + /* + * We're out of time, recompute priorities and requeue. 
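The index juggling in sched_clock() above implements a small calendar queue: the insert bucket rotates once per tick, and the remove bucket may only catch up once its own bucket has drained, which is what guarantees every queued timeshare thread eventually reaches the head. A toy model (EX_NQS and the count array stand in for RQ_NQS and the real queues):

	#define	EX_NQS	64	/* stand-in for RQ_NQS */

	struct calendar {
		int	idx;			/* insert bucket */
		int	ridx;			/* remove bucket */
		int	count[EX_NQS];		/* threads queued per bucket */
	};

	static void
	calendar_tick(struct calendar *c)
	{

		if (c->idx == c->ridx) {
			c->idx = (c->idx + 1) % EX_NQS;
			if (c->count[c->ridx] == 0)
				c->ridx = c->idx;
		}
	}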
+ */ + sched_priority(td); + td->td_flags |= TDF_NEEDRESCHED; +} + +int +sched_runnable(void) +{ + struct tdq *tdq; + int load; + + load = 1; + + tdq = TDQ_SELF(); +#ifdef SMP + if (tdq_busy) + goto out; +#endif + if ((curthread->td_flags & TDF_IDLETD) != 0) { + if (tdq->tdq_load > 0) + goto out; + } else + if (tdq->tdq_load - 1 > 0) + goto out; + load = 0; +out: + return (load); +} + +struct thread * +sched_choose(void) +{ + struct tdq *tdq; + struct td_sched *ts; + + tdq = TDQ_SELF(); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); +#ifdef SMP +restart: +#endif + ts = tdq_choose(tdq); + if (ts) { +#ifdef SMP + if (ts->ts_thread->td_priority > PRI_MIN_IDLE) + if (tdq_idled(tdq) == 0) + goto restart; +#endif + tdq_runq_rem(tdq, ts); + return (ts->ts_thread); + } +#ifdef SMP + if (tdq_idled(tdq) == 0) + goto restart; +#endif + return (PCPU_GET(idlethread)); +} + +static int +sched_preempt(struct thread *td) +{ + struct thread *ctd; + int cpri; + int pri; + + ctd = curthread; + pri = td->td_priority; + cpri = ctd->td_priority; + if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) + return (0); + /* + * Always preempt IDLE threads. Otherwise only if the preempting + * thread is an ithread. + */ + if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE) + return (0); + if (ctd->td_critnest > 1) { + CTR1(KTR_PROC, "sched_preempt: in critical section %d", + ctd->td_critnest); + ctd->td_owepreempt = 1; + return (0); + } + /* + * Thread is runnable but not yet put on system run queue. + */ + MPASS(TD_ON_RUNQ(td)); + TD_SET_RUNNING(td); + MPASS(ctd->td_lock == td->td_lock); + MPASS(td->td_lock == TDQ_LOCKPTR(TDQ_SELF())); + CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, + td->td_proc->p_pid, td->td_proc->p_comm); + /* + * We enter the switch with two runnable threads that both have + * the same lock. When we return td may be sleeping so we need + * to switch locks to make sure it's locked correctly. + */ + SCHED_STAT_INC(switch_preempt); + mi_switch(SW_INVOL|SW_PREEMPT, td); + spinlock_enter(); + thread_unlock(ctd); + thread_lock(td); + spinlock_exit(); + + return (1); +} + +void +sched_add(struct thread *td, int flags) +{ + struct td_sched *ts; + struct tdq *tdq; + int preemptive; + int class; +#ifdef SMP + int cpuid; + int cpumask; +#endif + ts = td->td_sched; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", + td, td->td_proc->p_comm, td->td_priority, curthread, + curthread->td_proc->p_comm); + KASSERT((td->td_inhibitors == 0), + ("sched_add: trying to run inhibited thread")); + KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), + ("sched_add: bad thread state")); + KASSERT(td->td_proc->p_sflag & PS_INMEM, + ("sched_add: process swapped out")); + + TD_SET_RUNQ(td); + class = PRI_BASE(td->td_pri_class); + preemptive = !(flags & SRQ_YIELDING); + /* + * Recalculate the priority before we select the target cpu or + * run-queue. + */ + if (class == PRI_TIMESHARE) + sched_priority(td); + if (ts->ts_slice == 0) + ts->ts_slice = sched_slice; +#ifdef SMP + cpuid = PCPU_GET(cpuid); + /* + * Pick the destination cpu and if it isn't ours transfer to the + * target cpu. 
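Leaving aside the panic/cold/inhibited guards, the test in sched_preempt() above boils down to: never preempt unless the incoming thread is strictly more important, and even then only when it is interrupt-class or the CPU is currently running idle-class work. Restated with the priority bounds passed in as parameters (illustration only; lower numbers are more important):

	static int
	example_should_preempt(int newpri, int curpri, int pri_max_ithd,
	    int pri_min_idle)
	{

		if (newpri >= curpri)
			return (0);	/* not strictly more important */
		if (newpri > pri_max_ithd && curpri < pri_min_idle)
			return (0);	/* neither an ithread nor an idle victim */
		return (1);
	}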
+ */ + thread_lock_block(td); + if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_MIGRATE(td)) { + ts->ts_cpu = cpuid; + tdq = TDQ_CPU(cpuid); + TDQ_LOCK(tdq); + } else if (pick_pri) + tdq = tdq_pickpri(ts, flags); + else + tdq = tdq_pickidle(ts); + thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); + if (ts->ts_cpu != cpuid) + preemptive = 0; + cpumask = 1 << ts->ts_cpu; + /* + * If we had been idle, clear our bit in the group and potentially + * the global bitmap. + */ + if ((class != PRI_IDLE && class != PRI_ITHD) && + (tdq->tdq_group->tdg_idlemask & cpumask) != 0) { + /* + * Check to see if our group is unidling, and if so, remove it + * from the global idle mask. + */ + if (tdq->tdq_group->tdg_idlemask == + tdq->tdq_group->tdg_cpumask) + atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask); + /* + * Now remove ourselves from the group specific idle mask. + */ + tdq->tdq_group->tdg_idlemask &= ~cpumask; + } +#else + tdq = TDQ_SELF(); + TDQ_LOCK(tdq); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. + */ + thread_lock_set(td, TDQ_LOCKPTR(tdq)); +#endif + /* + * Pick the run queue based on priority. + */ + if (td->td_priority <= PRI_MAX_REALTIME) + ts->ts_runq = &tdq->tdq_realtime; + else if (td->td_priority <= PRI_MAX_TIMESHARE) + ts->ts_runq = &tdq->tdq_timeshare; + else + ts->ts_runq = &tdq->tdq_idle; + if (preemptive && sched_preempt(td)) + return; + tdq_runq_add(tdq, ts, flags); + tdq_load_add(tdq, ts); +#ifdef SMP + if (ts->ts_cpu != cpuid) { + tdq_notify(ts); + return; + } +#endif + if (td->td_priority < curthread->td_priority) + curthread->td_flags |= TDF_NEEDRESCHED; +} + +void +sched_rem(struct thread *td) +{ + struct tdq *tdq; + struct td_sched *ts; + + CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", + td, td->td_proc->p_comm, td->td_priority, curthread, + curthread->td_proc->p_comm); + ts = td->td_sched; + tdq = TDQ_CPU(ts->ts_cpu); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + KASSERT(TD_ON_RUNQ(td), + ("sched_rem: thread not on run queue")); + tdq_runq_rem(tdq, ts); + tdq_load_rem(tdq, ts); + TD_SET_CAN_RUN(td); +} + +fixpt_t +sched_pctcpu(struct thread *td) +{ + fixpt_t pctcpu; + struct td_sched *ts; + + pctcpu = 0; + ts = td->td_sched; + if (ts == NULL) + return (0); + + thread_lock(td); + if (ts->ts_ticks) { + int rtick; + + sched_pctcpu_update(ts); + /* How many rtick per second ? */ + rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); + pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; + } + td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick; + thread_unlock(td); + + return (pctcpu); +} + +void +sched_bind(struct thread *td, int cpu) +{ + struct td_sched *ts; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + if (ts->ts_flags & TSF_BOUND) + sched_unbind(td); + ts->ts_flags |= TSF_BOUND; +#ifdef SMP + sched_pin(); + if (PCPU_GET(cpuid) == cpu) + return; + ts->ts_cpu = cpu; + /* When we return from mi_switch we'll be on the correct cpu. 
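The %CPU figure computed above is fixed point scaled by FSCALE. Assuming the stock FSHIFT of 11 (FSCALE = 2048) and hz = 1000, a thread with 500 run ticks of history per second lands at exactly half scale:

	#include <stdio.h>

	#define	EX_FSHIFT	11		/* assumed stand-in for FSHIFT */
	#define	EX_FSCALE	(1 << EX_FSHIFT)

	int
	main(void)
	{
		int hz = 1000, rtick = 500;
		unsigned int pctcpu;

		pctcpu = (EX_FSCALE * ((EX_FSCALE * rtick) / hz)) >> EX_FSHIFT;
		printf("pctcpu = %u/%d (%u%%)\n", pctcpu, EX_FSCALE,
		    pctcpu * 100 / EX_FSCALE);
		return (0);
	}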
*/ + mi_switch(SW_VOL, NULL); +#endif +} + +void +sched_unbind(struct thread *td) +{ + struct td_sched *ts; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + if ((ts->ts_flags & TSF_BOUND) == 0) + return; + ts->ts_flags &= ~TSF_BOUND; +#ifdef SMP + sched_unpin(); +#endif +} + +int +sched_is_bound(struct thread *td) +{ + THREAD_LOCK_ASSERT(td, MA_OWNED); + return (td->td_sched->ts_flags & TSF_BOUND); +} + +void +sched_relinquish(struct thread *td) +{ + thread_lock(td); + if (td->td_pri_class == PRI_TIMESHARE) + sched_prio(td, PRI_MAX_TIMESHARE); + SCHED_STAT_INC(switch_relinquish); + mi_switch(SW_VOL, NULL); + thread_unlock(td); +} + +int +sched_load(void) +{ +#ifdef SMP + int total; + int i; + + total = 0; + for (i = 0; i <= tdg_maxid; i++) + total += TDQ_GROUP(i)->tdg_load; + return (total); +#else + return (TDQ_SELF()->tdq_sysload); +#endif +} + +int +sched_sizeof_proc(void) +{ + return (sizeof(struct proc)); +} + +int +sched_sizeof_thread(void) +{ + return (sizeof(struct thread) + sizeof(struct td_sched)); +} + +void +sched_tick(void) +{ + struct td_sched *ts; + + ts = curthread->td_sched; + /* Adjust ticks for pctcpu */ + ts->ts_ticks += 1 << SCHED_TICK_SHIFT; + ts->ts_ltick = ticks; + /* + * Update if we've exceeded our desired tick threshhold by over one + * second. + */ + if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick) + sched_pctcpu_update(ts); +} + +/* + * The actual idle process. + */ +void +sched_idletd(void *dummy) +{ + struct proc *p; + struct thread *td; + + td = curthread; + p = td->td_proc; + mtx_assert(&Giant, MA_NOTOWNED); + /* ULE Relies on preemption for idle interruption. */ + for (;;) + cpu_idle(); +} + +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + struct tdq *tdq; + + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + tdq = TDQ_SELF(); + if (td == NULL) { + TDQ_LOCK(tdq); + spinlock_exit(); + } else { + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + } + TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct td_sched *ts; + struct thread *td; + struct tdq *tdq; + int cpuid; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + cpuid = PCPU_GET(cpuid); + tdq = TDQ_CPU(cpuid); + if (TD_IS_IDLETHREAD(ctd)) + ctd->td_lock = TDQ_LOCKPTR(tdq); + ts = ctd->td_sched; + MPASS(ctd->td_lock == TDQ_LOCKPTR(tdq)); + ctd->td_oncpu = cpuid; + TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. 
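Since sched_bind() and sched_unbind() now assert the caller's thread lock rather than sched_lock, callers are expected to bracket them with thread_lock()/thread_unlock() on curthread. A hedged usage sketch, assuming the usual kernel headers are in scope; example_measure_on_cpu() is a hypothetical caller, not something added by this patch:

	static void
	example_measure_on_cpu(int cpu)
	{

		thread_lock(curthread);
		sched_bind(curthread, cpu);	/* may context switch onto 'cpu' */
		thread_unlock(curthread);

		/* ... the per-CPU work runs here, pinned ... */

		thread_lock(curthread);
		sched_unbind(curthread);
		thread_unlock(curthread);
	}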
+ */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + +static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); +SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, + "Scheduler name"); +SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, ""); +#ifdef SMP +SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_affinity, CTLFLAG_RW, + &affinity, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryself, CTLFLAG_RW, + &tryself, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryselfidle, CTLFLAG_RW, + &tryselfidle, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, ipi_preempt, CTLFLAG_RW, &ipi_preempt, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, ipi_ast, CTLFLAG_RW, &ipi_ast, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0, ""); +#endif + +/* ps compat */ +static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ +SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); + + +#define KERN_SWITCH_INCLUDE 1 +#include "kern/kern_switch.c" Index: kern/sched_ule.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sched_ule.c,v retrieving revision 1.192 diff -u -p -r1.192 sched_ule.c --- kern/sched_ule.c 20 Apr 2007 05:45:46 -0000 1.192 +++ kern/sched_ule.c 31 May 2007 22:18:07 -0000 @@ -229,6 +229,7 @@ static int ipi_thresh = PRI_MIN_KERN; static int steal_htt = 1; static int steal_busy = 1; static int busy_thresh = 4; +static int topology = 0; /* * One thread queue per processor. 
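The kern.sched.* knobs exported just before the sched_ule.c hunks can be inspected from user space in the usual way; for example, the new read-only topology flag (error handling kept minimal):

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int topo;
		size_t len = sizeof(topo);

		if (sysctlbyname("kern.sched.topology", &topo, &len, NULL, 0) == -1) {
			perror("sysctlbyname");
			return (1);
		}
		printf("kern.sched.topology = %d\n", topo);
		return (0);
	}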
@@ -434,7 +435,7 @@ tdq_load_add(struct tdq *tdq, struct td_ mtx_assert(&sched_lock, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); tdq->tdq_load++; - CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); + CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load); if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) #ifdef SMP @@ -997,7 +998,7 @@ sched_setup(void *dummy) tdq = &tdq_cpu[i]; tdq_setup(&tdq_cpu[i]); } - if (1) { + if (smp_topology == NULL) { struct tdq_group *tdg; struct tdq *tdq; int cpus; @@ -1027,6 +1028,7 @@ sched_setup(void *dummy) struct cpu_group *cg; int j; + topology = 1; for (i = 0; i < smp_topology->ct_count; i++) { cg = &smp_topology->ct_group[i]; tdg = &tdq_groups[i]; @@ -1248,6 +1250,7 @@ schedinit(void) */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; + thread0.td_lock = &sched_lock; td_sched0.ts_ltick = ticks; td_sched0.ts_ftick = ticks; td_sched0.ts_thread = &thread0; @@ -1296,7 +1299,7 @@ sched_thread_priority(struct thread *td, td, td->td_proc->p_comm, td->td_priority, prio, curthread, curthread->td_proc->p_comm); ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; @@ -1307,9 +1310,10 @@ sched_thread_priority(struct thread *td, * queue. This could be optimized to not re-add in some * cases. */ + MPASS(td->td_lock == &sched_lock); sched_rem(td); td->td_priority = prio; - sched_add(td, SRQ_BORROWING); + sched_add(td, SRQ_BORROWING|SRQ_OURSELF); } else td->td_priority = prio; } @@ -1427,7 +1431,7 @@ sched_switch(struct thread *td, struct t struct td_sched *ts; int preempt; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); preempt = flags & SW_PREEMPT; tdq = TDQ_SELF(); @@ -1440,24 +1444,33 @@ sched_switch(struct thread *td, struct t * If the thread has been assigned it may be in the process of switching * to the new cpu. This is the case in sched_bind(). */ + /* + * Switch to the sched lock to fix things up and pick + * a new thread. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_unlock(td); + } if (TD_IS_IDLETHREAD(td)) { + MPASS(td->td_lock == &sched_lock); TD_SET_CAN_RUN(td); - } else { + } else if (TD_IS_RUNNING(td)) { + /* + * Don't allow the thread to migrate + * from a preemption. + */ tdq_load_rem(tdq, ts); - if (TD_IS_RUNNING(td)) { - /* - * Don't allow the thread to migrate - * from a preemption. - */ - if (preempt) - sched_pin_td(td); - sched_add(td, preempt ? - SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : - SRQ_OURSELF|SRQ_YIELDING); - if (preempt) - sched_unpin_td(td); - } - } + if (preempt) + sched_pin_td(td); + sched_add(td, preempt ? 
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : + SRQ_OURSELF|SRQ_YIELDING); + if (preempt) + sched_unpin_td(td); + } else + tdq_load_rem(tdq, ts); + mtx_assert(&sched_lock, MA_OWNED); if (newtd != NULL) { /* * If we bring in a thread account for it as if it had been @@ -1473,7 +1486,7 @@ sched_switch(struct thread *td, struct t PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif - cpu_switch(td, newtd); + cpu_switch(td, newtd, td->td_lock); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); @@ -1481,6 +1494,7 @@ sched_switch(struct thread *td, struct t } sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); + MPASS(td->td_lock == &sched_lock); } void @@ -1489,12 +1503,14 @@ sched_nice(struct proc *p, int nice) struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); + thread_unlock(td); } } @@ -1502,7 +1518,7 @@ void sched_sleep(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_sched->ts_slptime = ticks; } @@ -1513,7 +1529,7 @@ sched_wakeup(struct thread *td) struct td_sched *ts; int slptime; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; /* * If we slept for more than a tick update our interactivity and @@ -1542,7 +1558,7 @@ sched_wakeup(struct thread *td) void sched_fork(struct thread *td, struct thread *child) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. @@ -1563,7 +1579,9 @@ sched_fork_thread(struct thread *td, str /* * Initialize child. */ + THREAD_LOCK_ASSERT(td, MA_OWNED); sched_newthread(child); + child->td_lock = &sched_lock; ts = td->td_sched; ts2 = child->td_sched; ts2->ts_cpu = ts->ts_cpu; @@ -1588,7 +1606,7 @@ void sched_class(struct thread *td, int class) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; @@ -1627,6 +1645,7 @@ sched_exit(struct proc *p, struct thread CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", child, child->td_proc->p_comm, child->td_priority); + PROC_SLOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } @@ -1638,7 +1657,9 @@ sched_exit_thread(struct thread *td, str CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", child, child->td_proc->p_comm, child->td_priority); + thread_lock(child); tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched); + thread_unlock(child); #ifdef KSE /* * KSE forks and exits so often that this penalty causes short-lived @@ -1653,9 +1674,11 @@ sched_exit_thread(struct thread *td, str * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. 
*/ + thread_lock(td); td->td_sched->skg_runtime += child->td_sched->skg_runtime; sched_interact_update(td); sched_priority(td); + thread_unlock(td); } void @@ -1673,10 +1696,10 @@ sched_userret(struct thread *td) KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } @@ -1805,9 +1828,22 @@ sched_preempt(struct thread *td) */ MPASS(TD_ON_RUNQ(td)); TD_SET_RUNNING(td); + MPASS(ctd->td_lock == &sched_lock); + MPASS(td->td_lock == &sched_lock); CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, td->td_proc->p_pid, td->td_proc->p_comm); + /* + * We enter the switch with two runnable threads that both have + * the same lock. When we return td may be sleeping so we need + * to switch locks to make sure he's locked correctly. + */ + SCHED_STAT_INC(switch_preempt); mi_switch(SW_INVOL|SW_PREEMPT, td); + spinlock_enter(); + thread_unlock(ctd); + thread_lock(td); + spinlock_exit(); + return (1); } @@ -1824,7 +1860,7 @@ sched_add(struct thread *td, int flags) #endif ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); @@ -1834,8 +1870,15 @@ sched_add(struct thread *td, int flags) ("sched_add: bad thread state")); KASSERT(td->td_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); - KASSERT(ts->ts_runq == NULL, - ("sched_add: thread %p is still assigned to a run queue", td)); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. 
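The spinlock_enter()/spinlock_exit() pair wrapped around the lock swap after the preemptive mi_switch() above exists because the function returns holding its own thread lock but must hand back the preempting thread's lock; the critical section covers the instant where neither lock is held. The shape of that swap, with placeholder primitives (none of these are the kernel's):

	struct xthread;

	extern void example_critical_enter(void);
	extern void example_critical_exit(void);
	extern void example_thread_lock(struct xthread *);
	extern void example_thread_unlock(struct xthread *);

	static void
	example_swap_thread_locks(struct xthread *from, struct xthread *to)
	{

		example_critical_enter();	/* no preemption while unlocked */
		example_thread_unlock(from);
		example_thread_lock(to);
		example_critical_exit();
	}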
+ */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); TD_SET_RUNQ(td); tdq = TDQ_SELF(); class = PRI_BASE(td->td_pri_class); @@ -1920,7 +1963,7 @@ sched_rem(struct thread *td) CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); @@ -1942,7 +1985,7 @@ sched_pctcpu(struct thread *td) if (ts == NULL) return (0); - mtx_lock_spin(&sched_lock); + thread_lock(td); if (ts->ts_ticks) { int rtick; @@ -1952,7 +1995,7 @@ sched_pctcpu(struct thread *td) pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); return (pctcpu); } @@ -1962,7 +2005,7 @@ sched_bind(struct thread *td, int cpu) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; if (ts->ts_flags & TSF_BOUND) sched_unbind(td); @@ -1982,7 +2025,7 @@ sched_unbind(struct thread *td) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; if ((ts->ts_flags & TSF_BOUND) == 0) return; @@ -1995,18 +2038,19 @@ sched_unbind(struct thread *td) int sched_is_bound(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_sched->ts_flags & TSF_BOUND); } void sched_relinquish(struct thread *td) { - mtx_lock_spin(&sched_lock); + thread_lock(td); if (td->td_pri_class == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); + SCHED_STAT_INC(switch_relinquish); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } int @@ -2071,6 +2115,58 @@ sched_idletd(void *dummy) cpu_idle(); } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. 
+ */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, "Scheduler name"); @@ -2093,6 +2189,7 @@ SYSCTL_INT(_kern_sched, OID_AUTO, ipi_th SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0, ""); #endif /* ps compat */ Index: kern/subr_prof.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_prof.c,v retrieving revision 1.78 diff -u -p -r1.78 subr_prof.c --- kern/subr_prof.c 20 May 2007 22:11:49 -0000 1.78 +++ kern/subr_prof.c 31 May 2007 21:39:57 -0000 @@ -423,12 +423,12 @@ profil(td, uap) } PROC_LOCK(p); upp = &td->td_proc->p_stats->p_prof; - mtx_lock_spin(&time_lock); + PROC_SLOCK(p); upp->pr_off = uap->offset; upp->pr_scale = uap->scale; upp->pr_base = uap->samples; upp->pr_size = uap->size; - mtx_unlock_spin(&time_lock); + PROC_SUNLOCK(p); startprofclock(p); PROC_UNLOCK(p); @@ -468,22 +468,22 @@ addupc_intr(struct thread *td, uintfptr_ if (ticks == 0) return; prof = &td->td_proc->p_stats->p_prof; - mtx_lock_spin(&time_lock); + PROC_SLOCK(td->td_proc); if (pc < prof->pr_off || (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) { - mtx_unlock_spin(&time_lock); + PROC_SUNLOCK(td->td_proc); return; /* out of range; ignore */ } addr = prof->pr_base + i; - mtx_unlock_spin(&time_lock); + PROC_SUNLOCK(td->td_proc); if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) { td->td_profil_addr = pc; td->td_profil_ticks = ticks; td->td_pflags |= TDP_OWEUPC; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } @@ -511,12 +511,15 @@ addupc_task(struct thread *td, uintfptr_ } p->p_profthreads++; prof = &p->p_stats->p_prof; + PROC_SLOCK(p); if (pc < prof->pr_off || (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) { + PROC_SUNLOCK(p); goto out; } addr = prof->pr_base + i; + PROC_SUNLOCK(p); PROC_UNLOCK(p); if (copyin(addr, &v, sizeof(v)) == 0) { v += ticks; Index: kern/subr_sleepqueue.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_sleepqueue.c,v retrieving revision 1.36 diff -u -p -r1.36 subr_sleepqueue.c --- kern/subr_sleepqueue.c 18 May 2007 06:32:24 -0000 1.36 +++ kern/subr_sleepqueue.c 20 May 2007 11:40:27 -0000 @@ -329,7 +329,6 @@ sleepq_add(void *wchan, struct lock_obje } TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq); td->td_sleepqueue = NULL; - mtx_lock_spin(&sched_lock); td->td_sqqueue = queue; td->td_wchan = wchan; td->td_wmesg = wmesg; @@ -337,7 +336,6 @@ sleepq_add(void *wchan, struct lock_obje td->td_flags |= TDF_SINTR; td->td_flags &= ~TDF_SLEEPABORT; } - mtx_unlock_spin(&sched_lock); } /* @@ -362,7 +360,8 @@ sleepq_set_timeout(void *wchan, int timo /* * Marks the pending sleep of the current thread as interruptible and * makes an initial check for pending signals before putting a thread - * to sleep. Return with sleep queue and scheduler lock held. + * to sleep. Enters and exits with the thread lock held. Thread lock + * may have transitioned from the sleepq lock to a run lock. 
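The profiling buffer description moved above from time_lock to the per-process spin lock, so any reader of p_stats->p_prof is now expected to take PROC_SLOCK(). A hedged sketch of that pattern; example_snapshot_prof() is hypothetical and struct uprof is assumed to be the type behind p_prof:

	static void
	example_snapshot_prof(struct proc *p, struct uprof *out)
	{

		PROC_SLOCK(p);
		*out = p->p_stats->p_prof;
		PROC_SUNLOCK(p);
	}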
*/ static int sleepq_catch_signals(void *wchan) @@ -382,7 +381,6 @@ sleepq_catch_signals(void *wchan) CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)", (void *)td, (long)p->p_pid, p->p_comm); - MPASS(td->td_flags & TDF_SINTR); mtx_unlock_spin(&sc->sc_lock); /* See if there are any pending signals for this thread. */ @@ -401,39 +399,38 @@ sleepq_catch_signals(void *wchan) ret = ERESTART; mtx_unlock(&ps->ps_mtx); } - + /* + * Lock sleepq chain before unlocking proc + * without this, we could lose a race. + */ + mtx_lock_spin(&sc->sc_lock); + PROC_UNLOCK(p); + thread_lock(td); if (ret == 0) { - mtx_lock_spin(&sc->sc_lock); - /* - * Lock sched_lock before unlocking proc lock, - * without this, we could lose a race. - */ - mtx_lock_spin(&sched_lock); - PROC_UNLOCK(p); - if (!(td->td_flags & TDF_INTERRUPT)) + if (!(td->td_flags & TDF_INTERRUPT)) { + sleepq_switch(wchan); return (0); + } /* KSE threads tried unblocking us. */ ret = td->td_intrval; - mtx_unlock_spin(&sched_lock); - MPASS(ret == EINTR || ret == ERESTART); - } else { - PROC_UNLOCK(p); - mtx_lock_spin(&sc->sc_lock); + MPASS(ret == EINTR || ret == ERESTART || ret == EWOULDBLOCK); } /* * There were pending signals and this thread is still * on the sleep queue, remove it from the sleep queue. */ - sq = sleepq_lookup(wchan); - mtx_lock_spin(&sched_lock); - if (TD_ON_SLEEPQ(td)) + if (TD_ON_SLEEPQ(td)) { + sq = sleepq_lookup(wchan); sleepq_resume_thread(sq, td, -1); + } + mtx_unlock_spin(&sc->sc_lock); + MPASS(td->td_lock != &sc->sc_lock); return (ret); } /* - * Switches to another thread if we are still asleep on a sleep queue and - * drop the lock on the sleep queue chain. Returns with sched_lock held. + * Switches to another thread if we are still asleep on a sleep queue. + * Returns with thread lock. */ static void sleepq_switch(void *wchan) @@ -444,24 +441,18 @@ sleepq_switch(void *wchan) td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); - - /* - * If we have a sleep queue, then we've already been woken up, so - * just return. - */ + THREAD_LOCK_ASSERT(td, MA_OWNED); + /* We were removed */ if (td->td_sleepqueue != NULL) { - MPASS(!TD_ON_SLEEPQ(td)); mtx_unlock_spin(&sc->sc_lock); return; } + thread_lock_set(td, &sc->sc_lock); - /* - * Otherwise, actually go to sleep. - */ - mtx_unlock_spin(&sc->sc_lock); + MPASS(td->td_sleepqueue == NULL); sched_sleep(td); TD_SET_SLEEPING(td); + SCHED_STAT_INC(switch_sleepq); mi_switch(SW_VOL, NULL); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)", @@ -476,8 +467,8 @@ sleepq_check_timeout(void) { struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); td = curthread; + THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If TDF_TIMEOUT is set, we timed out. @@ -502,6 +493,7 @@ sleepq_check_timeout(void) else if (callout_stop(&td->td_slpcallout) == 0) { td->td_flags |= TDF_TIMEOUT; TD_SET_SLEEPING(td); + SCHED_STAT_INC(switch_sleepqtimo); mi_switch(SW_INVOL, NULL); } return (0); @@ -515,8 +507,8 @@ sleepq_check_signals(void) { struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); td = curthread; + THREAD_LOCK_ASSERT(td, MA_OWNED); /* We are no longer in an interruptible sleep. 
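After this change the sleep path never touches sched_lock: sleepq_switch() either finds the thread already resumed (it has a sleepqueue of its own again) and just drops the chain lock, or it adopts the chain lock as the thread lock for the duration of the switch. A stripped-down model of that decision with placeholder types; not the kernel code:

	typedef struct lock lock_t;
	extern void spin_unlock(lock_t *);

	struct xthread {
		lock_t * volatile td_lock;
		void	*td_sleepqueue;	/* non-NULL again once we were resumed */
	};

	/* Entered with both 'chainlock' and the thread's current lock held. */
	static void
	example_sleep_switch(struct xthread *td, lock_t *chainlock)
	{
		lock_t *old;

		if (td->td_sleepqueue != NULL) {
			/* A waker already pulled us off the queue; do not sleep. */
			spin_unlock(chainlock);
			return;
		}
		/* The chain lock becomes the thread lock while we are asleep. */
		old = td->td_lock;
		td->td_lock = chainlock;
		spin_unlock(old);
		/* ... mark the thread sleeping and mi_switch() here ... */
	}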
*/ if (td->td_flags & TDF_SINTR) @@ -539,11 +531,13 @@ sleepq_check_signals(void) void sleepq_wait(void *wchan) { + struct thread *td; - MPASS(!(curthread->td_flags & TDF_SINTR)); - mtx_lock_spin(&sched_lock); + td = curthread; + MPASS(!(td->td_flags & TDF_SINTR)); + thread_lock(td); sleepq_switch(wchan); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -557,12 +551,8 @@ sleepq_wait_sig(void *wchan) int rval; rcatch = sleepq_catch_signals(wchan); - if (rcatch == 0) - sleepq_switch(wchan); - else - sleepq_release(wchan); rval = sleepq_check_signals(); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); if (rcatch) return (rcatch); return (rval); @@ -575,13 +565,16 @@ sleepq_wait_sig(void *wchan) int sleepq_timedwait(void *wchan) { + struct thread *td; int rval; - MPASS(!(curthread->td_flags & TDF_SINTR)); - mtx_lock_spin(&sched_lock); + td = curthread; + MPASS(!(td->td_flags & TDF_SINTR)); + thread_lock(td); sleepq_switch(wchan); rval = sleepq_check_timeout(); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + return (rval); } @@ -595,13 +588,9 @@ sleepq_timedwait_sig(void *wchan) int rcatch, rvalt, rvals; rcatch = sleepq_catch_signals(wchan); - if (rcatch == 0) - sleepq_switch(wchan); - else - sleepq_release(wchan); rvalt = sleepq_check_timeout(); rvals = sleepq_check_signals(); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); if (rcatch) return (rcatch); if (rvals) @@ -622,9 +611,9 @@ sleepq_resume_thread(struct sleepqueue * MPASS(sq->sq_wchan != NULL); MPASS(td->td_wchan == sq->sq_wchan); MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0); + THREAD_LOCK_ASSERT(td, MA_OWNED); sc = SC_LOOKUP(sq->sq_wchan); mtx_assert(&sc->sc_lock, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); /* Remove the thread from the queue. */ TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq); @@ -714,10 +703,8 @@ sleepq_signal(void *wchan, int flags, in KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); - if (sq == NULL) { - sleepq_release(wchan); + if (sq == NULL) return; - } KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); @@ -733,10 +720,9 @@ sleepq_signal(void *wchan, int flags, in besttd = td; } MPASS(besttd != NULL); - mtx_lock_spin(&sched_lock); + thread_lock(besttd); sleepq_resume_thread(sq, besttd, pri); - mtx_unlock_spin(&sched_lock); - sleepq_release(wchan); + thread_unlock(besttd); } /* @@ -746,6 +732,7 @@ void sleepq_broadcast(void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; + struct thread *td; CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); @@ -759,11 +746,12 @@ sleepq_broadcast(void *wchan, int flags, ("%s: mismatch between sleep/wakeup and cv_*", __func__)); /* Resume all blocked threads on the sleep queue. 
*/ - mtx_lock_spin(&sched_lock); - while (!TAILQ_EMPTY(&sq->sq_blocked[queue])) - sleepq_resume_thread(sq, TAILQ_FIRST(&sq->sq_blocked[queue]), - pri); - mtx_unlock_spin(&sched_lock); + while (!TAILQ_EMPTY(&sq->sq_blocked[queue])) { + td = TAILQ_FIRST(&sq->sq_blocked[queue]); + thread_lock(td); + sleepq_resume_thread(sq, td, pri); + thread_unlock(td); + } sleepq_release(wchan); } @@ -774,6 +762,7 @@ sleepq_broadcast(void *wchan, int flags, static void sleepq_timeout(void *arg) { + struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; void *wchan; @@ -786,38 +775,29 @@ sleepq_timeout(void *arg) * First, see if the thread is asleep and get the wait channel if * it is. */ - mtx_lock_spin(&sched_lock); - if (TD_ON_SLEEPQ(td)) { + thread_lock(td); + if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { wchan = td->td_wchan; - mtx_unlock_spin(&sched_lock); - sleepq_lock(wchan); + sc = SC_LOOKUP(wchan); + MPASS(td->td_lock == &sc->sc_lock); sq = sleepq_lookup(wchan); - mtx_lock_spin(&sched_lock); - } else { - wchan = NULL; - sq = NULL; + MPASS(sq != NULL); + td->td_flags |= TDF_TIMEOUT; + sleepq_resume_thread(sq, td, -1); + thread_unlock(td); + return; } - /* - * At this point, if the thread is still on the sleep queue, - * we have that sleep queue locked as it cannot migrate sleep - * queues while we dropped sched_lock. If it had resumed and - * was on another CPU while the lock was dropped, it would have - * seen that TDF_TIMEOUT and TDF_TIMOFAIL are clear and the - * call to callout_stop() to stop this routine would have failed - * meaning that it would have already set TDF_TIMEOUT to - * synchronize with this function. + * If the thread is on the SLEEPQ but not sleeping and we have it + * locked it must be in sleepq_catch_signals(). Let it know we've + * timedout here so it can remove itself. */ if (TD_ON_SLEEPQ(td)) { - MPASS(td->td_wchan == wchan); - MPASS(sq != NULL); - td->td_flags |= TDF_TIMEOUT; - sleepq_resume_thread(sq, td, -1); - mtx_unlock_spin(&sched_lock); - sleepq_release(wchan); + td->td_flags |= TDF_TIMEOUT | TDF_INTERRUPT; + td->td_intrval = EWOULDBLOCK; + thread_unlock(td); return; - } else if (wchan != NULL) - sleepq_release(wchan); + } /* * Now check for the edge cases. First, if TDF_TIMEOUT is set, @@ -835,7 +815,7 @@ sleepq_timeout(void *arg) setrunnable(td); } else td->td_flags |= TDF_TIMOFAIL; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -855,33 +835,36 @@ sleepq_remove(struct thread *td, void *w MPASS(wchan != NULL); sleepq_lock(wchan); sq = sleepq_lookup(wchan); - mtx_lock_spin(&sched_lock); + /* + * We can not lock the thread here as it may be sleeping on a + * different sleepq. However, holding the sleepq lock for this + * wchan can guarantee that we do not miss a wakeup for this + * channel. The asserts below will catch any false positives. + */ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) { - mtx_unlock_spin(&sched_lock); sleepq_release(wchan); return; } - MPASS(sq != NULL); - /* Thread is asleep on sleep queue sq, so wake it up. */ + thread_lock(td); + MPASS(sq != NULL); + MPASS(td->td_wchan == wchan); sleepq_resume_thread(sq, td, -1); + thread_unlock(td); sleepq_release(wchan); - mtx_unlock_spin(&sched_lock); } /* * Abort a thread as if an interrupt had occurred. Only abort * interruptible waits (unfortunately it isn't safe to abort others). - * - * XXX: What in the world does the comment below mean? - * Also, whatever the signal code does... 
*/ void sleepq_abort(struct thread *td, int intrval) { + struct sleepqueue *sq; void *wchan; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_flags & TDF_SINTR); MPASS(intrval == EINTR || intrval == ERESTART); @@ -895,14 +878,22 @@ sleepq_abort(struct thread *td, int intr CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_proc->p_comm); + td->td_intrval = intrval; + td->td_flags |= TDF_SLEEPABORT; + /* + * If the thread has not slept yet it will find the signal in + * sleepq_catch_signals() and call sleepq_resume_thread. Otherwise + * we have to do it here. + */ + if (!TD_IS_SLEEPING(td)) + return; wchan = td->td_wchan; - if (wchan != NULL) { - td->td_intrval = intrval; - td->td_flags |= TDF_SLEEPABORT; - } - mtx_unlock_spin(&sched_lock); - sleepq_remove(td, wchan); - mtx_lock_spin(&sched_lock); + MPASS(wchan != NULL); + sq = sleepq_lookup(wchan); + MPASS(sq != NULL); + + /* Thread is asleep on sleep queue sq, so wake it up. */ + sleepq_resume_thread(sq, td, -1); } #ifdef DDB Index: kern/subr_smp.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_smp.c,v retrieving revision 1.198 diff -u -p -r1.198 subr_smp.c --- kern/subr_smp.c 8 Mar 2007 06:44:34 -0000 1.198 +++ kern/subr_smp.c 31 May 2007 21:06:57 -0000 @@ -159,7 +159,7 @@ forward_signal(struct thread *td) * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); @@ -187,8 +187,6 @@ forward_roundrobin(void) struct thread *td; cpumask_t id, map, me; - mtx_assert(&sched_lock, MA_OWNED); - CTR0(KTR_SMP, "forward_roundrobin()"); if (!smp_started || cold || panicstr) Index: kern/subr_taskqueue.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_taskqueue.c,v retrieving revision 1.38 diff -u -p -r1.38 subr_taskqueue.c --- kern/subr_taskqueue.c 23 Jan 2007 08:46:50 -0000 1.38 +++ kern/subr_taskqueue.c 18 May 2007 10:37:02 -0000 @@ -349,15 +349,15 @@ taskqueue_start_threads(struct taskqueue } else tq->tq_pcount++; } - mtx_lock_spin(&sched_lock); for (i = 0; i < count; i++) { if (tq->tq_pproc[i] == NULL) continue; td = FIRST_THREAD_IN_PROC(tq->tq_pproc[i]); + thread_lock(td); sched_prio(td, pri); sched_add(td, SRQ_BORING); + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); return (0); } Index: kern/subr_trap.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_trap.c,v retrieving revision 1.295 diff -u -p -r1.295 subr_trap.c --- kern/subr_trap.c 1 Jun 2007 01:20:11 -0000 1.295 +++ kern/subr_trap.c 31 May 2007 21:06:42 -0000 @@ -82,11 +82,11 @@ userret(struct thread *td, struct trapfr #ifdef DIAGNOSTIC /* Check that we called signotify() enough. 
*/ PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + thread_lock(td); if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 || (td->td_flags & TDF_ASTPENDING) == 0)) printf("failed to set signal flags properly for ast()\n"); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_UNLOCK(p); #endif @@ -163,7 +163,7 @@ ast(struct trapframe *framep) KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); - mtx_assert(&sched_lock, MA_NOTOWNED); + THREAD_LOCK_ASSERT(td, MA_NOTOWNED); td->td_frame = framep; td->td_pticks = 0; @@ -179,7 +179,7 @@ ast(struct trapframe *framep) * AST's saved in sflag, the astpending flag will be set and * ast() will be called again. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); flags = td->td_flags; sflag = p->p_sflag; if (p->p_sflag & (PS_ALRMPEND | PS_PROFPEND)) @@ -188,10 +188,12 @@ ast(struct trapframe *framep) if (p->p_sflag & PS_MACPEND) p->p_sflag &= ~PS_MACPEND; #endif + thread_lock(td); + PROC_SUNLOCK(p); td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDRESCHED | TDF_INTERRUPT); + thread_unlock(td); cnt.v_trap++; - mtx_unlock_spin(&sched_lock); /* * XXXKSE While the fact that we owe a user profiling @@ -239,10 +241,11 @@ ast(struct trapframe *framep) if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 1); #endif - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_prio(td, td->td_user_pri); + SCHED_STAT_INC(switch_needresched); mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 1); Index: kern/subr_turnstile.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_turnstile.c,v retrieving revision 1.167 diff -u -p -r1.167 subr_turnstile.c --- kern/subr_turnstile.c 18 May 2007 06:32:24 -0000 1.167 +++ kern/subr_turnstile.c 31 May 2007 22:17:16 -0000 @@ -116,6 +116,7 @@ __FBSDID("$FreeBSD: src/sys/kern/subr_tu * q - td_contested lock */ struct turnstile { + struct mtx ts_lock; /* Spin lock for self. */ struct threadqueue ts_blocked[2]; /* (c + q) Blocked threads. */ struct threadqueue ts_pending; /* (c) Pending threads. */ LIST_ENTRY(turnstile) ts_hash; /* (c) Chain and free list. */ @@ -162,6 +163,7 @@ static void turnstile_setowner(struct tu static void turnstile_dtor(void *mem, int size, void *arg); #endif static int turnstile_init(void *mem, int size, int flags); +static void turnstile_fini(void *mem, int size); /* * Walks the chain of turnstiles and their owners to propagate the priority @@ -171,13 +173,20 @@ static int turnstile_init(void *mem, int static void propagate_priority(struct thread *td) { - struct turnstile_chain *tc; struct turnstile *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td->td_blocked; + MPASS(td->td_lock == &ts->ts_lock); + /* + * Grab a recursive lock on this turnstile chain so it stays locked + * for the whole operation. The caller expects us to return with + * the original lock held. We only ever lock down the chain so + * the lock order is constant. + */ + mtx_lock_spin(&ts->ts_lock); for (;;) { td = ts->ts_owner; @@ -186,9 +195,12 @@ propagate_priority(struct thread *td) * This might be a read lock with no owner. There's * not much we can do, so just bail. 
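The ast() hunk is where the split of the old sched_lock coverage is most visible: p_sflag stays with the process and moves under the new proc spin lock, td_flags moves under the thread lock, and the two are held briefly in that order so neither flag word is examined unlocked. A condensed sketch of that sequence for curthread; the profiling and MAC details of the real hunk are omitted:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static void
harvest_ast_flags_sketch(struct thread *td)     /* td == curthread */
{
        struct proc *p = td->td_proc;
        int sflag;

        PROC_SLOCK(p);                          /* p_sflag is proc-slock state now */
        sflag = p->p_sflag;
        p->p_sflag &= ~(PS_ALRMPEND | PS_PROFPEND);
        thread_lock(td);                        /* td_flags is thread-lock state */
        PROC_SUNLOCK(p);                        /* drop the slock once td is held */
        td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK |
            TDF_NEEDRESCHED | TDF_INTERRUPT);
        thread_unlock(td);
        (void)sflag;                            /* the real ast() acts on this */
}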
*/ + mtx_unlock_spin(&ts->ts_lock); return; } + thread_lock_flags(td, MTX_DUPOK); + mtx_unlock_spin(&ts->ts_lock); MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); @@ -213,8 +225,10 @@ propagate_priority(struct thread *td) * If this thread already has higher priority than the * thread that is being blocked, we are finished. */ - if (td->td_priority <= pri) + if (td->td_priority <= pri) { + thread_unlock(td); return; + } /* * Bump this thread's priority. @@ -227,6 +241,7 @@ propagate_priority(struct thread *td) */ if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) { MPASS(td->td_blocked == NULL); + thread_unlock(td); return; } @@ -251,15 +266,13 @@ propagate_priority(struct thread *td) */ ts = td->td_blocked; MPASS(ts != NULL); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_lock_spin(&tc->tc_lock); - + MPASS(td->td_lock == &ts->ts_lock); /* Resort td on the list if needed. */ if (!turnstile_adjust_thread(ts, td)) { - mtx_unlock_spin(&tc->tc_lock); + mtx_unlock_spin(&ts->ts_lock); return; } - mtx_unlock_spin(&tc->tc_lock); + /* The thread lock is released as ts lock above. */ } } @@ -270,17 +283,16 @@ propagate_priority(struct thread *td) static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td) { - struct turnstile_chain *tc; struct thread *td1, *td2; int queue; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_LOCK(td)); /* * This thread may not be blocked on this turnstile anymore * but instead might already be woken up on another CPU - * that is waiting on sched_lock in turnstile_unpend() to + * that is waiting on the thread lock in turnstile_unpend() to * finish waking this thread up. We can detect this case * by checking to see if this thread has been given a * turnstile by either turnstile_signal() or @@ -295,8 +307,7 @@ turnstile_adjust_thread(struct turnstile * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); + MPASS(td->td_lock == &ts->ts_lock); td1 = TAILQ_PREV(td, threadqueue, td_lockq); td2 = TAILQ_NEXT(td, td_lockq); if ((td1 != NULL && td->td_priority < td1->td_priority) || @@ -385,9 +396,10 @@ init_turnstile0(void *dummy) turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile), #ifdef INVARIANTS - NULL, turnstile_dtor, turnstile_init, NULL, UMA_ALIGN_CACHE, 0); + NULL, turnstile_dtor, turnstile_init, turnstile_fini, + UMA_ALIGN_CACHE, 0); #else - NULL, NULL, turnstile_init, NULL, UMA_ALIGN_CACHE, 0); + NULL, NULL, turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, 0); #endif thread0.td_turnstile = turnstile_alloc(); } @@ -400,10 +412,8 @@ SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDE void turnstile_adjust(struct thread *td, u_char oldpri) { - struct turnstile_chain *tc; struct turnstile *ts; - mtx_assert(&sched_lock, MA_OWNED); MPASS(TD_ON_LOCK(td)); /* @@ -411,15 +421,12 @@ turnstile_adjust(struct thread *td, u_ch */ ts = td->td_blocked; MPASS(ts != NULL); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_lock_spin(&tc->tc_lock); + MPASS(td->td_lock == &ts->ts_lock); + mtx_assert(&ts->ts_lock, MA_OWNED); /* Resort the turnstile on the list. */ - if (!turnstile_adjust_thread(ts, td)) { - mtx_unlock_spin(&tc->tc_lock); + if (!turnstile_adjust_thread(ts, td)) return; - } - /* * If our priority was lowered and we are at the head of the * turnstile, then propagate our new priority up the chain. 
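Two details above deserve a second look during review: ts_lock is created MTX_RECURSE because a blocked waiter's own td_lock is that same ts_lock, so the propagation code can legitimately take it again, and the owner is locked with MTX_DUPOK presumably because the owner may itself be blocked, making its td_lock another turnstile's ts_lock, a second lock of the same witness class. One propagation step, heavily simplified (struct turnstile internals are private to subr_turnstile.c, and the real loop keeps walking rather than unlocking):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/turnstile.h>

static void
propagate_step_sketch(struct turnstile *ts, u_char pri)
{
        struct thread *owner;

        mtx_assert(&ts->ts_lock, MA_OWNED);
        owner = ts->ts_owner;                   /* NULL for, e.g., a read lock */
        if (owner == NULL) {
                mtx_unlock_spin(&ts->ts_lock);
                return;
        }
        thread_lock_flags(owner, MTX_DUPOK);    /* may be another ts_lock */
        mtx_unlock_spin(&ts->ts_lock);
        if (owner->td_priority > pri)           /* smaller value == higher prio */
                sched_lend_prio(owner, pri);
        thread_unlock(owner);
}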
@@ -430,12 +437,8 @@ turnstile_adjust(struct thread *td, u_ch td->td_tsqueue == TS_SHARED_QUEUE); if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) && td->td_priority < oldpri) { - mtx_unlock_spin(&tc->tc_lock); - critical_enter(); propagate_priority(td); - critical_exit(); - } else - mtx_unlock_spin(&tc->tc_lock); + } } /* @@ -487,9 +490,19 @@ turnstile_init(void *mem, int size, int TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]); TAILQ_INIT(&ts->ts_pending); LIST_INIT(&ts->ts_free); + mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN | MTX_RECURSE); return (0); } +static void +turnstile_fini(void *mem, int size) +{ + struct turnstile *ts; + + ts = mem; + mtx_destroy(&ts->ts_lock); +} + /* * Get a turnstile for a new thread. */ @@ -514,12 +527,51 @@ turnstile_free(struct turnstile *ts) * Lock the turnstile chain associated with the specified lock. */ void -turnstile_lock(struct lock_object *lock) +turnstile_chain_lock(struct lock_object *lock) +{ + struct turnstile_chain *tc; + + tc = TC_LOOKUP(lock); + mtx_lock_spin(&tc->tc_lock); +} + +struct turnstile * +turnstile_trywait(struct lock_object *lock) { struct turnstile_chain *tc; + struct turnstile *ts; tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); + LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) + if (ts->ts_lockobj == lock) { + mtx_lock_spin(&ts->ts_lock); + return (ts); + } + + ts = curthread->td_turnstile; + MPASS(ts != NULL); + mtx_lock_spin(&ts->ts_lock); + KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer")); + ts->ts_lockobj = lock; + + return (ts); +} + +void +turnstile_cancel(struct turnstile *ts) +{ + struct turnstile_chain *tc; + struct lock_object *lock; + + mtx_assert(&ts->ts_lock, MA_OWNED); + + mtx_unlock_spin(&ts->ts_lock); + lock = ts->ts_lockobj; + if (ts == curthread->td_turnstile) + ts->ts_lockobj = NULL; + tc = TC_LOOKUP(lock); + mtx_unlock_spin(&tc->tc_lock); } /* @@ -536,8 +588,10 @@ turnstile_lookup(struct lock_object *loc tc = TC_LOOKUP(lock); mtx_assert(&tc->tc_lock, MA_OWNED); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) - if (ts->ts_lockobj == lock) + if (ts->ts_lockobj == lock) { + mtx_lock_spin(&ts->ts_lock); return (ts); + } return (NULL); } @@ -545,7 +599,7 @@ turnstile_lookup(struct lock_object *loc * Unlock the turnstile chain associated with a given lock. */ void -turnstile_release(struct lock_object *lock) +turnstile_chain_unlock(struct lock_object *lock) { struct turnstile_chain *tc; @@ -574,16 +628,13 @@ turnstile_first_waiter(struct turnstile * owner appropriately. */ void -turnstile_claim(struct lock_object *lock) +turnstile_claim(struct turnstile *ts) { - struct turnstile_chain *tc; - struct turnstile *ts; struct thread *td, *owner; + struct turnstile_chain *tc; - tc = TC_LOOKUP(lock); - mtx_assert(&tc->tc_lock, MA_OWNED); - ts = turnstile_lookup(lock); - MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); + MPASS(ts != curthread->td_turnstile); owner = curthread; mtx_lock_spin(&td_contested_lock); @@ -593,15 +644,18 @@ turnstile_claim(struct lock_object *lock td = turnstile_first_waiter(ts); MPASS(td != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); - mtx_unlock_spin(&tc->tc_lock); + MPASS(td->td_lock == &ts->ts_lock); /* * Update the priority of the new owner if needed. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(owner); if (td->td_priority < owner->td_priority) sched_lend_prio(owner, td->td_priority); - mtx_unlock_spin(&sched_lock); + thread_unlock(owner); + tc = TC_LOOKUP(ts->ts_lockobj); + mtx_unlock_spin(&ts->ts_lock); + mtx_unlock_spin(&tc->tc_lock); } /* @@ -611,31 +665,28 @@ turnstile_claim(struct lock_object *lock * turnstile chain locked and will return with it unlocked. */ void -turnstile_wait(struct lock_object *lock, struct thread *owner, int queue) +turnstile_wait(struct turnstile *ts, struct thread *owner, int queue) { struct turnstile_chain *tc; - struct turnstile *ts; struct thread *td, *td1; + struct lock_object *lock; td = curthread; - tc = TC_LOOKUP(lock); - mtx_assert(&tc->tc_lock, MA_OWNED); - MPASS(td->td_turnstile != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); if (queue == TS_SHARED_QUEUE) MPASS(owner != NULL); if (owner) MPASS(owner->td_proc->p_magic == P_MAGIC); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); - /* Look up the turnstile associated with the lock 'lock'. */ - ts = turnstile_lookup(lock); - /* * If the lock does not already have a turnstile, use this thread's * turnstile. Otherwise insert the current thread into the * turnstile already in use by this lock. */ - if (ts == NULL) { + tc = TC_LOOKUP(ts->ts_lockobj); + if (ts == td->td_turnstile) { + mtx_assert(&tc->tc_lock, MA_OWNED); #ifdef TURNSTILE_PROFILING tc->tc_depth++; if (tc->tc_depth > tc->tc_max_depth) { @@ -644,7 +695,7 @@ turnstile_wait(struct lock_object *lock, turnstile_max_depth = tc->tc_max_depth; } #endif - ts = td->td_turnstile; + tc = TC_LOOKUP(ts->ts_lockobj); LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash); KASSERT(TAILQ_EMPTY(&ts->ts_pending), ("thread's turnstile has pending threads")); @@ -654,8 +705,7 @@ turnstile_wait(struct lock_object *lock, ("thread's turnstile has shared waiters")); KASSERT(LIST_EMPTY(&ts->ts_free), ("thread's turnstile has a non-empty free list")); - KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer")); - ts->ts_lockobj = lock; + MPASS(ts->ts_lockobj != NULL); mtx_lock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); turnstile_setowner(ts, owner); @@ -674,58 +724,31 @@ turnstile_wait(struct lock_object *lock, MPASS(td->td_turnstile != NULL); LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash); } + thread_lock(td); + thread_lock_set(td, &ts->ts_lock); td->td_turnstile = NULL; - mtx_unlock_spin(&tc->tc_lock); - - mtx_lock_spin(&sched_lock); - /* - * Handle race condition where a thread on another CPU that owns - * lock 'lock' could have woken us in between us dropping the - * turnstile chain lock and acquiring the sched_lock. - */ - if (td->td_flags & TDF_TSNOBLOCK) { - td->td_flags &= ~TDF_TSNOBLOCK; - mtx_unlock_spin(&sched_lock); - return; - } - -#ifdef notyet - /* - * If we're borrowing an interrupted thread's VM context, we - * must clean up before going to sleep. - */ - if (td->td_ithd != NULL) { - struct ithd *it = td->td_ithd; - - if (it->it_interrupted) { - if (LOCK_LOG_TEST(lock, 0)) - CTR3(KTR_LOCK, "%s: %p interrupted %p", - __func__, it, it->it_interrupted); - intr_thd_fixup(it); - } - } -#endif /* Save who we are blocked on and switch. 
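For lock implementations the old turnstile_lock()/turnstile_release() pair becomes a three-step dance: turnstile_trywait() pins the chain and a turnstile (or the caller's spare) before the final state check, turnstile_cancel() backs out if the lock was obtained after all, and turnstile_wait() blocks with the turnstile lock becoming the blocked thread's td_lock. A sketch of a consumer of the new interface, shaped after the mutex code rather than anything in this diff; try_acquire() and lock_owner() are hypothetical stand-ins for the lock's own fast path and owner lookup:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/turnstile.h>

static int try_acquire(struct lock_object *lo);                 /* hypothetical */
static struct thread *lock_owner(struct lock_object *lo);       /* hypothetical */

static void
contested_acquire_sketch(struct lock_object *lo)
{
        struct turnstile *ts;

        for (;;) {
                if (try_acquire(lo))
                        return;
                ts = turnstile_trywait(lo);     /* chain + turnstile locked */
                if (try_acquire(lo)) {
                        turnstile_cancel(ts);   /* won the race, back out */
                        return;
                }
                /* Sleeps; ts_lock is handed over as our thread lock. */
                turnstile_wait(ts, lock_owner(lo), TS_EXCLUSIVE_QUEUE);
        }
}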
*/ + lock = ts->ts_lockobj; td->td_tsqueue = queue; td->td_blocked = ts; td->td_lockname = lock->lo_name; TD_SET_LOCK(td); - critical_enter(); + mtx_unlock_spin(&tc->tc_lock); propagate_priority(td); - critical_exit(); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); + MPASS(td->td_lock == &ts->ts_lock); + SCHED_STAT_INC(switch_turnstile); mi_switch(SW_VOL, NULL); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); - - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -740,11 +763,10 @@ turnstile_signal(struct turnstile *ts, i int empty; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL)); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* @@ -766,6 +788,8 @@ turnstile_signal(struct turnstile *ts, i empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]); if (empty) { + tc = TC_LOOKUP(ts->ts_lockobj); + mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(LIST_EMPTY(&ts->ts_free)); #ifdef TURNSTILE_PROFILING tc->tc_depth--; @@ -791,9 +815,14 @@ turnstile_broadcast(struct turnstile *ts struct thread *td; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL)); + /* + * We must have the chain locked so that we can remove the empty + * turnstile from the hash queue. + */ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); @@ -833,15 +862,14 @@ void turnstile_unpend(struct turnstile *ts, int owner_type) { TAILQ_HEAD( ,thread) pending_threads; - struct turnstile_chain *tc; + struct turnstile *nts; struct thread *td; u_char cp, pri; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread || (owner_type == TS_SHARED_LOCK && ts->ts_owner == NULL)); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(!TAILQ_EMPTY(&ts->ts_pending)); /* @@ -855,7 +883,15 @@ turnstile_unpend(struct turnstile *ts, i TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])) ts->ts_lockobj = NULL; #endif - + /* + * Adjust the priority of curthread based on other contested + * locks it owns. Don't lower the priority below the base + * priority however. + */ + td = curthread; + pri = PRI_MAX; + thread_lock(td); + mtx_lock_spin(&td_contested_lock); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will @@ -864,31 +900,17 @@ turnstile_unpend(struct turnstile *ts, i * lock. */ if (ts->ts_owner != NULL) { - mtx_lock_spin(&td_contested_lock); ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); - mtx_unlock_spin(&td_contested_lock); } - critical_enter(); - mtx_unlock_spin(&tc->tc_lock); - - /* - * Adjust the priority of curthread based on other contested - * locks it owns. Don't lower the priority below the base - * priority however. 
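On the release side the chain lock stays in the picture because an emptied turnstile still has to come off the chain's hash list, which is what the comment added to turnstile_broadcast() above is pointing at. Roughly how an unlock slow path strings the new calls together, again shaped after the mutex code and not taken from this diff; release_lock_state() is a hypothetical placeholder for the lock's own release store:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/turnstile.h>

static void release_lock_state(struct lock_object *lo);         /* hypothetical */

static void
contested_release_sketch(struct lock_object *lo)
{
        struct turnstile *ts;

        turnstile_chain_lock(lo);
        ts = turnstile_lookup(lo);              /* now returns with ts_lock held */
        if (ts == NULL) {                       /* waiters already drained */
                turnstile_chain_unlock(lo);
                return;
        }
        turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
        release_lock_state(lo);                 /* hand the lock itself over */
        turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); /* drops ts_lock, wakes waiters */
        turnstile_chain_unlock(lo);
}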
- */ - td = curthread; - pri = PRI_MAX; - mtx_lock_spin(&sched_lock); - mtx_lock_spin(&td_contested_lock); - LIST_FOREACH(ts, &td->td_contested, ts_link) { - cp = turnstile_first_waiter(ts)->td_priority; + LIST_FOREACH(nts, &td->td_contested, ts_link) { + cp = turnstile_first_waiter(nts)->td_priority; if (cp < pri) pri = cp; } mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); - + thread_unlock(td); /* * Wake up all the pending threads. If a thread is not blocked * on a lock, then it is currently executing on another CPU in @@ -899,23 +921,21 @@ turnstile_unpend(struct turnstile *ts, i while (!TAILQ_EMPTY(&pending_threads)) { td = TAILQ_FIRST(&pending_threads); TAILQ_REMOVE(&pending_threads, td, td_lockq); + thread_lock(td); + MPASS(td->td_lock == &ts->ts_lock); MPASS(td->td_proc->p_magic == P_MAGIC); - if (TD_ON_LOCK(td)) { - td->td_blocked = NULL; - td->td_lockname = NULL; + MPASS(TD_ON_LOCK(td)); + TD_CLR_LOCK(td); + MPASS(TD_CAN_RUN(td)); + td->td_blocked = NULL; + td->td_lockname = NULL; #ifdef INVARIANTS - td->td_tsqueue = 0xff; + td->td_tsqueue = 0xff; #endif - TD_CLR_LOCK(td); - MPASS(TD_CAN_RUN(td)); - sched_add(td, SRQ_BORING); - } else { - td->td_flags |= TDF_TSNOBLOCK; - MPASS(TD_IS_RUNNING(td) || TD_ON_RUNQ(td)); - } + sched_add(td, SRQ_BORING); + thread_unlock(td); } - critical_exit(); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&ts->ts_lock); } /* @@ -925,14 +945,12 @@ turnstile_unpend(struct turnstile *ts, i void turnstile_disown(struct turnstile *ts) { - struct turnstile_chain *tc; struct thread *td; u_char cp, pri; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(TAILQ_EMPTY(&ts->ts_pending)); MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) || !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); @@ -947,7 +965,6 @@ turnstile_disown(struct turnstile *ts) ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); mtx_unlock_spin(&td_contested_lock); - mtx_unlock_spin(&tc->tc_lock); /* * Adjust the priority of curthread based on other contested @@ -956,7 +973,8 @@ turnstile_disown(struct turnstile *ts) */ td = curthread; pri = PRI_MAX; - mtx_lock_spin(&sched_lock); + thread_lock(td); + mtx_unlock_spin(&ts->ts_lock); mtx_lock_spin(&td_contested_lock); LIST_FOREACH(ts, &td->td_contested, ts_link) { cp = turnstile_first_waiter(ts)->td_priority; @@ -965,7 +983,7 @@ turnstile_disown(struct turnstile *ts) } mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -975,12 +993,10 @@ struct thread * turnstile_head(struct turnstile *ts, int queue) { #ifdef INVARIANTS - struct turnstile_chain *tc; MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); + mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_FIRST(&ts->ts_blocked[queue])); } @@ -992,12 +1008,10 @@ int turnstile_empty(struct turnstile *ts, int queue) { #ifdef INVARIANTS - struct turnstile_chain *tc; MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); + mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_EMPTY(&ts->ts_blocked[queue])); } Index: kern/subr_witness.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_witness.c,v retrieving revision 1.232 diff -u 
-p -r1.232 subr_witness.c --- kern/subr_witness.c 29 May 2007 18:55:41 -0000 1.232 +++ kern/subr_witness.c 31 May 2007 21:08:09 -0000 @@ -404,9 +404,12 @@ static struct witness_order_list_entry o #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif + { "process slock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, - { "sched lock", &lock_class_mtx_spin }, + { "umtx lock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, + { "turnstile lock", &lock_class_mtx_spin }, + { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, @@ -429,7 +432,8 @@ static struct witness_order_list_entry o #endif { "clk", &lock_class_mtx_spin }, { "mutex profiling lock", &lock_class_mtx_spin }, - { "kse zombie lock", &lock_class_mtx_spin }, + { "kse lock", &lock_class_mtx_spin }, + { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #ifdef __ia64__ { "MCA spin lock", &lock_class_mtx_spin }, @@ -446,6 +450,7 @@ static struct witness_order_list_entry o #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif + { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; @@ -1961,10 +1966,10 @@ witness_list(struct thread *td) * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * - * That still wouldn't really fix this unless we locked sched_lock - * or stopped the other CPU to make sure it wasn't changing the list - * out from under us. It is probably best to just not try to handle - * threads on other CPU's for now. + * That still wouldn't really fix this unless we locked the scheduler + * lock or stopped the other CPU to make sure it wasn't changing the + * list out from under us. It is probably best to just not try to + * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks)); Index: kern/sys_generic.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sys_generic.c,v retrieving revision 1.156 diff -u -p -r1.156 sys_generic.c --- kern/sys_generic.c 1 May 2007 06:35:54 -0000 1.156 +++ kern/sys_generic.c 18 May 2007 10:37:02 -0000 @@ -722,9 +722,9 @@ kern_select(struct thread *td, int nd, f mtx_lock(&sellock); retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); error = selscan(td, ibits, obits, nd); @@ -747,12 +747,12 @@ retry: * collisions and rescan the file descriptors if * necessary. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait_sig(&selwait, &sellock, timo); @@ -764,9 +764,9 @@ retry: done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_nosellock: @@ -896,9 +896,9 @@ poll(td, uap) mtx_lock(&sellock); retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); error = pollscan(td, bits, nfds); @@ -919,12 +919,12 @@ retry: * sellock, so check TDF_SELECT and the number of collisions * and rescan the file descriptors if necessary. */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait_sig(&selwait, &sellock, timo); @@ -936,9 +936,9 @@ retry: done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_nosellock: @@ -1109,9 +1109,9 @@ doselwakeup(sip, pri) } TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); sip->si_thread = NULL; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); sleepq_remove(td, &selwait); mtx_unlock(&sellock); } Index: kern/sys_process.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sys_process.c,v retrieving revision 1.142 diff -u -p -r1.142 sys_process.c --- kern/sys_process.c 4 Mar 2007 22:36:46 -0000 1.142 +++ kern/sys_process.c 18 May 2007 10:37:02 -0000 @@ -527,12 +527,12 @@ kern_ptrace(struct thread *td, int req, sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_tid == pid) break; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (td2 != NULL) break; /* proc lock held */ PROC_UNLOCK(p); @@ -701,15 +701,15 @@ kern_ptrace(struct thread *td, int req, break; case PT_SUSPEND: - mtx_lock_spin(&sched_lock); + thread_lock(td2); td2->td_flags |= TDF_DBSUSPEND; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); break; case PT_RESUME: - mtx_lock_spin(&sched_lock); + thread_lock(td2); td2->td_flags &= ~TDF_DBSUSPEND; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); break; case PT_STEP: @@ -780,32 +780,35 @@ kern_ptrace(struct thread *td, int req, proctree_locked = 0; } /* deliver or queue signal */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); td2->td_flags &= ~TDF_XSIG; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); td2->td_xsig = data; p->p_xstat = data; p->p_xthread = NULL; if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (req == PT_DETACH) { struct thread *td3; - FOREACH_THREAD_IN_PROC(p, td3) + FOREACH_THREAD_IN_PROC(p, td3) { + thread_lock(td3); td3->td_flags &= ~TDF_DBSUSPEND; + thread_unlock(td3); + } } /* * unsuspend all threads, to not let a thread run, * you should use PT_SUSPEND to suspend it before * continuing process. 
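Acting on another process's threads now has two levels: the process spin lock keeps the p_threads list stable (the (j) key is redefined to mean the proc slock in the sys/proc.h hunk further down), and each thread's own lock covers the td_flags update, exactly as the PT_DETACH loop above does. The pattern by itself:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static void
set_dbsuspend_sketch(struct proc *p, int on)
{
        struct thread *td;

        PROC_LOCK_ASSERT(p, MA_OWNED);
        PROC_SLOCK(p);                          /* freezes the thread list */
        FOREACH_THREAD_IN_PROC(p, td) {
                thread_lock(td);
                if (on)
                        td->td_flags |= TDF_DBSUSPEND;
                else
                        td->td_flags &= ~TDF_DBSUSPEND;
                thread_unlock(td);
        }
        PROC_SUNLOCK(p);
}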
*/ - mtx_unlock_spin(&sched_lock); #ifdef KSE + PROC_SUNLOCK(p); thread_continued(p); + PROC_SLOCK(p); #endif p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED); - mtx_lock_spin(&sched_lock); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } if (data) @@ -968,13 +971,13 @@ kern_ptrace(struct thread *td, int req, buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK); tmp = 0; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (tmp >= num) break; buf[tmp++] = td2->td_tid; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); error = copyout(buf, addr, tmp * sizeof(lwpid_t)); free(buf, M_TEMP); Index: kern/tty.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/tty.c,v retrieving revision 1.268 diff -u -p -r1.268 tty.c --- kern/tty.c 20 Dec 2006 02:49:59 -0000 1.268 +++ kern/tty.c 18 May 2007 10:37:02 -0000 @@ -147,7 +147,9 @@ static struct cdevsw ttys_cdevsw = { .d_flags = D_TTY | D_NEEDGIANT, }; -static int proc_compare(struct proc *p1, struct proc *p2); +static int proc_sum(struct proc *, int *); +static int proc_compare(struct proc *, struct proc *); +static int thread_compare(struct thread *, struct thread *); static int ttnread(struct tty *tp); static void ttyecho(int c, struct tty *tp); static int ttyoutput(int c, struct tty *tp); @@ -2528,7 +2530,7 @@ ttyinfo(struct tty *tp) { struct timeval utime, stime; struct proc *p, *pick; - struct thread *td; + struct thread *td, *picktd; const char *stateprefix, *state; long rss; int load, pctcpu; @@ -2566,21 +2568,25 @@ ttyinfo(struct tty *tp) /* * Pick the most interesting process and copy some of its - * state for printing later. sched_lock must be held for - * most parts of this. Holding it throughout is simplest - * and prevents even unimportant inconsistencies in the - * copy of the state, but may increase interrupt latency - * too much. + * state for printing later. This operation could rely on stale + * data as we can't hold the proc slock or thread locks over the + * whole list. However, we're guaranteed not to reference an exited + * thread or proc since we hold the tty locked. */ pick = NULL; - mtx_lock_spin(&sched_lock); LIST_FOREACH(p, &tp->t_pgrp->pg_members, p_pglist) if (proc_compare(pick, p)) pick = p; - /*^T can only show state for 1 thread. just pick the first. */ + PROC_SLOCK(pick); + picktd = NULL; td = FIRST_THREAD_IN_PROC(pick); + FOREACH_THREAD_IN_PROC(pick, td) + if (thread_compare(picktd, td)) + picktd = td; + td = picktd; stateprefix = ""; + thread_lock(td); if (TD_IS_RUNNING(td)) state = "running"; else if (TD_ON_RUNQ(td) || TD_CAN_RUN(td)) @@ -2601,11 +2607,12 @@ ttyinfo(struct tty *tp) else state = "unknown"; pctcpu = (sched_pctcpu(td) * 10000 + FSCALE / 2) >> FSHIFT; + thread_unlock(td); if (pick->p_state == PRS_NEW || pick->p_state == PRS_ZOMBIE) rss = 0; else rss = pgtok(vmspace_resident_count(pick->p_vmspace)); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(pick); PROC_LOCK(pick); PGRP_UNLOCK(tp->t_pgrp); calcru(pick, &utime, &stime); @@ -2636,18 +2643,6 @@ ttyinfo(struct tty *tp) * we pick out just "short-term" sleepers (P_SINTR == 0). * 4) Further ties are broken by picking the highest pid. 
*/ -#define ISRUN(p, val) \ -do { \ - struct thread *td; \ - val = 0; \ - FOREACH_THREAD_IN_PROC(p, td) { \ - if (TD_ON_RUNQ(td) || \ - TD_IS_RUNNING(td)) { \ - val = 1; \ - break; \ - } \ - } \ -} while (0) #define TESTAB(a, b) ((a)<<1 | (b)) #define ONLYA 2 @@ -2655,69 +2650,134 @@ do { \ #define BOTH 3 static int -proc_compare(struct proc *p1, struct proc *p2) +proc_sum(struct proc *p, int *estcpup) { - - int esta, estb; struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); - if (p1 == NULL) + int estcpu; + int val; + + val = 0; + estcpu = 0; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + if (TD_ON_RUNQ(td) || + TD_IS_RUNNING(td)) + val = 1; + estcpu += sched_pctcpu(td); + thread_unlock(td); + } + *estcpup = estcpu; + + return (val); +} + +static int +thread_compare(struct thread *td, struct thread *td2) +{ + int runa, runb; + int slpa, slpb; + fixpt_t esta, estb; + + if (td == NULL) return (1); - ISRUN(p1, esta); - ISRUN(p2, estb); - + /* + * Fetch running stats, pctcpu usage, and interruptable flag. + */ + thread_lock(td); + runa = TD_IS_RUNNING(td) | TD_ON_RUNQ(td); + slpa = td->td_flags & TDF_SINTR; + esta = sched_pctcpu(td); + thread_unlock(td); + thread_lock(td2); + runb = TD_IS_RUNNING(td2) | TD_ON_RUNQ(td2); + estb = sched_pctcpu(td2); + slpb = td2->td_flags & TDF_SINTR; + thread_unlock(td2); /* * see if at least one of them is runnable */ - switch (TESTAB(esta, estb)) { + switch (TESTAB(runa, runb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: - /* - * tie - favor one with highest recent cpu utilization - */ - esta = estb = 0; - FOREACH_THREAD_IN_PROC(p1, td) - esta += td->td_estcpu; - FOREACH_THREAD_IN_PROC(p2, td) - estb += td->td_estcpu; - if (estb > esta) - return (1); - if (esta > estb) - return (0); - return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + break; } /* - * weed out zombies + * favor one with highest recent cpu utilization */ - switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { - case ONLYA: + if (estb > esta) return (1); - case ONLYB: + if (esta > estb) + return (0); + /* + * favor one sleeping in a non-interruptible sleep + */ + switch (TESTAB(slpa, slpb)) { + case ONLYA: return (0); + case ONLYB: + return (1); case BOTH: - return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + break; } -#if 0 /* XXXKSE */ + return (td < td2); +} + +static int +proc_compare(struct proc *p1, struct proc *p2) +{ + + int runa, runb; + fixpt_t esta, estb; + + if (p1 == NULL) + return (1); + /* - * pick the one with the smallest sleep time + * Fetch various stats about these processes. After we drop the + * lock the information could be stale but the race is unimportant. 
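proc_sum() and thread_compare() only need a self-consistent snapshot per thread, not a globally consistent picture, so they sample under each thread's lock and tolerate staleness, as the comments in this hunk say. The sampling idiom reduced to its core:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>

static fixpt_t
sample_pctcpu_sketch(struct proc *p)
{
        struct thread *td;
        fixpt_t sum;

        sum = 0;
        PROC_SLOCK(p);                          /* thread list stays put */
        FOREACH_THREAD_IN_PROC(p, td) {
                thread_lock(td);
                sum += sched_pctcpu(td);        /* per-thread, possibly stale */
                thread_unlock(td);
        }
        PROC_SUNLOCK(p);
        return (sum);
}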
*/ - if (p2->p_slptime > p1->p_slptime) + PROC_SLOCK(p1); + runa = proc_sum(p1, &esta); + PROC_SUNLOCK(p1); + PROC_SLOCK(p2); + runb = proc_sum(p2, &estb); + PROC_SUNLOCK(p2); + + /* + * see if at least one of them is runnable + */ + switch (TESTAB(runa, runb)) { + case ONLYA: return (0); - if (p1->p_slptime > p2->p_slptime) + case ONLYB: return (1); + case BOTH: + break; + } /* - * favor one sleeping in a non-interruptible sleep + * favor one with highest recent cpu utilization */ - if (p1->p_sflag & PS_SINTR && (p2->p_sflag & PS_SINTR) == 0) + if (estb > esta) return (1); - if (p2->p_sflag & PS_SINTR && (p1->p_sflag & PS_SINTR) == 0) + if (esta > estb) return (0); -#endif + /* + * weed out zombies + */ + switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { + case ONLYA: + return (1); + case ONLYB: + return (0); + case BOTH: + break; + } + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } Index: netncp/ncp_sock.c =================================================================== RCS file: /usr/home/ncvs/src/sys/netncp/ncp_sock.c,v retrieving revision 1.18 diff -u -p -r1.18 ncp_sock.c --- netncp/ncp_sock.c 27 Feb 2007 17:23:29 -0000 1.18 +++ netncp/ncp_sock.c 18 May 2007 10:37:02 -0000 @@ -189,9 +189,9 @@ ncp_poll(struct socket *so, int events) /* Fake up enough state to look like we are in poll(2). */ mtx_lock(&sellock); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); TAILQ_INIT(&td->td_selq); @@ -200,9 +200,9 @@ ncp_poll(struct socket *so, int events) /* Tear down the fake poll(2) state. */ mtx_lock(&sellock); clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); return (revents); @@ -229,9 +229,9 @@ ncp_sock_rselect(struct socket *so, stru retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); TAILQ_INIT(&td->td_selq); @@ -257,12 +257,12 @@ retry: * the process, test TDF_SELECT and rescan file descriptors if * necessary. */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait(&selwait, &sellock, timo); @@ -274,9 +274,9 @@ retry: done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_noproclock: Index: netsmb/smb_trantcp.c =================================================================== RCS file: /usr/home/ncvs/src/sys/netsmb/smb_trantcp.c,v retrieving revision 1.24 diff -u -p -r1.24 smb_trantcp.c --- netsmb/smb_trantcp.c 3 Aug 2006 15:31:52 -0000 1.24 +++ netsmb/smb_trantcp.c 18 May 2007 10:37:02 -0000 @@ -115,9 +115,9 @@ nbssn_rselect(struct nbpcb *nbp, struct retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); /* XXX: Should be done when the thread is initialized. */ @@ -144,12 +144,12 @@ retry: * the process, test P_SELECT and rescan file descriptors if * necessary. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait(&selwait, &sellock, timo); @@ -161,9 +161,9 @@ retry: done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_noproclock: Index: pc98/pc98/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/pc98/pc98/machdep.c,v retrieving revision 1.393 diff -u -p -r1.393 machdep.c --- pc98/pc98/machdep.c 31 May 2007 22:52:13 -0000 1.393 +++ pc98/pc98/machdep.c 31 May 2007 20:40:24 -0000 @@ -1055,9 +1055,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * #ifdef SMP /* Schedule ourselves on the indicated cpu. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu_id); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. */ @@ -1068,9 +1068,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * intr_restore(reg); #ifdef SMP - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Index: powerpc/powerpc/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/powerpc/powerpc/vm_machdep.c,v retrieving revision 1.113 diff -u -p -r1.113 vm_machdep.c --- powerpc/powerpc/vm_machdep.c 28 Dec 2006 23:56:50 -0000 1.113 +++ powerpc/powerpc/vm_machdep.c 31 May 2007 21:25:28 -0000 @@ -154,7 +154,7 @@ cpu_fork(struct thread *td1, struct proc pcb->pcb_lr = (register_t)fork_trampoline; pcb->pcb_usr = kernel_pmap->pm_sr[USER_SR]; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_msr = PSL_KERNSET; @@ -327,7 +327,7 @@ cpu_set_upcall(struct thread *td, struct pcb2->pcb_lr = (register_t)fork_trampoline; pcb2->pcb_usr = kernel_pmap->pm_sr[USER_SR]; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). 
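Temporarily binding curthread to a CPU, as cpu_est_clockrate() does in the machdep hunks, is bracketed by the current thread's own lock instead of sched_lock. The same bracketing on its own; cpu is whatever CPU id the caller wants:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>

static void
run_on_cpu_sketch(int cpu)
{
        thread_lock(curthread);
        sched_bind(curthread, cpu);     /* may migrate us to 'cpu' */
        thread_unlock(curthread);

        /* ... per-CPU work, e.g. the delay calibration above ... */

        thread_lock(curthread);
        sched_unbind(curthread);
        thread_unlock(curthread);
}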
*/ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_msr = PSL_KERNSET; } Index: security/mac_lomac/mac_lomac.c =================================================================== RCS file: /usr/home/ncvs/src/sys/security/mac_lomac/mac_lomac.c,v retrieving revision 1.50 diff -u -p -r1.50 mac_lomac.c --- security/mac_lomac/mac_lomac.c 23 Apr 2007 13:36:53 -0000 1.50 +++ security/mac_lomac/mac_lomac.c 18 May 2007 10:37:02 -0000 @@ -536,10 +536,10 @@ maybe_demote(struct mac_lomac *subjlabel subj->mac_lomac.ml_rangelow = objlabel->ml_single; subj->mac_lomac.ml_rangehigh = objlabel->ml_single; subj->mac_lomac.ml_flags |= MAC_LOMAC_FLAG_UPDATE; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); curthread->td_flags |= TDF_ASTPENDING; curthread->td_proc->p_sflag |= PS_MACPEND; - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); /* * Avoid memory allocation while holding a mutex; cache the Index: sparc64/sparc64/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sparc64/sparc64/mp_machdep.c,v retrieving revision 1.34 diff -u -p -r1.34 mp_machdep.c --- sparc64/sparc64/mp_machdep.c 20 May 2007 14:49:01 -0000 1.34 +++ sparc64/sparc64/mp_machdep.c 23 May 2007 15:29:29 -0000 @@ -364,12 +364,8 @@ cpu_mp_bootstrap(struct pcpu *pc) while (csa->csa_count != 0) ; - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - spinlock_exit(); - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - cpu_throw(NULL, choosethread()); /* doesn't return */ + /* ok, now enter the scheduler */ + sched_throw(NULL); } void Index: sparc64/sparc64/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sparc64/sparc64/vm_machdep.c,v retrieving revision 1.74 diff -u -p -r1.74 vm_machdep.c --- sparc64/sparc64/vm_machdep.c 10 Jul 2005 23:31:11 -0000 1.74 +++ sparc64/sparc64/vm_machdep.c 31 May 2007 21:26:16 -0000 @@ -171,7 +171,7 @@ cpu_set_upcall(struct thread *td, struct pcb->pcb_pc = (u_long)fork_trampoline - 8; pcb->pcb_sp = (u_long)fr - SPOFF; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release the spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_pil = 0; } @@ -298,7 +298,7 @@ cpu_fork(struct thread *td1, struct proc pcb2->pcb_sp = (u_long)fp - SPOFF; pcb2->pcb_pc = (u_long)fork_trampoline - 8; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release the spin count in fork_exit(). 
*/ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_pil = 0; Index: sun4v/sun4v/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sun4v/sun4v/mp_machdep.c,v retrieving revision 1.6 diff -u -p -r1.6 mp_machdep.c --- sun4v/sun4v/mp_machdep.c 2 Feb 2007 05:00:21 -0000 1.6 +++ sun4v/sun4v/mp_machdep.c 31 May 2007 21:26:53 -0000 @@ -403,13 +403,8 @@ cpu_mp_bootstrap(struct pcpu *pc) while (csa->csa_count != 0) ; - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - spinlock_exit(); - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - - cpu_throw(NULL, choosethread()); /* doesn't return */ + /* ok, now enter the scheduler */ + sched_throw(NULL); } void @@ -460,13 +455,12 @@ cpu_ipi_preempt(struct trapframe *tf) { struct thread *running_thread = curthread; - mtx_lock_spin(&sched_lock); + thread_lock(running_thread); if (running_thread->td_critnest > 1) running_thread->td_owepreempt = 1; else mi_switch(SW_INVOL | SW_PREEMPT, NULL); - mtx_unlock_spin(&sched_lock); - + thread_unlock(running_thread); } void Index: sun4v/sun4v/trap.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sun4v/sun4v/trap.c,v retrieving revision 1.12 diff -u -p -r1.12 trap.c --- sun4v/sun4v/trap.c 25 May 2007 01:21:40 -0000 1.12 +++ sun4v/sun4v/trap.c 31 May 2007 21:27:07 -0000 @@ -712,6 +712,5 @@ syscall(struct trapframe *tf) WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); - mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } Index: sun4v/sun4v/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sun4v/sun4v/vm_machdep.c,v retrieving revision 1.5 diff -u -p -r1.5 vm_machdep.c --- sun4v/sun4v/vm_machdep.c 20 May 2007 13:06:45 -0000 1.5 +++ sun4v/sun4v/vm_machdep.c 31 May 2007 21:27:31 -0000 @@ -155,7 +155,7 @@ cpu_set_upcall(struct thread *td, struct pcb->pcb_pc = (u_long)fork_trampoline - 8; pcb->pcb_sp = (u_long)fr - SPOFF; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_pil = 0; } @@ -288,7 +288,7 @@ cpu_fork(struct thread *td1, struct proc pcb2->pcb_pc = (u_long)fork_trampoline - 8; pcb2->pcb_kstack = (uint64_t)(((char *)pcb2orig) - (CCFSZ + SPOFF)); - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). 
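Every MD AP bootstrap used to open-code the same hand-off into the scheduler; that boilerplate is now hidden behind sched_throw(NULL), newly declared in sys/sched.h below. Pieced together from the lines deleted in these bootstrap hunks, the helper presumably amounts to something close to the following; this is a reconstruction for review, not a quote of the new scheduler code:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>

static void
sched_throw_sketch(struct thread *td)           /* td == NULL on a starting AP */
{

        if (td == NULL) {
                mtx_lock_spin(&sched_lock);     /* borrow the idle context */
                spinlock_exit();                /* correct the nesting count */
        }
        PCPU_SET(switchtime, cpu_ticks());
        PCPU_SET(switchticks, ticks);
        cpu_throw(td, choosethread());          /* does not return */
}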
*/ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_pil = 0; Index: sys/mutex.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/mutex.h,v retrieving revision 1.95 diff -u -p -r1.95 mutex.h --- sys/mutex.h 11 Apr 2007 13:44:55 -0000 1.95 +++ sys/mutex.h 23 May 2007 19:46:08 -0000 @@ -125,6 +125,14 @@ void _mtx_unlock_spin_flags(struct mtx * #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void _mtx_assert(struct mtx *m, int what, const char *file, int line); #endif +void _thread_lock_flags(struct thread *, int, const char *, int); + +#define thread_lock(tdp) \ + _thread_lock_flags((tdp), 0, __FILE__, __LINE__) +#define thread_lock_flags(tdp, opt) \ + _thread_lock_flags((tdp), (opt), __FILE__, __LINE__) +#define thread_unlock(tdp) \ + mtx_unlock_spin(__DEVOLATILE(struct mtx *, (tdp)->td_lock)) /* * We define our machine-independent (unoptimized) mutex micro-operations @@ -349,6 +357,7 @@ extern struct mtx_pool *mtxpool_sleep; */ extern struct mtx sched_lock; extern struct mtx Giant; +extern struct mtx blocked_lock; /* * Giant lock manipulation and clean exit macros. Index: sys/proc.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/proc.h,v retrieving revision 1.478 diff -u -p -r1.478 proc.h --- sys/proc.h 1 Jun 2007 01:12:45 -0000 1.478 +++ sys/proc.h 1 Jun 2007 00:10:30 -0000 @@ -134,7 +134,7 @@ struct pargs { * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx - * j - locked by sched_lock mtx + * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * l - the attaching proc or attaching proc parent @@ -144,6 +144,7 @@ struct pargs { * p - select lock (sellock) * q - td_contested lock * r - p_peers lock + * t - thread lock * x - created at fork, only changes during single threading in exec * z - zombie threads lock * @@ -195,32 +196,19 @@ struct mqueue_notifier; * other than CPU cycles, which are parceled out to the threads. */ -/*************** - * Threads are the unit of execution - With a single run queue used by all processors: - - RUNQ: --->THREAD---THREAD--... SLEEPQ:[]---THREAD---THREAD---THREAD - []---THREAD - [] - []---THREAD---THREAD - -With PER-CPU run queues: -it gets more complicated. - * - *****************/ - /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { + volatile struct mtx *td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ /* The two queues below should someday be merged. */ - TAILQ_ENTRY(thread) td_slpq; /* (j) Sleep queue. */ - TAILQ_ENTRY(thread) td_lockq; /* (j) Lock queue. */ + TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ + TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ @@ -232,20 +220,20 @@ struct thread { /* Cleared during fork1() or thread_schedule_upcall(). */ #define td_startzero td_flags - int td_flags; /* (j) TDF_* flags. */ - int td_inhibitors; /* (j) Why can not run. */ + int td_flags; /* (t) TDF_* flags. */ + int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. 
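The sys/mutex.h hunk above is the heart of the change: thread_lock() acquires whatever spin lock td_lock currently points at, and thread_unlock() releases it through the same pointer, which is why td_lock is volatile in struct thread and why a blocked_lock placeholder is needed for the window while a thread is being handed from one lock to another (THREAD_LOCK_ASSERT below deliberately skips it). The mechanical conversion this enables, shown once:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static void
mark_resched_sketch(struct thread *td)
{
        thread_lock(td);                        /* was: mtx_lock_spin(&sched_lock) */
        td->td_flags |= TDF_NEEDRESCHED;        /* any (t)-classed field */
        thread_unlock(td);                      /* was: mtx_unlock_spin(&sched_lock) */
}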
XXX */ - int td_sqqueue; /* (j) Sleepqueue queue blocked on. */ - void *td_wchan; /* (j) Sleep address. */ - const char *td_wmesg; /* (j) Reason for sleep. */ - u_char td_lastcpu; /* (j) Last cpu we were on. */ - u_char td_oncpu; /* (j) Which cpu we are on. */ + int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ + void *td_wchan; /* (t) Sleep address. */ + const char *td_wmesg; /* (t) Reason for sleep. */ + u_char td_lastcpu; /* (t) Last cpu we were on. */ + u_char td_oncpu; /* (t) Which cpu we are on. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ short td_locks; /* (k) Count of non-spin locks. */ - u_char td_tsqueue; /* (j) Turnstile queue blocked on. */ - struct turnstile *td_blocked; /* (j) Lock thread is blocked on. */ - const char *td_lockname; /* (j) Name of lock blocked on. */ + u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ + struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ + const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ @@ -253,18 +241,18 @@ struct thread { struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct thread *td_standin; /* (k + a) Use this for an upcall. */ - struct kse_upcall *td_upcall; /* (k + j) Upcall structure. */ - u_int td_estcpu; /* (j) Sum of the same field in KSEs. */ - u_int td_slptime; /* (j) How long completely blocked. */ - struct rusage td_ru; /* (j) rusage information */ - uint64_t td_runtime; /* (j) How many cpu ticks we've run. */ - u_int td_pticks; /* (j) Statclock hits for profiling */ - u_int td_sticks; /* (j) Statclock hits in system mode. */ - u_int td_iticks; /* (j) Statclock hits in intr mode. */ - u_int td_uticks; /* (j) Statclock hits in user mode. */ + struct kse_upcall *td_upcall; /* (k + t) Upcall structure. */ + u_int td_estcpu; /* (t) estimated cpu utilization */ + u_int td_slptime; /* (t) How long completely blocked. */ + struct rusage td_ru; /* (t) rusage information */ + uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ + u_int td_pticks; /* (t) Statclock hits for profiling */ + u_int td_sticks; /* (t) Statclock hits in system mode. */ + u_int td_iticks; /* (t) Statclock hits in intr mode. */ + u_int td_uticks; /* (t) Statclock hits in user mode. */ u_int td_uuticks; /* (k) Statclock hits (usr), for UTS. */ u_int td_usticks; /* (k) Statclock hits (sys), for UTS. */ - int td_intrval; /* (j) Return value of TDF_INTERRUPT. */ + int td_intrval; /* (t) Return value of TDF_INTERRUPT. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ sigset_t td_sigmask; /* (c) Current signal mask. */ volatile u_int td_generation; /* (k) For detection of preemption */ @@ -278,11 +266,11 @@ struct thread { /* Copied during fork1() or thread_sched_upcall(). */ #define td_startcopy td_endzero - u_char td_base_pri; /* (j) Thread base kernel priority. */ - u_char td_priority; /* (j) Thread active priority. */ - u_char td_pri_class; /* (j) Scheduling class. */ - u_char td_user_pri; /* (j) User pri from estcpu and nice. */ - u_char td_base_user_pri; /* (j) Base user pri */ + u_char td_base_pri; /* (t) Thread base kernel priority. */ + u_char td_priority; /* (t) Thread active priority. */ + u_char td_pri_class; /* (t) Scheduling class. */ + u_char td_user_pri; /* (t) User pri from estcpu and nice. 
*/ + u_char td_base_user_pri; /* (t) Base user pri */ #define td_endcopy td_pcb /* @@ -296,7 +284,7 @@ struct thread { TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING - } td_state; + } td_state; /* (t) thread state */ register_t td_retval[2]; /* (k) Syscall aux returns. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ @@ -313,6 +301,16 @@ struct thread { int td_syscalls; /* per-thread syscall count (used by NFS :)) */ }; +struct mtx *thread_lock_block(struct thread *); +void thread_lock_unblock(struct thread *, struct mtx *); +void thread_lock_set(struct thread *, struct mtx *); +#define THREAD_LOCK_ASSERT(td, type) \ +do { \ + struct mtx *__m = __DEVOLATILE(struct mtx *, (td)->td_lock); \ + if (__m != &blocked_lock) \ + mtx_assert(__m, (type)); \ +} while (0) + /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. @@ -324,22 +322,22 @@ struct thread { #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_SELECT 0x00000040 /* Selecting; wakeup/waiting danger. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ -#define TDF_TSNOBLOCK 0x00000100 /* Don't block on a turnstile due to race. */ +#define TDF_UNUSEDx100 0x00000100 /* --available-- */ #define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ #define TDF_INTERRUPT 0x00002000 /* Thread is marked as interrupted. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ -#define TDF_UNUSED15 0x00008000 /* --available -- */ +#define TDF_UNUSED15 0x00008000 /* --available-- */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_XSIG 0x00040000 /* Thread is exchanging signal under trace */ #define TDF_UNUSED19 0x00080000 /* Thread is sleeping on a umtx. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_DBSUSPEND 0x00200000 /* Thread is suspended by debugger */ -#define TDF_UNUSED22 0x00400000 /* --available -- */ -#define TDF_UNUSED23 0x00800000 /* --available -- */ +#define TDF_UNUSED22 0x00400000 /* --available-- */ +#define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ @@ -482,7 +480,8 @@ struct rusage_ext { */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ - TAILQ_HEAD(, thread) p_threads; /* (j)(td_plist) Threads. (shortcut) */ + TAILQ_HEAD(, thread) p_threads; /* (j) all threads. */ + struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ @@ -491,7 +490,7 @@ struct proc { struct plimit *p_limit; /* (c) Process limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ - TAILQ_HEAD(, kse_upcall) p_upcalls; /* All upcalls in the proc. */ + TAILQ_HEAD(, kse_upcall) p_upcalls; /* (j) All upcalls in the proc. */ /* * The following don't make too much sense. 
@@ -504,7 +503,6 @@ struct proc { PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) S* process status. */ - pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ @@ -542,14 +540,12 @@ struct proc { struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (c) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ - int p_suspcount; /* (c) Num threads in suspended mode. */ + int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (c) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ /* from ksegrp */ - u_int p_estcpu; /* (j) Sum of the field in threads. */ - u_int p_slptime; /* (j) How long completely blocked. */ int p_numupcalls; /* (j) Num upcalls. */ int p_upsleeps; /* (c) Num threads in kse_release(). */ struct kse_thr_mailbox *p_completed; /* (c) Completed thread mboxes. */ @@ -592,6 +588,9 @@ struct proc { #define NOCPU 0xff /* For when we aren't on a CPU. */ +#define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) +#define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) +#define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ @@ -626,7 +625,7 @@ struct proc { #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) -/* These flags are kept in p_sflag and are protected with sched_lock. */ +/* These flags are kept in p_sflag and are protected with proc slock. */ #define PS_INMEM 0x00001 /* Loaded into memory. */ #define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */ #define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */ @@ -861,8 +860,8 @@ void stopevent(struct proc *, u_int, u_i void threadinit(void); void cpu_idle(void); extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */ -void cpu_switch(struct thread *old, struct thread *new); -void cpu_throw(struct thread *old, struct thread *new) __dead2; +void cpu_switch(struct thread *, struct thread *, struct mtx *); +void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); @@ -872,6 +871,7 @@ void cpu_fork(struct thread *, struct pr void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); /* New in KSE. 
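The PROC_SLOCK() macros above, together with the (j) key redefinition earlier in this header, give converted code a cheap way to document which callers must now hold the per-process spin lock, for instance around the suspension count that this patch moves from (c) to (j). A trivial example of the assertion style:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static int
suspended_count_sketch(struct proc *p)
{

        PROC_SLOCK_ASSERT(p, MA_OWNED);         /* p_suspcount is (j) now */
        return (p->p_suspcount);
}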
*/ +void kse_unlink(struct thread *); void kse_GC(void); void kseinit(void); void cpu_set_upcall(struct thread *td, struct thread *td0); @@ -900,6 +900,7 @@ void childproc_stopped(struct proc *chil void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); +void thread_suspend_switch(struct thread *); void thread_suspend_one(struct thread *td); struct thread *thread_switchout(struct thread *td, int flags, struct thread *newtd); Index: sys/resourcevar.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/resourcevar.h,v retrieving revision 1.50 diff -u -p -r1.50 resourcevar.h --- sys/resourcevar.h 1 Jun 2007 01:12:45 -0000 1.50 +++ sys/resourcevar.h 31 May 2007 21:32:58 -0000 @@ -47,7 +47,7 @@ * Locking key: * b - created at fork, never changes * c - locked by proc mtx - * j - locked by sched_lock mtx + * j - locked by proc slock * k - only accessed by curthread */ struct pstats { Index: sys/sched.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/sched.h,v retrieving revision 1.31 diff -u -p -r1.31 sched.h --- sys/sched.h 23 Jan 2007 08:46:50 -0000 1.31 +++ sys/sched.h 21 May 2007 23:41:12 -0000 @@ -81,6 +81,7 @@ int sched_runnable(void); */ void sched_exit(struct proc *p, struct thread *childtd); void sched_fork(struct thread *td, struct thread *childtd); +void sched_fork_exit(struct thread *td); /* * KSE Groups contain scheduling priority information. They record the @@ -101,6 +102,7 @@ fixpt_t sched_pctcpu(struct thread *td); void sched_prio(struct thread *td, u_char prio); void sched_sleep(struct thread *td); void sched_switch(struct thread *td, struct thread *newtd, int flags); +void sched_throw(struct thread *td); void sched_unlend_prio(struct thread *td, u_char prio); void sched_unlend_user_prio(struct thread *td, u_char pri); void sched_user_prio(struct thread *td, u_char prio); @@ -155,6 +157,20 @@ sched_unpin(void) #define SRQ_PREEMPTED 0x0008 /* has been preempted.. be kind */ #define SRQ_BORROWING 0x0010 /* Priority updated due to prio_lend */ +#define SCHED_STATS +/* Switch stats. 
*/ +#ifdef SCHED_STATS +extern long switch_preempt; +extern long switch_owepreempt; +extern long switch_turnstile; +extern long switch_sleepq; +extern long switch_sleepqtimo; +extern long switch_relinquish; +extern long switch_needresched; +#define SCHED_STAT_INC(var) atomic_add_long(&(var), 1) +#else +#define SCHED_STAT_INC(var) +#endif /* temporarily here */ void schedinit(void); @@ -162,7 +178,6 @@ void sched_init_concurrency(struct proc void sched_set_concurrency(struct proc *p, int cuncurrency); void sched_schedinit(void); void sched_newproc(struct proc *p, struct thread *td); -void sched_thread_exit(struct thread *td); void sched_newthread(struct thread *td); #endif /* _KERNEL */ Index: sys/turnstile.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/turnstile.h,v retrieving revision 1.11 diff -u -p -r1.11 turnstile.h --- sys/turnstile.h 18 Apr 2006 18:21:38 -0000 1.11 +++ sys/turnstile.h 18 May 2007 10:37:02 -0000 @@ -91,17 +91,19 @@ void init_turnstiles(void); void turnstile_adjust(struct thread *, u_char); struct turnstile *turnstile_alloc(void); void turnstile_broadcast(struct turnstile *, int); -void turnstile_claim(struct lock_object *); +void turnstile_cancel(struct turnstile *); +void turnstile_chain_lock(struct lock_object *); +void turnstile_chain_unlock(struct lock_object *); +void turnstile_claim(struct turnstile *); void turnstile_disown(struct turnstile *); int turnstile_empty(struct turnstile *ts, int queue); void turnstile_free(struct turnstile *); struct thread *turnstile_head(struct turnstile *, int); -void turnstile_lock(struct lock_object *); struct turnstile *turnstile_lookup(struct lock_object *); -void turnstile_release(struct lock_object *); int turnstile_signal(struct turnstile *, int); +struct turnstile *turnstile_trywait(struct lock_object *); void turnstile_unpend(struct turnstile *, int); -void turnstile_wait(struct lock_object *, struct thread *, int); +void turnstile_wait(struct turnstile *, struct thread *, int); #endif /* _KERNEL */ #endif /* _SYS_TURNSTILE_H_ */ Index: ufs/ffs/ffs_snapshot.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ufs/ffs/ffs_snapshot.c,v retrieving revision 1.135 diff -u -p -r1.135 ffs_snapshot.c --- ufs/ffs/ffs_snapshot.c 10 Apr 2007 09:31:42 -0000 1.135 +++ ufs/ffs/ffs_snapshot.c 18 May 2007 10:37:02 -0000 @@ -389,12 +389,15 @@ restart: * Recind nice scheduling while running with the filesystem suspended. */ if (td->td_proc->p_nice > 0) { - PROC_LOCK(td->td_proc); - mtx_lock_spin(&sched_lock); - saved_nice = td->td_proc->p_nice; - sched_nice(td->td_proc, 0); - mtx_unlock_spin(&sched_lock); - PROC_UNLOCK(td->td_proc); + struct proc *p; + + p = td->td_proc; + PROC_LOCK(p); + PROC_SLOCK(p); + saved_nice = p->p_nice; + sched_nice(p, 0); + PROC_SUNLOCK(p); + PROC_UNLOCK(p); } /* * Suspend operation on filesystem. 
@@ -809,10 +812,13 @@ done: out: NDFREE(&nd, NDF_ONLY_PNBUF); if (saved_nice > 0) { - PROC_LOCK(td->td_proc); - mtx_lock_spin(&sched_lock); + struct proc *p; + + p = td->td_proc; + PROC_LOCK(p); + PROC_SLOCK(p); sched_nice(td->td_proc, saved_nice); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); Index: vm/vm_glue.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_glue.c,v retrieving revision 1.222 diff -u -p -r1.222 vm_glue.c --- vm/vm_glue.c 1 Jun 2007 01:12:45 -0000 1.222 +++ vm/vm_glue.c 31 May 2007 20:40:26 -0000 @@ -619,24 +619,26 @@ faultin(p) * busy swapping it in. */ ++p->p_lock; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag |= PS_SWAPPINGIN; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag &= ~PS_SWAPPINGIN; p->p_sflag |= PS_INMEM; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) setrunnable(td); + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); wakeup(&p->p_sflag); @@ -672,9 +674,9 @@ scheduler(dummy) loop: if (vm_page_count_min()) { VM_WAIT; - mtx_lock_spin(&sched_lock); + thread_lock(&thread0); proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } @@ -685,13 +687,14 @@ loop: if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { continue; } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { /* * An otherwise runnable thread of a process * swapped out has only the TDI_SWAPPED bit set. * */ + thread_lock(td); if (td->td_inhibitors == TDI_SWAPPED) { pri = p->p_swtime + td->td_slptime; if ((p->p_sflag & PS_SWAPINREQ) == 0) { @@ -709,8 +712,9 @@ loop: ppri = pri; } } + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } sx_sunlock(&allproc_lock); @@ -718,13 +722,13 @@ loop: * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { - mtx_lock_spin(&sched_lock); + thread_lock(&thread0); if (!proc0_rescan) { TD_SET_IWAIT(&thread0); mi_switch(SW_VOL, NULL); } proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } PROC_LOCK(p); @@ -736,15 +740,15 @@ loop: */ if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { PROC_UNLOCK(p); - mtx_lock_spin(&sched_lock); + thread_lock(&thread0); proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag &= ~PS_SWAPINREQ; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); /* * We would like to bring someone in. (only if there is space). @@ -752,10 +756,12 @@ loop: */ faultin(p); PROC_UNLOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_swtime = 0; + PROC_SUNLOCK(p); + thread_lock(&thread0); proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } @@ -763,7 +769,8 @@ void kick_proc0(void) { struct thread *td = &thread0; - + /* XXX This will probably cause a LOR in some cases */ + thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR2(KTR_INTR, "%s: sched_add %d", __func__, 0); TD_CLR_IWAIT(td); @@ -773,6 +780,7 @@ void kick_proc0(void) CTR2(KTR_INTR, "%s: state %d", __func__, td->td_state); } + thread_unlock(td); } @@ -821,12 +829,12 @@ retry: * creation. It may have no * address space or lock yet. 
*/ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_state == PRS_NEW) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); continue; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); /* * An aio daemon switches its @@ -876,7 +884,7 @@ retry: break; case PRS_NORMAL: - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * do not swapout a realtime process * Check all the thread groups.. @@ -929,7 +937,7 @@ retry: (minslptime > swap_idle_threshold2))) { swapout(p); didswap++; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); vmspace_free(vm); @@ -937,7 +945,7 @@ retry: goto retry; } nextproc: - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } nextproc2: PROC_UNLOCK(p); @@ -962,7 +970,7 @@ swapout(p) struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); + mtx_assert(&p->p_slock, MA_OWNED | MA_NOTRECURSED); #if defined(SWAP_DEBUG) printf("swapping out %d\n", p->p_pid); #endif @@ -996,15 +1004,18 @@ swapout(p) p->p_sflag &= ~PS_INMEM; p->p_sflag |= PS_SWAPPINGOUT; PROC_UNLOCK(p); - FOREACH_THREAD_IN_PROC(p, td) + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); TD_SET_SWAPPED(td); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + } + PROC_SUNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapout(td); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag &= ~PS_SWAPPINGOUT; p->p_swtime = 0; } Index: vm/vm_meter.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_meter.c,v retrieving revision 1.93 diff -u -p -r1.93 vm_meter.c --- vm/vm_meter.c 31 May 2007 22:52:15 -0000 1.93 +++ vm/vm_meter.c 31 May 2007 20:40:26 -0000 @@ -131,17 +131,21 @@ vmtotal(SYSCTL_HANDLER_ARGS) FOREACH_PROC_IN_SYSTEM(p) { if (p->p_flag & P_SYSTEM) continue; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); switch (p->p_state) { case PRS_NEW: - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); continue; break; default: FOREACH_THREAD_IN_PROC(p, td) { /* Need new statistics XXX */ + thread_lock(td); switch (td->td_state) { case TDS_INHIBITED: + /* + * XXX stats no longer synchronized. + */ if (TD_ON_LOCK(td) || (td->td_inhibitors == TDI_SWAPPED)) { @@ -162,13 +166,15 @@ vmtotal(SYSCTL_HANDLER_ARGS) case TDS_RUNQ: case TDS_RUNNING: total.t_rq++; + thread_unlock(td); continue; default: break; } + thread_unlock(td); } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); /* * Note active objects. */ Index: vm/vm_pageout.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_pageout.c,v retrieving revision 1.281 diff -u -p -r1.281 vm_pageout.c --- vm/vm_pageout.c 31 May 2007 22:52:15 -0000 1.281 +++ vm/vm_pageout.c 31 May 2007 20:40:26 -0000 @@ -1246,22 +1246,24 @@ unlock_and_continue: * If the process is in a non-running type state, * don't touch it. Check all the threads individually. 
*/ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { + thread_unlock(td); breakout = 1; break; } + thread_unlock(td); } + PROC_SUNLOCK(p); if (breakout) { - mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); continue; } - mtx_unlock_spin(&sched_lock); /* * get the process size */ @@ -1287,9 +1289,9 @@ unlock_and_continue: sx_sunlock(&allproc_lock); if (bigproc != NULL) { killproc(bigproc, "out of swap space"); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(bigproc); sched_nice(bigproc, PRIO_MIN); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(bigproc); PROC_UNLOCK(bigproc); wakeup(&cnt.v_free_count); } @@ -1594,17 +1596,20 @@ vm_daemon() * if the process is in a non-running type state, * don't touch it. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { + thread_unlock(td); breakout = 1; break; } + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (breakout) { PROC_UNLOCK(p); continue; Index: vm/vm_zeroidle.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_zeroidle.c,v retrieving revision 1.46 diff -u -p -r1.46 vm_zeroidle.c --- vm/vm_zeroidle.c 31 May 2007 22:52:15 -0000 1.46 +++ vm/vm_zeroidle.c 31 May 2007 20:40:26 -0000 @@ -145,9 +145,9 @@ vm_pagezero(void __unused *arg) vm_page_zero_idle(); #ifndef PREEMPTION if (sched_runnable()) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } #endif } else { @@ -176,11 +176,11 @@ pagezero_start(void __unused *arg) PROC_LOCK(pagezero_proc); pagezero_proc->p_flag |= P_NOLOAD; PROC_UNLOCK(pagezero_proc); - mtx_lock_spin(&sched_lock); td = FIRST_THREAD_IN_PROC(pagezero_proc); + thread_lock(td); sched_class(td, PRI_IDLE); sched_prio(td, PRI_MAX_IDLE); sched_add(td, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } SYSINIT(pagezero, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, pagezero_start, NULL)
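
Usage note, not part of the diff: the VM conversions above all follow the same pattern. Fields keyed "(j)", such as p_sflag and p_suspcount, are now covered by the per-process spin lock through PROC_SLOCK()/PROC_SUNLOCK(), and per-thread state (td_state, TD_SET_SWAPPED(), setrunnable()) is only touched while that thread's lock is held through thread_lock()/thread_unlock(). A minimal sketch of the pattern, modelled on the faultin()/swapout() hunks; the helper name proc_set_swapped() is hypothetical and exists only for illustration:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

/* Illustrative only; mirrors the faultin()/swapout() conversions above. */
static void
proc_set_swapped(struct proc *p)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	PROC_SLOCK(p);			/* p_sflag is keyed "(j)": proc slock */
	p->p_sflag |= PS_SWAPPINGOUT;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);	/* per-thread state needs td_lock */
		TD_SET_SWAPPED(td);
		thread_unlock(td);
	}
	PROC_SUNLOCK(p);
}

The proc mutex (a sleep mutex) is taken before the proc spin lock and the thread locks, which matches the order the hunks above use.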
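
Also worth spelling out: the turnstile.h changes alter the calling protocol for lock implementations. turnstile_lock()/turnstile_release() are gone; reading the new prototypes, turnstile_trywait() locks the chain and hands back the turnstile to use, turnstile_cancel() backs out of that, and turnstile_claim()/turnstile_wait() now take the turnstile rather than the lock_object. A schematic sketch of the sequence a blocking lock would use, assuming that reading; struct my_lock and its my_lock_*() helpers are hypothetical placeholders, and TS_EXCLUSIVE_QUEUE is the existing queue constant from turnstile.h:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/turnstile.h>

/* Hypothetical lock type, used only for this sketch. */
struct my_lock {
	struct lock_object	lock_object;
	/* ... lock word, owner, etc. ... */
};

static int my_lock_try(struct my_lock *);		/* placeholder */
static int my_lock_is_free(struct my_lock *);		/* placeholder */
static struct thread *my_lock_owner(struct my_lock *);	/* placeholder */

static void
my_lock_acquire_slow(struct my_lock *lk)
{
	struct turnstile *ts;
	struct thread *owner;

	for (;;) {
		if (my_lock_try(lk))
			return;				/* fast path */
		/* Lock the chain and get the turnstile for this lock. */
		ts = turnstile_trywait(&lk->lock_object);
		if (my_lock_is_free(lk)) {
			/* Dropped while we were setting up; back out. */
			turnstile_cancel(ts);
			continue;
		}
		owner = my_lock_owner(lk);
		/* Block on the turnstile, lending priority to the owner. */
		turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE);
	}
}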
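
Finally, a small note on the SCHED_STATS block in sched.h: the counters are plain longs and SCHED_STAT_INC() expands to atomic_add_long(), so a call site needs no lock for the statistic itself. Illustrative only, with a hypothetical function name; exactly where each counter is bumped is the scheduler's business:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>

/* Illustrative only; follows the vm_pagezero() switch pattern above. */
static void
example_voluntary_switch(void)
{
	SCHED_STAT_INC(switch_relinquish);	/* atomic, no lock needed */
	thread_lock(curthread);
	mi_switch(SW_VOL, NULL);
	thread_unlock(curthread);
}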