? bugmagnet.diff ? cont.diff ? foo ? kris-contention.diff ? lock_prof.diff ? out ? sched_lock ? stack-07-20.diff ? stack2.diff ? sys.diff ? sysback.diff ? threadlock.diff ? threadlock2.diff ? tmp.diff ? tophalf.diff ? ule.diff ? amd64/compile ? amd64/amd64/switch.diff ? amd64/amd64/trace.diff ? kern/bak ? kern/pcpu ? kern/stats ? kern/throw ? kern/tophalf.diff ? modules/md/export_syms ? modules/md/geom_md.ko ? modules/md/opt_geom.h ? modules/md/opt_md.h ? modules/md/vnode_if.h ? modules/md/vnode_if_newproto.h ? modules/md/vnode_if_typedef.h ? nfsserver/nfs.diff ? sys/stats Index: amd64/amd64/genassym.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/genassym.c,v retrieving revision 1.160 diff -u -r1.160 genassym.c --- amd64/amd64/genassym.c 20 Dec 2006 04:40:38 -0000 1.160 +++ amd64/amd64/genassym.c 26 Feb 2007 07:19:42 -0000 @@ -76,6 +76,7 @@ ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); +ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); Index: amd64/amd64/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/machdep.c,v retrieving revision 1.669 diff -u -r1.669 machdep.c --- amd64/amd64/machdep.c 27 Jan 2007 18:13:24 -0000 1.669 +++ amd64/amd64/machdep.c 26 Feb 2007 07:19:42 -0000 @@ -460,9 +460,9 @@ #ifdef SMP /* Schedule ourselves on the indicated cpu. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu_id); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. 
*/ @@ -473,9 +473,9 @@ intr_restore(reg); #ifdef SMP - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Index: amd64/amd64/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/mp_machdep.c,v retrieving revision 1.281 diff -u -r1.281 mp_machdep.c --- amd64/amd64/mp_machdep.c 8 Feb 2007 16:49:58 -0000 1.281 +++ amd64/amd64/mp_machdep.c 26 Feb 2007 07:19:42 -0000 @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -587,25 +588,7 @@ while (smp_started == 0) ia32_pause(); - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - - /* - * Correct spinlock nesting. The idle thread context that we are - * borrowing was created so that it would start out with a single - * spin lock (sched_lock) held in fork_trampoline(). Since we've - * explicitly acquired locks in this function, the nesting count - * is now 2 rather than 1. Since we are nested, calling - * spinlock_exit() will simply adjust the counts without allowing - * spin lock using code to interrupt us. 
- */ - spinlock_exit(); - KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); - - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - - cpu_throw(NULL, choosethread()); /* doesn't return */ + sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ @@ -952,12 +935,12 @@ if (ipi_bitmap & (1 << IPI_PREEMPT)) { struct thread *running_thread = curthread; - mtx_lock_spin(&sched_lock); + thread_lock(running_thread); if (running_thread->td_critnest > 1) running_thread->td_owepreempt = 1; else mi_switch(SW_INVOL | SW_PREEMPT, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(running_thread); } /* Nothing to do for AST */ @@ -1141,11 +1124,9 @@ if (mp_ncpus == 1) return; - mtx_lock_spin(&sched_lock); atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); - mtx_unlock_spin(&sched_lock); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); Index: amd64/conf/GENERIC =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/conf/GENERIC,v retrieving revision 1.473 diff -u -r1.473 GENERIC --- amd64/conf/GENERIC 9 Feb 2007 19:03:17 -0000 1.473 +++ amd64/conf/GENERIC 26 Feb 2007 07:19:42 -0000 @@ -66,8 +66,9 @@ options GDB # Support remote GDB. 
options INVARIANTS # Enable calls of extra sanity checking options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -options WITNESS # Enable checks to detect deadlocks and cycles -options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed +#options WITNESS # Enable checks to detect deadlocks and cycles +#options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed +options PRINTF_BUFR_SIZE=128 # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel Index: amd64/linux32/linux32_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/linux32/linux32_machdep.c,v retrieving revision 1.33 diff -u -r1.33 linux32_machdep.c --- amd64/linux32/linux32_machdep.c 15 Feb 2007 01:20:43 -0000 1.33 +++ amd64/linux32/linux32_machdep.c 26 Feb 2007 07:19:43 -0000 @@ -481,10 +481,10 @@ td2 = FIRST_THREAD_IN_PROC(p2); /* make it run */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); return (0); } @@ -522,10 +522,10 @@ td2 = FIRST_THREAD_IN_PROC(p2); /* make it run */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); /* wait for the children to exit, ie. emulate vfork */ PROC_LOCK(p2); @@ -673,10 +673,10 @@ /* * Make this runnable after we are finished with it. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; Index: boot/i386/btx/btx/btx.S =================================================================== RCS file: /usr/home/ncvs/src/sys/boot/i386/btx/btx/btx.S,v retrieving revision 1.44 diff -u -r1.44 btx.S --- boot/i386/btx/btx/btx.S 6 Dec 2006 17:45:35 -0000 1.44 +++ boot/i386/btx/btx/btx.S 26 Feb 2007 07:19:47 -0000 @@ -905,7 +905,7 @@ addl %edx,%eax # + offset xchgl %eax,%esi # Set pointer dump.4: movb $2,%dl # Num lines -dump.4a: movb $0x10,%cl # Bytes to dump +dump.4a: movb $0x40,%cl # Bytes to dump dump.5: lodsb # Get byte and call hex8 # dump it decb %cl # Keep count Index: compat/linprocfs/linprocfs.c =================================================================== RCS file: /usr/home/ncvs/src/sys/compat/linprocfs/linprocfs.c,v retrieving revision 1.105 diff -u -r1.105 linprocfs.c --- compat/linprocfs/linprocfs.c 21 Jan 2007 13:18:52 -0000 1.105 +++ compat/linprocfs/linprocfs.c 26 Feb 2007 07:19:50 -0000 @@ -579,7 +579,7 @@ if (P_SHOULDSTOP(p)) { state = "T (stopped)"; } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); switch(p->p_state) { case PRS_NEW: state = "I (idle)"; @@ -609,7 +609,7 @@ state = "? (unknown)"; break; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } fill_kinfo_proc(p, &kp); Index: compat/ndis/subr_ntoskrnl.c =================================================================== RCS file: /usr/home/ncvs/src/sys/compat/ndis/subr_ntoskrnl.c,v retrieving revision 1.88 diff -u -r1.88 subr_ntoskrnl.c --- compat/ndis/subr_ntoskrnl.c 25 Dec 2006 17:04:41 -0000 1.88 +++ compat/ndis/subr_ntoskrnl.c 26 Feb 2007 07:19:51 -0000 @@ -3824,7 +3824,7 @@ * once scheduled by an ISR. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); #ifdef NTOSKRNL_MULTIPLE_DPCS #if __FreeBSD_version >= 502102 sched_bind(curthread, kq->kq_cpu); @@ -3834,7 +3834,7 @@ #if __FreeBSD_version < 600000 curthread->td_base_pri = PRI_MIN_KERN; #endif - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); while (1) { KeWaitForSingleObject(&kq->kq_proc, 0, 0, TRUE, NULL); Index: compat/svr4/svr4_misc.c =================================================================== RCS file: /usr/home/ncvs/src/sys/compat/svr4/svr4_misc.c,v retrieving revision 1.91 diff -u -r1.91 svr4_misc.c --- compat/svr4/svr4_misc.c 6 Nov 2006 13:41:50 -0000 1.91 +++ compat/svr4/svr4_misc.c 26 Feb 2007 07:19:51 -0000 @@ -1253,12 +1253,12 @@ * See if we have a stopped or continued process. * XXX: This duplicates the same code in kern_wait(). */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount == p->p_numthreads) && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || uap->options & SVR4_WSTOPPED)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (((uap->options & SVR4_WNOWAIT)) == 0) p->p_flag |= P_WAITED; sx_sunlock(&proctree_lock); @@ -1278,7 +1278,7 @@ DPRINTF(("jobcontrol %d\n", pid)); return (svr4_setinfo(pid, &ru, status, uap->info)); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (uap->options & SVR4_WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_sunlock(&proctree_lock); Index: dev/hwpmc/hwpmc_mod.c =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/hwpmc/hwpmc_mod.c,v retrieving revision 1.26 diff -u -r1.26 hwpmc_mod.c --- dev/hwpmc/hwpmc_mod.c 6 Nov 2006 13:41:53 -0000 1.26 +++ dev/hwpmc/hwpmc_mod.c 26 Feb 2007 07:20:36 -0000 @@ -584,10 +584,10 @@ pmc_save_cpu_binding(struct pmc_binding *pb) { PMCDBG(CPU,BND,2, "%s", "save-cpu"); - mtx_lock_spin(&sched_lock); + thread_lock(curthread); pb->pb_bound = sched_is_bound(curthread); pb->pb_cpu = curthread->td_oncpu; 
- mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); PMCDBG(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu); } @@ -600,12 +600,12 @@ { PMCDBG(CPU,BND,2, "restore-cpu curcpu=%d restore=%d", curthread->td_oncpu, pb->pb_cpu); - mtx_lock_spin(&sched_lock); + thread_lock(curthread); if (pb->pb_bound) sched_bind(curthread, pb->pb_cpu); else sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); PMCDBG(CPU,BND,2, "%s", "restore-cpu done"); } @@ -624,9 +624,9 @@ "disabled CPU %d", __LINE__, cpu)); PMCDBG(CPU,SEL,2, "select-cpu cpu=%d", cpu); - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); KASSERT(curthread->td_oncpu == cpu, ("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__, Index: dev/md/md.c =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/md/md.c,v retrieving revision 1.167 diff -u -r1.167 md.c --- dev/md/md.c 14 Dec 2006 11:34:07 -0000 1.167 +++ dev/md/md.c 26 Feb 2007 07:20:43 -0000 @@ -690,9 +690,9 @@ int error; sc = arg; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); if (sc->type == MD_VNODE) curthread->td_pflags |= TDP_NORUNNINGBUF; Index: dev/syscons/syscons.h =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/syscons/syscons.h,v retrieving revision 1.87 diff -u -r1.87 syscons.h --- dev/syscons/syscons.h 13 Sep 2006 15:48:15 -0000 1.87 +++ dev/syscons/syscons.h 26 Feb 2007 07:20:55 -0000 @@ -536,7 +536,7 @@ (*kbdsw[(kbd)->kb_index]->poll)((kbd), (on)) #define SC_VIDEO_LOCKINIT(sc) \ - mtx_init(&(sc)->video_mtx, "syscons video lock", NULL,MTX_SPIN); + mtx_init(&(sc)->video_mtx, "syscons video lock", NULL,MTX_QUIET|MTX_SPIN); #define SC_VIDEO_LOCK(sc) \ do { \ if (!cold) \ Index: fs/procfs/procfs_ctl.c 
=================================================================== RCS file: /usr/home/ncvs/src/sys/fs/procfs/procfs_ctl.c,v retrieving revision 1.55 diff -u -r1.55 procfs_ctl.c --- fs/procfs/procfs_ctl.c 22 Feb 2006 17:20:37 -0000 1.55 +++ fs/procfs/procfs_ctl.c 26 Feb 2007 07:21:06 -0000 @@ -286,9 +286,9 @@ panic("procfs_control"); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_unsuspend(p); /* If it can run, let it do so. */ - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (0); } @@ -344,9 +344,9 @@ #endif /* XXXKSE: */ p->p_flag &= ~P_STOPPED_SIG; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } else psignal(p, nm->nm_val); PROC_UNLOCK(p); Index: fs/procfs/procfs_ioctl.c =================================================================== RCS file: /usr/home/ncvs/src/sys/fs/procfs/procfs_ioctl.c,v retrieving revision 1.16 diff -u -r1.16 procfs_ioctl.c --- fs/procfs/procfs_ioctl.c 19 Feb 2007 13:04:25 -0000 1.16 +++ fs/procfs/procfs_ioctl.c 26 Feb 2007 07:21:06 -0000 @@ -178,9 +178,9 @@ if (P_SHOULDSTOP(p)) { p->p_xstat = sig; p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } else if (sig) psignal(p, sig); #else Index: fs/procfs/procfs_status.c =================================================================== RCS file: /usr/home/ncvs/src/sys/fs/procfs/procfs_status.c,v retrieving revision 1.59 diff -u -r1.59 procfs_status.c --- fs/procfs/procfs_status.c 6 Dec 2006 06:34:54 -0000 1.59 +++ fs/procfs/procfs_status.c 26 Feb 2007 07:21:06 -0000 @@ -112,7 +112,7 @@ sbuf_printf(sb, "noflags"); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); #ifdef KSE if (p->p_flag & P_SA) wmesg = "-kse- "; @@ -127,7 +127,7 @@ } else wmesg = "nochan"; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (p->p_sflag & PS_INMEM) { struct timeval start, ut, st; Index: 
geom/geom_kern.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/geom_kern.c,v retrieving revision 1.40 diff -u -r1.40 geom_kern.c --- geom/geom_kern.c 25 Nov 2005 10:09:30 -0000 1.40 +++ geom/geom_kern.c 26 Feb 2007 07:21:07 -0000 @@ -88,9 +88,9 @@ struct thread *tp = FIRST_THREAD_IN_PROC(p); mtx_assert(&Giant, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); + thread_lock(tp); sched_prio(tp, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(tp); for(;;) { g_io_schedule_up(tp); } @@ -111,9 +111,9 @@ struct thread *tp = FIRST_THREAD_IN_PROC(p); mtx_assert(&Giant, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); + thread_lock(tp); sched_prio(tp, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(tp); for(;;) { g_io_schedule_down(tp); } @@ -134,9 +134,9 @@ struct thread *tp = FIRST_THREAD_IN_PROC(p); mtx_assert(&Giant, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); + thread_lock(tp); sched_prio(tp, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(tp); for(;;) { g_run_events(); tsleep(&g_wait_event, PRIBIO, "-", hz/10); Index: geom/eli/g_eli.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/eli/g_eli.c,v retrieving revision 1.34 diff -u -r1.34 g_eli.c --- geom/eli/g_eli.c 28 Jan 2007 20:29:12 -0000 1.34 +++ geom/eli/g_eli.c 26 Feb 2007 07:21:07 -0000 @@ -331,11 +331,11 @@ tsleep(wr, 0, "geli:smp", hz / 4); } #endif - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); if (sc->sc_crypto == G_ELI_CRYPTO_SW && g_eli_threads == 0) sched_bind(curthread, wr->w_number); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); G_ELI_DEBUG(1, "Thread %s started.", curthread->td_proc->p_comm); Index: geom/journal/g_journal.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/journal/g_journal.c,v retrieving revision 1.9 diff -u -r1.9 g_journal.c --- 
geom/journal/g_journal.c 2 Dec 2006 09:10:29 -0000 1.9 +++ geom/journal/g_journal.c 26 Feb 2007 07:21:07 -0000 @@ -2057,9 +2057,9 @@ time_t last_write; int type; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); sc = arg; type = 0; /* gcc */ Index: geom/mirror/g_mirror.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/mirror/g_mirror.c,v retrieving revision 1.92 diff -u -r1.92 g_mirror.c --- geom/mirror/g_mirror.c 1 Nov 2006 22:51:49 -0000 1.92 +++ geom/mirror/g_mirror.c 26 Feb 2007 07:21:07 -0000 @@ -1768,9 +1768,9 @@ int timeout; sc = arg; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { Index: geom/raid3/g_raid3.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/raid3/g_raid3.c,v retrieving revision 1.80 diff -u -r1.80 g_raid3.c --- geom/raid3/g_raid3.c 1 Nov 2006 22:51:49 -0000 1.80 +++ geom/raid3/g_raid3.c 26 Feb 2007 07:21:08 -0000 @@ -2017,9 +2017,9 @@ int timeout; sc = arg; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { Index: i386/i386/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/i386/machdep.c,v retrieving revision 1.648 diff -u -r1.648 machdep.c --- i386/i386/machdep.c 23 Jan 2007 08:01:19 -0000 1.648 +++ i386/i386/machdep.c 26 Feb 2007 07:21:13 -0000 @@ -1058,9 +1058,9 @@ #ifdef SMP /* Schedule ourselves on the indicated cpu. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu_id); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. */ @@ -1071,9 +1071,9 @@ intr_restore(reg); #ifdef SMP - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Index: i386/i386/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/i386/mp_machdep.c,v retrieving revision 1.274 diff -u -r1.274 mp_machdep.c --- i386/i386/mp_machdep.c 8 Feb 2007 16:49:59 -0000 1.274 +++ i386/i386/mp_machdep.c 26 Feb 2007 07:21:13 -0000 @@ -61,6 +61,7 @@ #include #include #include +#include #include #include @@ -629,25 +630,8 @@ while (smp_started == 0) ia32_pause(); - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - - /* - * Correct spinlock nesting. The idle thread context that we are - * borrowing was created so that it would start out with a single - * spin lock (sched_lock) held in fork_trampoline(). Since we've - * explicitly acquired locks in this function, the nesting count - * is now 2 rather than 1. Since we are nested, calling - * spinlock_exit() will simply adjust the counts without allowing - * spin lock using code to interrupt us. 
- */ - spinlock_exit(); - KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); - - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - - cpu_throw(NULL, choosethread()); /* doesn't return */ + /* enter the scheduler */ + sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ @@ -1148,12 +1132,12 @@ #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif - mtx_lock_spin(&sched_lock); + thread_lock(running_thread); if (running_thread->td_critnest > 1) running_thread->td_owepreempt = 1; else mi_switch(SW_INVOL | SW_PREEMPT, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(running_thread); } if (ipi_bitmap & (1 << IPI_AST)) { @@ -1342,11 +1326,9 @@ if (mp_ncpus == 1) return; - mtx_lock_spin(&sched_lock); atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); - mtx_unlock_spin(&sched_lock); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); Index: i386/isa/npx.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/isa/npx.c,v retrieving revision 1.171 diff -u -r1.171 npx.c --- i386/isa/npx.c 23 Feb 2007 12:19:00 -0000 1.171 +++ i386/isa/npx.c 26 Feb 2007 07:21:15 -0000 @@ -230,9 +230,9 @@ td = PCPU_GET(fpcurthread); if (td != NULL) { td->td_pcb->pcb_flags |= PCB_NPXTRAP; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } return (FILTER_HANDLED); } Index: i386/linux/linux_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/linux/linux_machdep.c,v retrieving revision 1.71 diff -u -r1.71 linux_machdep.c --- i386/linux/linux_machdep.c 23 Feb 2007 22:39:26 -0000 1.71 +++ i386/linux/linux_machdep.c 26 Feb 2007 07:21:15 -0000 @@ -325,10 +325,10 @@ /* * Make this runnable after we are finished with it. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); return (0); } @@ -368,10 +368,10 @@ /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); /* wait for the children to exit, ie. emulate vfork */ PROC_LOCK(p2); @@ -568,10 +568,10 @@ /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; Index: kern/init_main.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/init_main.c,v retrieving revision 1.273 diff -u -r1.273 init_main.c --- kern/init_main.c 23 Jan 2007 08:46:50 -0000 1.273 +++ kern/init_main.c 26 Feb 2007 07:21:18 -0000 @@ -712,9 +712,9 @@ PROC_UNLOCK(initproc); crfree(oldcred); cred_update_thread(FIRST_THREAD_IN_PROC(initproc)); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(initproc); initproc->p_sflag |= PS_INMEM; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(initproc); cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) @@ -728,9 +728,9 @@ struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); - mtx_lock_spin(&sched_lock); + thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) Index: kern/kern_acct.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_acct.c,v retrieving revision 1.86 diff -u -r1.86 kern_acct.c --- kern/kern_acct.c 8 Jan 2007 20:35:13 -0000 
1.86 +++ kern/kern_acct.c 26 Feb 2007 07:21:18 -0000 @@ -540,9 +540,9 @@ /* This is a low-priority kernel thread. */ pri = PRI_MAX_KERN; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); /* If another accounting kthread is already running, just die. */ sx_xlock(&acct_sx); Index: kern/kern_clock.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_clock.c,v retrieving revision 1.193 diff -u -r1.193 kern_clock.c --- kern/kern_clock.c 15 Dec 2006 21:44:49 -0000 1.193 +++ kern/kern_clock.c 26 Feb 2007 07:21:19 -0000 @@ -458,8 +458,9 @@ } CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d", td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz); - + thread_lock_flags(td, MTX_QUIET); sched_clock(td); + thread_unlock(td); /* Update resource usage integrals and maximums. */ MPASS(p->p_stats != NULL); Index: kern/kern_condvar.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_condvar.c,v retrieving revision 1.57 diff -u -r1.57 kern_condvar.c --- kern/kern_condvar.c 16 Dec 2006 06:54:08 -0000 1.57 +++ kern/kern_condvar.c 26 Feb 2007 07:21:19 -0000 @@ -340,8 +340,8 @@ if (cvp->cv_waiters > 0) { cvp->cv_waiters--; sleepq_signal(cvp, SLEEPQ_CONDVAR, -1, 0); - } else - sleepq_release(cvp); + } + sleepq_release(cvp); } /* Index: kern/kern_cpu.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_cpu.c,v retrieving revision 1.23 diff -u -r1.23 kern_cpu.c --- kern/kern_cpu.c 3 Mar 2006 02:06:04 -0000 1.23 +++ kern/kern_cpu.c 26 Feb 2007 07:21:19 -0000 @@ -300,17 +300,17 @@ cpu_id = PCPU_GET(cpuid); pc = cpu_get_pcpu(set->dev); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); - mtx_unlock_spin(&sched_lock); + 
thread_unlock(curthread); } CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq, device_get_nameunit(set->dev), PCPU_GET(cpuid)); error = CPUFREQ_DRV_SET(set->dev, set); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } if (error) { goto out; @@ -329,17 +329,17 @@ cpu_id = PCPU_GET(cpuid); pc = cpu_get_pcpu(set->dev); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq, device_get_nameunit(set->dev), PCPU_GET(cpuid)); error = CPUFREQ_DRV_SET(set->dev, set); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } if (error) { /* XXX Back out any successful setting? */ Index: kern/kern_exit.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_exit.c,v retrieving revision 1.294 diff -u -r1.294 kern_exit.c --- kern/kern_exit.c 25 Oct 2006 06:18:04 -0000 1.294 +++ kern/kern_exit.c 26 Feb 2007 07:21:19 -0000 @@ -524,11 +524,10 @@ * proc lock. */ wakeup(p->p_pptr); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; - PROC_UNLOCK(p->p_pptr); - sched_exit(p->p_pptr, td); + PROC_UNLOCK(p->p_pptr); /* * Hopefully no one will try to deliver a signal to the process this @@ -729,9 +728,10 @@ * still hold sched_lock, so simply by acquiring * sched_lock once we will wait long enough for the * thread to exit in that case. + * XXX This is questionable. 
*/ - mtx_lock_spin(&sched_lock); - mtx_unlock_spin(&sched_lock); + PROC_SLOCK(p); + PROC_SUNLOCK(p); td->td_retval[0] = p->p_pid; if (status) @@ -828,12 +828,12 @@ sx_xunlock(&allproc_lock); return (0); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount == p->p_numthreads) && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || options & WUNTRACED)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; @@ -847,7 +847,7 @@ return (0); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; Index: kern/kern_fork.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_fork.c,v retrieving revision 1.266 diff -u -r1.266 kern_fork.c --- kern/kern_fork.c 23 Jan 2007 08:46:50 -0000 1.266 +++ kern/kern_fork.c 26 Feb 2007 07:21:19 -0000 @@ -418,8 +418,15 @@ lastpid = trypid; p2 = newproc; + td2 = FIRST_THREAD_IN_PROC(newproc); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; + /* + * Allow the scheduler to initialize the child. + */ + thread_lock(td); + sched_fork(td, td2); + thread_unlock(td); AUDIT_ARG(pid, p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); @@ -473,8 +480,6 @@ * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ - td2 = FIRST_THREAD_IN_PROC(p2); - /* Allocate and switch to an alternate kstack if specified. 
*/ if (pages != 0) vm_thread_new_altkstack(td2, pages); @@ -502,15 +507,10 @@ p2->p_flag = 0; if (p1->p_flag & P_PROFIL) startprofclock(p2); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p2); p2->p_sflag = PS_INMEM; - /* - * Allow the scheduler to adjust the priority of the child and - * parent while we hold the sched_lock. - */ - sched_fork(td, td2); + PROC_SUNLOCK(p2); - mtx_unlock_spin(&sched_lock); p2->p_ucred = crhold(td->td_ucred); td2->td_ucred = crhold(p2->p_ucred); #ifdef AUDIT @@ -695,18 +695,20 @@ * Set the child start time and mark the process as being complete. */ microuptime(&p2->p_stats->p_start); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; + PROC_SUNLOCK(p2); /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ if ((flags & RFSTOPPED) == 0) { + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); + thread_unlock(td2); } - mtx_unlock_spin(&sched_lock); /* * Now can be swapped. @@ -780,33 +782,14 @@ struct proc *p; struct thread *td; - /* - * Finish setting up thread glue so that it begins execution in a - * non-nested critical section with sched_lock held but not recursed. - */ td = curthread; p = td->td_proc; - td->td_oncpu = PCPU_GET(cpuid); KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); - sched_lock.mtx_lock = (uintptr_t)td; - mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)", td, td->td_sched, p->p_pid, p->p_comm); - /* - * Processes normally resume in mi_switch() after being - * cpu_switch()'ed to, but when children start up they arrive here - * instead, so we must do much the same things as mi_switch() would. 
- */ - - if ((td = PCPU_GET(deadthread))) { - PCPU_SET(deadthread, NULL); - thread_stash(td); - } - td = curthread; - mtx_unlock_spin(&sched_lock); - + sched_fork_exit(td); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. Index: kern/kern_idle.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_idle.c,v retrieving revision 1.47 diff -u -r1.47 kern_idle.c --- kern/kern_idle.c 23 Jan 2007 08:46:50 -0000 1.47 +++ kern/kern_idle.c 26 Feb 2007 07:21:19 -0000 @@ -73,13 +73,13 @@ PROC_LOCK(p); p->p_flag |= P_NOLOAD; - mtx_lock_spin(&sched_lock); td = FIRST_THREAD_IN_PROC(p); + thread_lock(td); TD_SET_CAN_RUN(td); td->td_flags |= TDF_IDLETD; sched_class(td, PRI_IDLE); sched_prio(td, PRI_MAX_IDLE); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_UNLOCK(p); #ifdef SMP } Index: kern/kern_intr.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_intr.c,v retrieving revision 1.140 diff -u -r1.140 kern_intr.c --- kern/kern_intr.c 23 Feb 2007 12:19:01 -0000 1.140 +++ kern/kern_intr.c 26 Feb 2007 07:21:19 -0000 @@ -163,9 +163,9 @@ /* Update name and priority. 
*/ strlcpy(td->td_proc->p_comm, ie->ie_fullname, sizeof(td->td_proc->p_comm)); - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -296,10 +296,10 @@ if (error) panic("kthread_create() failed with %d", error); td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); @@ -313,13 +313,13 @@ CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name); td = ithread->it_thread; - mtx_lock_spin(&sched_lock); + thread_lock(td); ithread->it_flags |= IT_DEAD; if (TD_AWAITING_INTR(td)) { TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } int @@ -465,7 +465,7 @@ * so we have to remove the handler here rather than letting the * thread do it. */ - mtx_lock_spin(&sched_lock); + thread_lock(ie->ie_thread->it_thread); if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; @@ -477,7 +477,7 @@ ie->ie_thread->it_need = 1; } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); - mtx_unlock_spin(&sched_lock); + thread_unlock(ie->ie_thread->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); intr_event_update(ie); @@ -546,7 +546,7 @@ * put this thread on the runqueue. */ it->it_need = 1; - mtx_lock_spin(&sched_lock); + thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, p->p_comm); @@ -556,7 +556,7 @@ CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, p->p_comm, it->it_need, td->td_state); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); return (0); } @@ -768,13 +768,13 @@ * lock. 
This may take a while and it_need may get * set again, so we have to check it again. */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL, NULL); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } Index: kern/kern_kse.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_kse.c,v retrieving revision 1.227 diff -u -r1.227 kern_kse.c --- kern/kern_kse.c 23 Jan 2007 08:46:50 -0000 1.227 +++ kern/kern_kse.c 26 Feb 2007 07:21:19 -0000 @@ -57,7 +57,7 @@ extern int max_threads_per_proc; extern int max_groups_per_proc; extern int max_threads_hits; -extern struct mtx kse_zombie_lock; +extern struct mtx kse_lock; TAILQ_HEAD(, kse_upcall) zombie_upcalls = @@ -66,6 +66,9 @@ static int thread_update_usr_ticks(struct thread *td); static void thread_alloc_spare(struct thread *td); +struct mtx kse_lock; +MTX_SYSINIT(kse_lock, &kse_lock, "kse lock", MTX_SPIN); + struct kse_upcall * upcall_alloc(void) { @@ -86,7 +89,7 @@ upcall_link(struct kse_upcall *ku, struct proc *p) { - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); TAILQ_INSERT_TAIL(&p->p_upcalls, ku, ku_link); ku->ku_proc = p; p->p_numupcalls++; @@ -97,7 +100,7 @@ { struct proc *p = ku->ku_proc; - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); KASSERT(ku->ku_owner == NULL, ("%s: have owner", __func__)); TAILQ_REMOVE(&p->p_upcalls, ku, ku_link); p->p_numupcalls--; @@ -108,7 +111,7 @@ upcall_remove(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED); if (td->td_upcall != NULL) { td->td_upcall->ku_owner = NULL; upcall_unlink(td->td_upcall); @@ -124,6 +127,14 @@ }; #endif +void +kse_unlink(struct thread *td) +{ + mtx_lock_spin(&kse_lock); + thread_unlink(td); + mtx_unlock_spin(&kse_lock); +} + int kse_switchin(struct thread *td, struct kse_switchin_args *uap) { @@ 
-156,11 +167,11 @@ else ptrace_clear_single_step(td); if (tmbx.tm_dflags & TMDF_SUSPEND) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(td->td_proc); /* fuword can block, check again */ if (td->td_upcall) ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(td->td_proc); } _PRELE(td->td_proc); } @@ -204,23 +215,25 @@ case KSE_INTR_INTERRUPT: case KSE_INTR_RESTART: PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_mailbox == uap->tmbx) break; } if (td2 == NULL) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (ESRCH); } + thread_lock(td2); + PROC_SUNLOCK(p); if (uap->cmd == KSE_INTR_SENDSIG) { if (uap->data > 0) { td2->td_flags &= ~TDF_INTERRUPT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); tdsignal(p, td2, (int)uap->data, NULL); } else { - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); } } else { td2->td_flags |= TDF_INTERRUPT | TDF_ASTPENDING; @@ -232,7 +245,7 @@ td2->td_intrval = ERESTART; if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) sleepq_abort(td2, td2->td_intrval); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); } PROC_UNLOCK(p); break; @@ -257,12 +270,14 @@ if (!(flags & TMDF_SUSPEND)) break; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_stopped(p); - thread_suspend_one(td); PROC_UNLOCK(p); + thread_lock(td); + thread_suspend_one(td); + PROC_SUNLOCK(p); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } return (0); @@ -324,19 +339,19 @@ * ( or similar) and wait in the kernel to be needed. 
*/ PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_UPCALL_IN_PROC(p, ku2) { if (ku2->ku_flags & KUF_EXITING) count++; } if ((p->p_numupcalls - count) == 1 && (p->p_numthreads > 1)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (EDEADLK); } ku->ku_flags |= KUF_EXITING; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); /* @@ -352,7 +367,7 @@ if (error) psignal(p, SIGSEGV); sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); upcall_remove(td); if (p->p_numthreads != 1) { thread_stopped(p); @@ -370,7 +385,7 @@ * The other possibility would be to let the process exit. */ thread_unthread(td); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); #if 0 return (0); @@ -452,9 +467,9 @@ PROC_UNLOCK(p); } if (ku->ku_flags & KUF_DOUPCALL) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); ku->ku_flags &= ~KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } return (0); #else /* !KSE */ @@ -480,7 +495,7 @@ if (!(p->p_flag & P_SA)) return (EINVAL); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->mbx) { FOREACH_UPCALL_IN_PROC(p, ku) { if (ku->ku_mailbox == uap->mbx) @@ -488,7 +503,7 @@ } } else { if (p->p_upsleeps) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); wakeup(&p->p_completed); PROC_UNLOCK(p); return (0); @@ -496,15 +511,14 @@ ku = TAILQ_FIRST(&p->p_upcalls); } if (ku == NULL) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (ESRCH); } if ((td2 = ku->ku_owner) == NULL) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); panic("%s: no owner", __func__); } else if (td2->td_kflags & (TDK_KSEREL | TDK_KSERELSIG)) { - mtx_unlock_spin(&sched_lock); if (!(td2->td_kflags & TDK_WAKEUP)) { td2->td_kflags |= TDK_WAKEUP; if (td2->td_kflags & TDK_KSEREL) @@ -514,8 +528,8 @@ } } else { ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); } + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (0); #else /* !KSE */ 
@@ -621,10 +635,10 @@ thread_alloc_spare(td); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (sa) { if( p->p_numupcalls >= ncpus) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); upcall_free(newku); return (EPROCLIM); @@ -659,6 +673,7 @@ * Each upcall structure has an owner thread, find which * one owns it. */ + thread_lock(td); if (uap->newgroup) { /* * The newgroup parameter now means @@ -685,7 +700,8 @@ newtd = thread_schedule_upcall(td, newku); } } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); /* * Let the UTS instance know its LWPID. @@ -709,9 +725,9 @@ * If we are starting a new thread, kick it off. */ if (newtd != td) { - mtx_lock_spin(&sched_lock); + thread_lock(newtd); sched_add(newtd, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(newtd); } } else { newtd->td_pflags &= ~TDP_SA; @@ -744,9 +760,9 @@ _PRELE(p); } PROC_UNLOCK(p); - mtx_lock_spin(&sched_lock); + thread_lock(newtd); sched_add(newtd, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(newtd); } } return (0); @@ -774,9 +790,9 @@ void upcall_stash(struct kse_upcall *ku) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&kse_lock); TAILQ_INSERT_HEAD(&zombie_upcalls, ku, ku_link); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&kse_lock); } /* @@ -792,11 +808,11 @@ * we really don't care about the next instant.. 
*/ if (!TAILQ_EMPTY(&zombie_upcalls)) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&kse_lock); ku_first = TAILQ_FIRST(&zombie_upcalls); if (ku_first) TAILQ_INIT(&zombie_upcalls); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&kse_lock); while (ku_first) { ku_next = TAILQ_NEXT(ku_first, ku_link); upcall_free(ku_first); @@ -828,9 +844,9 @@ */ PROC_LOCK(p); if (td->td_flags & TDF_NEEDSIGCHK) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_NEEDSIGCHK; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) postsig(sig); @@ -931,9 +947,9 @@ return (0); if (user) { /* Current always do via ast() */ - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_uuticks++; } else if (td->td_mailbox != NULL) td->td_usticks++; @@ -976,7 +992,7 @@ /* * This function is intended to be used to initialize a spare thread - * for upcall. Initialize thread's large data area outside sched_lock + * for upcall. Initialize thread's large data area outside the thread lock * for thread_schedule_upcall(). The crhold is also here to get it out * from the schedlock as it has a mutex op itself. * XXX BUG.. 
we need to get the cr ref after the thread has @@ -1006,7 +1022,7 @@ { struct thread *td2; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Schedule an upcall thread on specified kse_upcall, @@ -1028,7 +1044,10 @@ */ bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); + sched_fork_thread(td, td2); + mtx_lock_spin(&kse_lock); thread_link(td2, ku->ku_proc); + mtx_unlock_spin(&kse_lock); /* inherit parts of blocked thread's context as a good template */ cpu_set_upcall(td2, td); /* Let the new thread become owner of the upcall */ @@ -1040,7 +1059,6 @@ td2->td_inhibitors = 0; SIGFILLSET(td2->td_sigmask); SIG_CANTMASK(td2->td_sigmask); - sched_fork_thread(td, td2); return (td2); /* bogus.. should be a void function */ } @@ -1079,7 +1097,7 @@ struct kse_upcall *ku; struct thread *td2; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If the outgoing thread is in threaded group and has never @@ -1111,7 +1129,9 @@ td->td_pflags &= ~TDP_CAN_UNBIND; td2 = thread_schedule_upcall(td, ku); if (flags & SW_INVOL || nextthread) { + thread_lock(td2); sched_add(td2, SRQ_YIELDING); + thread_unlock(td2); } else { /* Keep up with reality.. we have one extra thread * in the picture.. and it's 'running'. 
@@ -1181,11 +1201,11 @@ if (__predict_false(p->p_flag & P_TRACED)) { flags = fuword32(&tmbx->tm_dflags); if (flags & TMDF_SUSPEND) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(td->td_proc); /* fuword can block, check again */ if (td->td_upcall) ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(td->td_proc); } } } @@ -1266,7 +1286,7 @@ WITNESS_WARN(WARN_PANIC, &p->p_mtx.mtx_object, "thread exiting in userret"); sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_stopped(p); thread_exit(); /* NOTREACHED */ @@ -1278,22 +1298,22 @@ if (p->p_numthreads > max_threads_per_proc) { max_threads_hits++; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_maxthrwaits++; while (p->p_numthreads > max_threads_per_proc) { if (p->p_numupcalls >= max_threads_per_proc) break; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH, "maxthreads", hz/10) != EWOULDBLOCK) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); break; } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); } } p->p_maxthrwaits--; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); } @@ -1310,9 +1330,9 @@ td->td_pflags &= ~TDP_UPCALLING; if (ku->ku_flags & KUF_DOUPCALL) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); ku->ku_flags &= ~KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } /* * Set user context to the UTS @@ -1400,9 +1420,9 @@ td = TAILQ_FIRST(&p->p_threads); if (td && (td->td_pflags & TDP_SA)) { FOREACH_UPCALL_IN_PROC(p, ku) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); wakeup(&p->p_completed); } } Index: kern/kern_kthread.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_kthread.c,v retrieving revision 1.37 diff -u -r1.37 kern_kthread.c --- kern/kern_kthread.c 23 Jan 2007 08:46:50 -0000 1.37 +++ 
kern/kern_kthread.c 26 Feb 2007 07:21:19 -0000 @@ -113,9 +113,9 @@ /* Delay putting it on the run queue until now. */ if (!(flags & RFSTOPPED)) { - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_add(td, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } return 0; Index: kern/kern_lockf.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_lockf.c,v retrieving revision 1.54 diff -u -r1.54 kern_lockf.c --- kern/kern_lockf.c 29 Mar 2005 08:13:01 -0000 1.54 +++ kern/kern_lockf.c 26 Feb 2007 07:21:19 -0000 @@ -266,16 +266,19 @@ */ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) { - register struct proc *wproc; + struct proc *wproc; + struct proc *nproc; struct thread *td; - register struct lockf *waitblock; + struct lockf *waitblock; int i = 0; /* The block is waiting on something */ - /* XXXKSE this is not complete under threads */ wproc = (struct proc *)block->lf_id; - mtx_lock_spin(&sched_lock); +restart: + nproc = NULL; + PROC_SLOCK(wproc); FOREACH_THREAD_IN_PROC(wproc, td) { + thread_lock(td); while (td->td_wchan && (td->td_wmesg == lockstr) && (i++ < maxlockdepth)) { @@ -284,15 +287,20 @@ waitblock = waitblock->lf_next; if ((waitblock->lf_flags & F_POSIX) == 0) break; - wproc = (struct proc *)waitblock->lf_id; - if (wproc == (struct proc *)lock->lf_id) { - mtx_unlock_spin(&sched_lock); + nproc = (struct proc *)waitblock->lf_id; + if (nproc == (struct proc *)lock->lf_id) { + PROC_SUNLOCK(wproc); + thread_unlock(td); free(lock, M_LOCKF); return (EDEADLK); } } + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(wproc); + wproc = nproc; + if (wproc) + goto restart; } /* * For flock type locks, we must first remove Index: kern/kern_mutex.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_mutex.c,v retrieving revision 1.179 diff -u -r1.179 kern_mutex.c --- kern/kern_mutex.c 16 Dec 2006 02:37:57 
-0000 1.179 +++ kern/kern_mutex.c 26 Feb 2007 07:21:19 -0000 @@ -113,6 +113,7 @@ /* * System-wide mutexes */ +struct mtx blocked_lock; struct mtx sched_lock; struct mtx Giant; @@ -264,6 +265,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t tid, int opts, const char *file, int line) { + struct turnstile *ts; #if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES) volatile struct thread *owner; #endif @@ -291,7 +293,7 @@ while (!_obtain_lock(m, tid)) { lock_profile_obtain_lock_failed(&m->mtx_object, &contested); - turnstile_lock(&m->mtx_object); + ts = turnstile_trywait(&m->mtx_object); v = m->mtx_lock; /* @@ -299,7 +301,7 @@ * the turnstile chain lock. */ if (v == MTX_UNOWNED) { - turnstile_release(&m->mtx_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -315,7 +317,7 @@ */ if (v == MTX_CONTESTED) { m->mtx_lock = tid | MTX_CONTESTED; - turnstile_claim(&m->mtx_object); + turnstile_claim(ts); break; } #endif @@ -327,7 +329,7 @@ */ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) { - turnstile_release(&m->mtx_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -344,7 +346,7 @@ if (m != &Giant && TD_IS_RUNNING(owner)) #endif { - turnstile_release(&m->mtx_object); + turnstile_cancel(ts); while (mtx_owner(m) == owner && TD_IS_RUNNING(owner)) { cpu_spinwait(); } @@ -371,8 +373,7 @@ /* * Block on the turnstile. */ - turnstile_wait(&m->mtx_object, mtx_owner(m), - TS_EXCLUSIVE_QUEUE); + turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE); } #ifdef KTR if (cont_logged) { @@ -466,7 +467,11 @@ return; } - turnstile_lock(&m->mtx_object); + /* + * We have to lock the chain before the turnstile so this turnstile + * can be removed from the hash list if it is empty. 
+ */ + turnstile_chain_lock(&m->mtx_object); ts = turnstile_lookup(&m->mtx_object); if (LOCK_LOG_TEST(&m->mtx_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); @@ -476,7 +481,7 @@ _release_lock_quick(m); if (LOCK_LOG_TEST(&m->mtx_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m); - turnstile_release(&m->mtx_object); + turnstile_chain_unlock(&m->mtx_object); return; } #else @@ -501,7 +506,12 @@ m); } #endif + /* + * This turnstile is now no longer associated with the mutex. We can + * unlock the chain lock so a new turnstile may take its place. + */ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); + turnstile_chain_unlock(&m->mtx_object); #ifndef PREEMPTION /* @@ -514,21 +524,8 @@ td = curthread; if (td->td_critnest > 0 || td1->td_priority >= td->td_priority) return; - mtx_lock_spin(&sched_lock); + thread_lock(td1); if (!TD_IS_RUNNING(td1)) { -#ifdef notyet - if (td->td_ithd != NULL) { - struct ithd *it = td->td_ithd; - - if (it->it_interrupted) { - if (LOCK_LOG_TEST(&m->mtx_object, opts)) - CTR2(KTR_LOCK, - "_mtx_unlock_sleep: %p interrupted %p", - it, it->it_interrupted); - intr_thd_fixup(it); - } - } -#endif if (LOCK_LOG_TEST(&m->mtx_object, opts)) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, @@ -539,7 +536,7 @@ CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", m, (void *)m->mtx_lock); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td1); #endif return; @@ -722,7 +719,10 @@ */ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); + mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN); + blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked.
*/ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE); mtx_init(&devmtx, "cdev", NULL, MTX_DEF); mtx_lock(&Giant); Index: kern/kern_poll.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_poll.c,v retrieving revision 1.28 diff -u -r1.28 kern_poll.c --- kern/kern_poll.c 6 Dec 2006 06:34:55 -0000 1.28 +++ kern/kern_poll.c 26 Feb 2007 07:21:19 -0000 @@ -580,17 +580,17 @@ rtp.prio = RTP_PRIO_MAX; /* lowest priority */ rtp.type = RTP_PRIO_IDLE; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(td->td_proc); rtp_to_pri(&rtp, td); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(td->td_proc); for (;;) { if (poll_in_idle_loop && poll_handlers > 0) { idlepoll_sleeping = 0; ether_poll(poll_each_burst); - mtx_lock_spin(&sched_lock); + thread_lock(td); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } else { idlepoll_sleeping = 1; tsleep(&idlepoll_sleeping, 0, "pollid", hz * 3); Index: kern/kern_proc.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_proc.c,v retrieving revision 1.246 diff -u -r1.246 kern_proc.c --- kern/kern_proc.c 6 Dec 2006 06:34:55 -0000 1.246 +++ kern/kern_proc.c 26 Feb 2007 07:21:19 -0000 @@ -177,6 +177,7 @@ td = thread_alloc(); bzero(&p->p_mtx, sizeof(struct mtx)); mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE); p->p_stats = pstats_alloc(); proc_linkup(p, td); sched_newproc(p, td); @@ -669,7 +670,7 @@ kp->ki_sigcatch = ps->ps_sigcatch; mtx_unlock(&ps->ps_mtx); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_state != PRS_NEW && p->p_state != PRS_ZOMBIE && p->p_vmspace != NULL) { @@ -694,7 +695,7 @@ kp->ki_pid = p->p_pid; kp->ki_nice = p->p_nice; kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime); - 
mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if ((p->p_sflag & PS_INMEM) && p->p_stats != NULL) { kp->ki_start = p->p_stats->p_start; timevaladd(&kp->ki_start, &boottime); @@ -754,7 +755,7 @@ /* * Fill in information that is thread specific. - * Must be called with sched_lock locked. + * Must be called with p_slock locked. */ static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp) @@ -762,7 +763,9 @@ struct proc *p; p = td->td_proc; + PROC_SLOCK_ASSERT(p, MA_OWNED); + thread_lock(td); if (td->td_wmesg != NULL) strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg)); else @@ -820,6 +823,7 @@ SIGSETOR(kp->ki_siglist, td->td_siglist); kp->ki_sigmask = td->td_sigmask; + thread_unlock(td); } /* @@ -831,10 +835,10 @@ { fill_kinfo_proc_only(p, kp); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } struct pstats * @@ -901,14 +905,14 @@ fill_kinfo_proc_only(p, &kinfo_proc); if (flags & KERN_PROC_NOTHREADS) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), &kinfo_proc); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) FOREACH_THREAD_IN_PROC(p, td) { fill_kinfo_thread(td, &kinfo_proc); @@ -920,7 +924,7 @@ else error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } PROC_UNLOCK(p); if (error) @@ -1010,12 +1014,12 @@ /* * Skip embryonic processes. 
*/ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_state == PRS_NEW) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); continue; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_LOCK(p); KASSERT(p->p_ucred != NULL, ("process credential is NULL for non-NEW proc")); Index: kern/kern_resource.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_resource.c,v retrieving revision 1.166 diff -u -r1.166 kern_resource.c --- kern/kern_resource.c 19 Feb 2007 13:22:36 -0000 1.166 +++ kern/kern_resource.c 26 Feb 2007 07:21:19 -0000 @@ -267,9 +267,9 @@ n = PRIO_MIN; if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0) return (EACCES); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); sched_nice(p, n); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (0); } @@ -313,7 +313,7 @@ case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->lwpid == 0 || uap->lwpid == td->td_tid) td1 = td; else @@ -322,7 +322,7 @@ pri_to_rtp(td1, &rtp); else error = ESRCH; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: @@ -349,7 +349,7 @@ } } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->lwpid == 0 || uap->lwpid == td->td_tid) td1 = td; else @@ -358,7 +358,7 @@ error = rtp_to_pri(&rtp, td1); else error = ESRCH; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); break; default: error = EINVAL; @@ -412,7 +412,7 @@ case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * Return OUR priority if no pid specified, * or if one is, report the highest priority @@ -440,7 +440,7 @@ } } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: @@ -478,7 +478,7 @@ * do all the threads on that process. 
If we * specify our own pid we do the latter. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->pid == 0) { error = rtp_to_pri(&rtp, td); } else { @@ -487,7 +487,7 @@ break; } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); break; default: error = EINVAL; @@ -502,9 +502,9 @@ { u_char newpri; - mtx_assert(&sched_lock, MA_OWNED); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); + thread_lock(td); switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: newpri = PRI_MIN_REALTIME + rtp->prio; @@ -516,12 +516,14 @@ newpri = PRI_MIN_IDLE + rtp->prio; break; default: + thread_unlock(td); return (EINVAL); } sched_class(td, rtp->type); /* XXX fix */ sched_user_prio(td, newpri); if (curthread == td) sched_prio(curthread, td->td_user_pri); /* XXX dubious */ + thread_unlock(td); return (0); } @@ -529,7 +531,7 @@ pri_to_rtp(struct thread *td, struct rtprio *rtp) { - mtx_assert(&sched_lock, MA_OWNED); + thread_lock(td); switch (PRI_BASE(td->td_pri_class)) { case PRI_REALTIME: rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME; @@ -544,6 +546,7 @@ break; } rtp->type = td->td_pri_class; + thread_unlock(td); } #if defined(COMPAT_43) @@ -683,9 +686,9 @@ switch (which) { case RLIMIT_CPU: - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_cpulimit = limp->rlim_cur; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) @@ -819,9 +822,7 @@ uint64_t u; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); - + PROC_SLOCK(p); /* * If we are getting stats for the current process, then add in the * stats that this thread has accumulated in its current time slice. 
@@ -840,9 +841,9 @@ p->p_rux.rux_sticks += td->td_sticks; td->td_sticks = 0; } - /* Work on a copy of p_rux so we can let go of sched_lock */ + /* Work on a copy of p_rux so we can let go of p_slock */ rux = p->p_rux; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); calcru1(p, &rux, up, sp); /* Update the result from the p_rux copy */ p->p_rux.rux_uu = rux.rux_uu; Index: kern/kern_rwlock.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_rwlock.c,v retrieving revision 1.12 diff -u -r1.12 kern_rwlock.c --- kern/kern_rwlock.c 13 Nov 2006 05:41:46 -0000 1.12 +++ kern/kern_rwlock.c 26 Feb 2007 07:21:19 -0000 @@ -146,6 +146,7 @@ #ifdef SMP volatile struct thread *owner; #endif + struct turnstile *ts; uint64_t waitstart; int contested; uintptr_t x; @@ -207,7 +208,7 @@ * has a write lock, so acquire the turnstile lock so we can * begin the process of blocking. */ - turnstile_lock(&rw->rw_object); + ts = turnstile_trywait(&rw->rw_object); /* * The lock might have been released while we spun, so @@ -216,7 +217,7 @@ */ x = rw->rw_lock; if (x & RW_LOCK_READ) { - turnstile_release(&rw->rw_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -230,7 +231,7 @@ if (!(x & RW_LOCK_READ_WAITERS)) { if (!atomic_cmpset_ptr(&rw->rw_lock, x, x | RW_LOCK_READ_WAITERS)) { - turnstile_release(&rw->rw_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -248,7 +249,7 @@ owner = (struct thread *)RW_OWNER(x); if (TD_IS_RUNNING(owner)) { lock_profile_obtain_lock_failed(&rw->rw_object, &contested); - turnstile_release(&rw->rw_object); + turnstile_cancel(ts); if (LOCK_LOG_TEST(&rw->rw_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); @@ -266,7 +267,7 @@ if (LOCK_LOG_TEST(&rw->rw_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); - turnstile_wait(&rw->rw_object, rw_owner(rw), TS_SHARED_QUEUE); + turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE); if 
(LOCK_LOG_TEST(&rw->rw_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); @@ -358,7 +359,7 @@ * Ok, we know we have a waiting writer and we think we * are the last reader, so grab the turnstile lock. */ - turnstile_lock(&rw->rw_object); + turnstile_chain_lock(&rw->rw_object); /* * Try to drop our lock leaving the lock in a unlocked @@ -378,7 +379,7 @@ */ if (!atomic_cmpset_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | RW_LOCK_WRITE_WAITERS, RW_UNLOCKED)) { - turnstile_release(&rw->rw_object); + turnstile_chain_unlock(&rw->rw_object); continue; } if (LOCK_LOG_TEST(&rw->rw_object, 0)) @@ -395,6 +396,7 @@ ts = turnstile_lookup(&rw->rw_object); MPASS(ts != NULL); turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE); + turnstile_chain_unlock(&rw->rw_object); turnstile_unpend(ts, TS_SHARED_LOCK); break; } @@ -411,6 +413,7 @@ #ifdef SMP volatile struct thread *owner; #endif + struct turnstile *ts; int contested; uintptr_t v; @@ -419,7 +422,7 @@ rw->rw_object.lo_name, (void *)rw->rw_lock, file, line); while (!_rw_write_lock(rw, tid)) { - turnstile_lock(&rw->rw_object); + ts = turnstile_trywait(&rw->rw_object); v = rw->rw_lock; /* @@ -427,7 +430,7 @@ * turnstile chain lock, try again. 
*/ if (v == RW_UNLOCKED) { - turnstile_release(&rw->rw_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -446,12 +449,12 @@ if (atomic_cmpset_acq_ptr(&rw->rw_lock, RW_UNLOCKED | RW_LOCK_WRITE_WAITERS, tid | RW_LOCK_WRITE_WAITERS)) { - turnstile_claim(&rw->rw_object); + turnstile_claim(ts); CTR2(KTR_LOCK, "%s: %p claimed by new writer", __func__, rw); break; } - turnstile_release(&rw->rw_object); + turnstile_cancel(ts); cpu_spinwait(); lock_profile_obtain_lock_failed(&rw->rw_object, &contested); continue; @@ -465,7 +468,7 @@ if (!(v & RW_LOCK_WRITE_WAITERS)) { if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_WRITE_WAITERS)) { - turnstile_release(&rw->rw_object); + turnstile_cancel(ts); cpu_spinwait(); lock_profile_obtain_lock_failed(&rw->rw_object, &contested); continue; @@ -484,7 +487,7 @@ owner = (struct thread *)RW_OWNER(v); if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) { lock_profile_obtain_lock_failed(&rw->rw_object, &contested); - turnstile_release(&rw->rw_object); + turnstile_cancel(ts); if (LOCK_LOG_TEST(&rw->rw_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); @@ -502,8 +505,7 @@ if (LOCK_LOG_TEST(&rw->rw_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); - turnstile_wait(&rw->rw_object, rw_owner(rw), - TS_EXCLUSIVE_QUEUE); + turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE); if (LOCK_LOG_TEST(&rw->rw_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); @@ -528,7 +530,7 @@ if (LOCK_LOG_TEST(&rw->rw_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, rw); - turnstile_lock(&rw->rw_object); + turnstile_chain_lock(&rw->rw_object); ts = turnstile_lookup(&rw->rw_object); #ifdef SMP @@ -541,7 +543,7 @@ atomic_store_rel_ptr(&rw->rw_lock, RW_UNLOCKED); if (LOCK_LOG_TEST(&rw->rw_object, 0)) CTR2(KTR_LOCK, "%s: %p no sleepers", __func__, rw); - turnstile_release(&rw->rw_object); + turnstile_chain_unlock(&rw->rw_object); return; } #else @@ -593,6 +595,7 @@ if 
(LOCK_LOG_TEST(&rw->rw_object, 0)) CTR2(KTR_LOCK, "%s: %p no sleepers 2", __func__, rw); atomic_store_rel_ptr(&rw->rw_lock, v); + turnstile_chain_unlock(&rw->rw_object); turnstile_disown(ts); return; } @@ -604,6 +607,7 @@ queue == TS_SHARED_QUEUE ? "read" : "write"); turnstile_broadcast(ts, queue); atomic_store_rel_ptr(&rw->rw_lock, v); + turnstile_chain_unlock(&rw->rw_object); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); } @@ -616,6 +620,7 @@ _rw_try_upgrade(struct rwlock *rw, const char *file, int line) { uintptr_t v, tid; + struct turnstile *ts; int success; _rw_assert(rw, RA_RLOCKED, file, line); @@ -638,7 +643,7 @@ * Ok, we think we have write waiters, so lock the * turnstile. */ - turnstile_lock(&rw->rw_object); + ts = turnstile_trywait(&rw->rw_object); /* * Try to switch from one reader to a writer again. This time @@ -656,9 +661,9 @@ #else if (success && v) #endif - turnstile_claim(&rw->rw_object); + turnstile_claim(ts); else - turnstile_release(&rw->rw_object); + turnstile_cancel(ts); out: LOCK_LOG_TRY("WUPGRADE", &rw->rw_object, 0, success, file, line); if (success) @@ -694,7 +699,7 @@ * Ok, we think we have waiters, so lock the turnstile so we can * read the waiter flags without any races. 
*/ - turnstile_lock(&rw->rw_object); + turnstile_chain_lock(&rw->rw_object); v = rw->rw_lock; MPASS(v & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)); @@ -726,12 +731,9 @@ turnstile_broadcast(ts, TS_SHARED_QUEUE); atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | (v & RW_LOCK_WRITE_WAITERS)); + turnstile_chain_unlock(&rw->rw_object); if (v & RW_LOCK_READ_WAITERS) turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); -#ifdef SMP - else if (ts == NULL) - turnstile_release(&rw->rw_object); -#endif else turnstile_disown(ts); out: Index: kern/kern_shutdown.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_shutdown.c,v retrieving revision 1.180 diff -u -r1.180 kern_shutdown.c --- kern/kern_shutdown.c 6 Nov 2006 13:42:00 -0000 1.180 +++ kern/kern_shutdown.c 26 Feb 2007 07:21:19 -0000 @@ -269,9 +269,9 @@ * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, 0); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0")); #endif /* We're in the process of rebooting. 
*/ @@ -342,9 +342,9 @@ */ DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); DELAY(1000); } PICKUP_GIANT(); @@ -559,9 +559,9 @@ } #endif #endif - mtx_lock_spin(&sched_lock); + /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; - mtx_unlock_spin(&sched_lock); + /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; boot(bootopt); Index: kern/kern_sig.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_sig.c,v retrieving revision 1.339 diff -u -r1.339 kern_sig.c --- kern/kern_sig.c 9 Feb 2007 17:48:28 -0000 1.339 +++ kern/kern_sig.c 26 Feb 2007 07:21:20 -0000 @@ -511,10 +511,10 @@ sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); sigqueue_flush(&worklist); } @@ -554,7 +554,7 @@ { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); - mtx_assert(&sched_lock, MA_NOTOWNED); + THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } @@ -590,9 +590,9 @@ if (! 
SIGISEMPTY(set)) sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set); if (SIGPENDING(td)) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } #ifdef KSE if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { @@ -762,7 +762,9 @@ } #endif /* never to be seen again */ + PROC_SLOCK(p); sigqueue_delete_proc(p, sig); + PROC_SUNLOCK(p); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); @@ -967,7 +969,9 @@ if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); + PROC_SLOCK(p); sigqueue_delete_proc(p, sig); + PROC_SUNLOCK(p); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } @@ -1913,7 +1917,7 @@ thread_user_enter(td); PROC_LOCK(p); SIGDELSET(td->td_sigmask, sig); - mtx_lock_spin(&sched_lock); + thread_lock(td); /* * Force scheduling an upcall, so UTS has chance to * process the signal before thread runs again in @@ -1921,7 +1925,7 @@ */ if (td->td_upcall) td->td_upcall->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } else { PROC_LOCK(p); } @@ -2016,7 +2020,7 @@ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; @@ -2025,7 +2029,7 @@ } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (signal_td); } @@ -2193,7 +2197,9 @@ ksiginfo_tryfree(ksi); return (ret); } + PROC_SLOCK(p); sigqueue_delete_proc(p, SIGCONT); + PROC_SUNLOCK(p); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); @@ -2231,6 +2237,7 @@ * waking up threads so that they can cross the user boundary. * We try do the per-process part here. */ + PROC_SLOCK(p); if (P_SHOULDSTOP(p)) { /* * The process is in stopped mode. 
All the threads should be @@ -2242,6 +2249,7 @@ * so no further action is necessary. * No signal can restart us. */ + PROC_SUNLOCK(p); goto out; } @@ -2268,15 +2276,21 @@ */ p->p_flag &= ~P_STOPPED_SIG; if (p->p_numthreads == p->p_suspcount) { + PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xstat = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); + PROC_SLOCK(p); } if (action == SIG_DFL) { + thread_unsuspend(p); + PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); - } else if (action == SIG_CATCH) { + goto out; + } + if (action == SIG_CATCH) { #ifdef KSE /* * The process wants to catch it so it needs @@ -2288,20 +2302,18 @@ * single thread is runnable asap. * XXXKSE for now however, make them all run. */ -#else +#endif /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ -#endif goto runfast; } /* * The signal is not ignored or caught. */ - mtx_lock_spin(&sched_lock); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); goto out; } @@ -2311,6 +2323,7 @@ * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ + PROC_SUNLOCK(p); p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; @@ -2324,10 +2337,11 @@ * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) sleepq_abort(td, intrval); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. 
Threads waiting on them will @@ -2335,9 +2349,10 @@ */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { - mtx_lock_spin(&sched_lock); + thread_lock(td); tdsigwakeup(td, sig, action, intrval); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); goto out; } @@ -2348,7 +2363,6 @@ goto out; p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; - mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* @@ -2359,10 +2373,10 @@ * should never be equal to p_suspcount. */ thread_stopped(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xstat); } else - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); goto out; } else @@ -2370,6 +2384,7 @@ /* NOTREACHED */ } else { /* Not in "NORMAL" state. discard the signal. */ + PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } @@ -2380,13 +2395,14 @@ */ runfast: - mtx_lock_spin(&sched_lock); + thread_lock(td); tdsigwakeup(td, sig, action, intrval); + thread_unlock(td); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); out: - /* If we jump here, sched_lock should not be owned. */ - mtx_assert(&sched_lock, MA_NOTOWNED); + /* If we jump here, proc slock should not be owned. */ + PROC_SLOCK_ASSERT(p, MA_NOTOWNED); return (ret); } @@ -2402,7 +2418,8 @@ register int prop; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); prop = sigprop(sig); /* @@ -2431,14 +2448,16 @@ * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. 
*/ sigqueue_delete(&td->td_sigqueue, sig); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); + thread_lock(td); return; } @@ -2468,9 +2487,10 @@ struct thread *td2; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td2) { + thread_lock(td2); if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR) && !TD_IS_SUSPENDED(td2)) { @@ -2483,6 +2503,7 @@ forward_signal(td2); #endif } + thread_unlock(td2); } } @@ -2495,15 +2516,17 @@ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.mtx_object, "Stopping for traced signal"); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_XSIG; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_xsig = sig; + PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) { if (p->p_flag & P_SINGLE_EXIT) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_XSIG; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); return (sig); } /* @@ -2513,26 +2536,19 @@ p->p_xstat = sig; p->p_xthread = td; p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE); - mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 0); stopme: - thread_stopped(p); - thread_suspend_one(td); - PROC_UNLOCK(p); - DROP_GIANT(); - mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); - PICKUP_GIANT(); - PROC_LOCK(p); - if (!(p->p_flag & P_TRACED)) + thread_suspend_switch(td); + if (!(p->p_flag & P_TRACED)) { break; + } if (td->td_flags & TDF_DBSUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; - mtx_lock_spin(&sched_lock); goto stopme; } } + PROC_SUNLOCK(p); return (td->td_xsig); } @@ -2686,16 +2702,10 @@ &p->p_mtx.mtx_object, "Catching SIGSTOP"); p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); sig_suspend_threads(td, p, 0); - thread_stopped(p); - thread_suspend_one(td); - PROC_UNLOCK(p); - DROP_GIANT(); - mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); - 
PICKUP_GIANT(); - PROC_LOCK(p); + thread_suspend_switch(td); + PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); break; } else if (prop & SA_IGNORE) { @@ -2740,18 +2750,18 @@ int n; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); } } Index: kern/kern_subr.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_subr.c,v retrieving revision 1.102 diff -u -r1.102 kern_subr.c --- kern/kern_subr.c 16 Jan 2007 11:40:55 -0000 1.102 +++ kern/kern_subr.c 26 Feb 2007 07:21:20 -0000 @@ -453,11 +453,11 @@ struct thread *td; td = curthread; - mtx_lock_spin(&sched_lock); DROP_GIANT(); + thread_lock(td); sched_prio(td, td->td_user_pri); mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PICKUP_GIANT(); } Index: kern/kern_switch.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_switch.c,v retrieving revision 1.129 diff -u -r1.129 kern_switch.c --- kern/kern_switch.c 8 Feb 2007 01:52:25 -0000 1.129 +++ kern/kern_switch.c 26 Feb 2007 07:21:20 -0000 @@ -49,6 +49,8 @@ #include #endif +#include + /* Uncomment this to enable logging of critical_enter/exit. 
*/ #if 0 #define KTR_CRITICAL KTR_SCHED @@ -77,6 +79,24 @@ SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD, &kern_sched_preemption, 0, "Kernel preemption enabled"); +#ifdef SCHED_STATS +long switch_preempt; +long switch_owepreempt; +long switch_turnstile; +long switch_sleepq; +long switch_sleepqtimo; +long switch_relinquish; +long switch_needresched; +static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats"); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, ""); +#endif + /************************************************************************ * Functions that manipulate runnability from a thread perspective. * ************************************************************************/ @@ -142,13 +162,13 @@ #ifdef PREEMPTION if (td->td_critnest == 1) { td->td_critnest = 0; - mtx_assert(&sched_lock, MA_NOTOWNED); if (td->td_owepreempt) { td->td_critnest = 1; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_critnest--; + SCHED_STAT_INC(switch_owepreempt); mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } else #endif @@ -173,7 +193,6 @@ int cpri, pri; #endif - mtx_assert(&sched_lock, MA_OWNED); #ifdef PREEMPTION /* * The new thread should not preempt the current thread if any of the @@ -199,6 +218,7 @@ * to the new thread. 
*/ ctd = curthread; + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT ((ctd->td_sched != NULL && ctd->td_sched->ts_thread == ctd), ("thread has no (or wrong) sched-private part.")); KASSERT((td->td_inhibitors == 0), @@ -219,15 +239,25 @@ ctd->td_owepreempt = 1; return (0); } - /* * Thread is runnable but not yet put on system run queue. */ + MPASS(ctd->td_lock == &sched_lock); + MPASS(td->td_lock == &sched_lock); MPASS(TD_ON_RUNQ(td)); TD_SET_RUNNING(td); CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, td->td_proc->p_pid, td->td_proc->p_comm); + SCHED_STAT_INC(switch_preempt); mi_switch(SW_INVOL|SW_PREEMPT, td); + /* + * td's lock pointer may have changed. We have to return with it + * locked. + */ + spinlock_enter(); + thread_unlock(ctd); + thread_lock(td); + spinlock_exit(); return (1); #else return (0); @@ -442,7 +472,6 @@ struct td_sched *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; #if defined(SMP) && defined(SCHED_4BSD) @@ -484,7 +513,6 @@ struct td_sched *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); if ((pri = runq_findbit_from(rq, idx)) != -1) { rqh = &rq->rq_queues[pri]; ts = TAILQ_FIRST(rqh); @@ -519,9 +547,20 @@ KASSERT(ts->ts_thread->td_proc->p_sflag & PS_INMEM, ("runq_remove_idx: process swapped out")); pri = ts->ts_rqindex; + KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri)); rqh = &rq->rq_queues[pri]; CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p", ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh); + { + struct td_sched *nts; + + TAILQ_FOREACH(nts, rqh, ts_procq) + if (nts == ts) + break; + if (ts != nts) + panic("runq_remove_idx: ts %p not on rqindex %d", + ts, pri); + } TAILQ_REMOVE(rqh, ts, ts_procq); if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_remove_idx: empty"); @@ -589,18 +628,86 @@ { } -/* - * Called from thread_exit() for all exiting thread - * - * Not to be confused with sched_exit_thread() - * that is only called 
from thread_exit() for threads exiting - * without the rest of the process exiting because it is also called from - * sched_exit() and we wouldn't want to call it twice. - * XXX This can probably be fixed. - */ void -sched_thread_exit(struct thread *td) +thread_lock(struct thread *td) +{ + return thread_lock_flags(td, 0); +} + +void +thread_lock_flags(struct thread *td, int flags) +{ + struct mtx *mtx; + int i; + + i = 0; + for (;;) { + mtx = __DEVOLATILE(struct mtx *, td->td_lock); + if (mtx == &blocked_lock) { + if (i++ < 10000000) + cpu_spinwait(); + else if (i++ < 60000000) + DELAY(1); + else + panic( + "thread_lock_flags %p blocked for too long", + td); + continue; + } + mtx_lock_spin_flags(mtx, flags); + if (td->td_lock == mtx) + break; + mtx_unlock_spin(mtx); + } +} + +struct mtx * +thread_lock_block(struct thread *td) +{ + struct mtx *lock; + + spinlock_enter(); + THREAD_LOCK_ASSERT(td, MA_OWNED); + lock = __DEVOLATILE(struct mtx *, td->td_lock); + td->td_lock = &blocked_lock; + mtx_unlock_spin(lock); + + return (lock); +} + +void +thread_lock_unblock(struct thread *td, struct mtx *new) +{ + mtx_assert(new, MA_OWNED); + MPASS(td->td_lock == &blocked_lock); + td->td_lock = new; + spinlock_exit(); +} + +void +thread_unlock(struct thread *td) +{ + struct mtx *mtx; + + /* + * As soon as the mutex is unlocked the value can change and + * the macro might access an updated value. Pass a local + * pointer instead. 
+ */ + mtx = __DEVOLATILE(struct mtx *, td->td_lock); + mtx_unlock_spin(mtx); +} + +void +thread_lock_set(struct thread *td, struct mtx *new) { + struct mtx *lock; + + mtx_assert(new, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); + lock = __DEVOLATILE(struct mtx *, td->td_lock); + td->td_lock = new; + mtx_unlock_spin(lock); } #endif /* KERN_SWITCH_INCLUDE */ Index: kern/kern_synch.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_synch.c,v retrieving revision 1.288 diff -u -r1.288 kern_synch.c --- kern/kern_synch.c 23 Feb 2007 16:22:09 -0000 1.288 +++ kern/kern_synch.c 26 Feb 2007 07:21:20 -0000 @@ -202,9 +202,9 @@ */ pri = priority & PRIMASK; if (pri != 0 && pri != td->td_priority) { - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } if (timo && catch) @@ -351,6 +351,7 @@ sleepq_lock(ident); sleepq_signal(ident, SLEEPQ_MSLEEP, -1, 0); + sleepq_release(ident); } /* @@ -363,8 +364,8 @@ struct thread *td; struct proc *p; - mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); td = curthread; /* XXX */ + THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS @@ -383,12 +384,15 @@ * Don't perform context switches from the debugger. */ if (kdb_active) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } + /* + * XXX Need proc lock for stats! + */ if (flags & SW_VOL) p->p_stats->p_ru.ru_nvcsw++; else @@ -422,6 +426,7 @@ /* * Finish up stats for outgoing thread. */ + /* XXX Needs atomic! 
*/ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); @@ -477,7 +482,7 @@ struct proc *p; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); switch (p->p_state) { case PRS_ZOMBIE: panic("setrunnable(1)"); Index: kern/kern_thr.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_thr.c,v retrieving revision 1.59 diff -u -r1.59 kern_thr.c --- kern/kern_thr.c 23 Jan 2007 08:46:50 -0000 1.59 +++ kern/kern_thr.c 26 Feb 2007 07:21:20 -0000 @@ -226,12 +226,15 @@ PROC_LOCK(td->td_proc); td->td_proc->p_flag |= P_HADTHREADS; newtd->td_sigmask = td->td_sigmask; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_link(newtd, p); - PROC_UNLOCK(p); - + thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); + thread_unlock(td); + PROC_SUNLOCK(p); + PROC_UNLOCK(p); + thread_lock(newtd); if (rtp != NULL) { if (!(td->td_pri_class == PRI_TIMESHARE && rtp->type == RTP_PRIO_NORMAL)) { @@ -242,7 +245,7 @@ TD_SET_CAN_RUN(newtd); /* if ((flags & THR_SUSPENDED) == 0) */ sched_add(newtd, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(newtd); return (error); } @@ -275,7 +278,7 @@ PROC_LOCK(p); sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * Shutting down last thread in the proc. 
This will actually @@ -286,7 +289,7 @@ thread_exit(); /* NOTREACHED */ } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (0); } @@ -379,9 +382,9 @@ error = msleep((void *)td, &td->td_proc->p_mtx, PCATCH, "lthr", hz); if (td->td_flags & TDF_THRWAKEUP) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_THRWAKEUP; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_UNLOCK(td->td_proc); return (0); } @@ -414,9 +417,9 @@ PROC_UNLOCK(p); return (ESRCH); } - mtx_lock_spin(&sched_lock); + thread_lock(ttd); ttd->td_flags |= TDF_THRWAKEUP; - mtx_unlock_spin(&sched_lock); + thread_unlock(ttd); wakeup((void *)ttd); PROC_UNLOCK(p); return (0); Index: kern/kern_thread.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_thread.c,v retrieving revision 1.240 diff -u -r1.240 kern_thread.c --- kern/kern_thread.c 31 Dec 2006 15:56:04 -0000 1.240 +++ kern/kern_thread.c 26 Feb 2007 07:21:20 -0000 @@ -70,8 +70,8 @@ #endif TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); -struct mtx kse_zombie_lock; -MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN); +struct mtx zombie_lock; +MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); #ifdef KSE static int @@ -221,6 +221,7 @@ void proc_linkup(struct proc *p, struct thread *td) { + TAILQ_INIT(&p->p_threads); /* all threads in proc */ TAILQ_INIT(&p->p_upcalls); /* upcall list */ sigqueue_init(&p->p_sigqueue, p); @@ -259,9 +260,9 @@ void thread_stash(struct thread *td) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&zombie_lock); } /* @@ -277,11 +278,11 @@ * we really don't care about the next instant.. 
*/ if (!TAILQ_EMPTY(&zombie_threads)) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); if (td_first->td_ucred) @@ -356,8 +357,9 @@ td = curthread; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); + PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, @@ -388,16 +390,6 @@ */ cpu_thread_exit(td); /* XXXSMP */ -#ifdef KSE - /* - * The thread is exiting. scheduler can release its stuff - * and collect stats etc. - * XXX this is not very right, since PROC_UNLOCK may still - * need scheduler stuff. - */ - sched_thread_exit(td); -#endif - /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime)); @@ -406,6 +398,7 @@ p->p_rux.rux_iticks += td->td_iticks; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); + /* XXX Needs protection */ cnt.v_swtch++; /* Add our usage into the usage of all our children. */ @@ -422,8 +415,11 @@ */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { +#ifdef KSE + kse_unlink(td); +#else thread_unlink(td); - +#endif sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); /* @@ -433,7 +429,9 @@ */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { + thread_lock(p->p_singlethread); thread_unsuspend_one(p->p_singlethread); + thread_unlock(p->p_singlethread); } } @@ -450,8 +448,6 @@ */ upcall_remove(td); #endif - - PROC_UNLOCK(p); PCPU_SET(deadthread, td); } else { /* @@ -469,17 +465,13 @@ */ panic ("thread_exit: Last thread exiting on its own"); } - } else { - /* - * non threaded process comes here. 
- * This includes an EX threaded process that is coming - * here via exit1(). (exit1 dethreads the proc first). - */ - PROC_UNLOCK(p); - } + } + PROC_UNLOCK(p); + thread_lock(td); + PROC_SUNLOCK(p); td->td_state = TDS_INACTIVE; CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); - cpu_throw(td, choosethread()); + sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } @@ -528,6 +520,11 @@ thread_link(struct thread *td, struct proc *p) { + /* + * XXX This can't be enabled because it's called for proc0 before + * it's spinlock has been created. + * PROC_SLOCK_ASSERT(p, MA_OWNED); + */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = 0; @@ -575,7 +572,7 @@ { struct proc *p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ @@ -627,7 +624,7 @@ p->p_flag &= ~P_SINGLE_BOUNDARY; } p->p_flag |= P_STOPPED_SINGLE; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_singlethread = td; if (mode == SINGLE_EXIT) remaining = p->p_numthreads; @@ -641,6 +638,7 @@ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; + thread_lock(td2); td2->td_flags |= TDF_ASTPENDING; if (TD_IS_INHIBITED(td2)) { switch (mode) { @@ -662,8 +660,10 @@ sleepq_abort(td2, ERESTART); break; default: - if (TD_IS_SUSPENDED(td2)) + if (TD_IS_SUSPENDED(td2)) { + thread_unlock(td2); continue; + } /* * maybe other inhibited states too? */ @@ -679,6 +679,7 @@ forward_signal(td2); } #endif + thread_unlock(td2); } if (mode == SINGLE_EXIT) remaining = p->p_numthreads; @@ -698,13 +699,7 @@ * Wake us up when everyone else has suspended. * In the mean time we suspend as well. 
*/ - thread_stopped(p); - thread_suspend_one(td); - PROC_UNLOCK(p); - mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); - PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + thread_suspend_switch(td); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) @@ -723,7 +718,7 @@ p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT); thread_unthread(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (0); } @@ -796,7 +791,7 @@ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_stopped(p); /* * If the process is waiting for us to exit, @@ -805,6 +800,7 @@ */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) thread_exit(); + thread_lock(td); /* * When a thread suspends, it just @@ -815,29 +811,62 @@ p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } + thread_unlock(td); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { - if (p->p_numthreads == p->p_suspcount) + if (p->p_numthreads == p->p_suspcount) { + thread_lock(p->p_singlethread); thread_unsuspend_one(p->p_singlethread); + thread_unlock(p->p_singlethread); + } } PROC_UNLOCK(p); + thread_lock(td); + PROC_SUNLOCK(p); mi_switch(SW_INVOL, NULL); - if (return_instead == 0) { - p->p_boundary_count--; + if (return_instead == 0) td->td_flags &= ~TDF_BOUNDARY; - } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_LOCK(p); + if (return_instead == 0) + p->p_boundary_count--; } return (0); } void +thread_suspend_switch(struct thread *td) +{ + struct proc *p; + + p = td->td_proc; + KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); + PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + /* + * We implement thread_suspend_one in stages here to avoid + * dropping the proc lock while the thread lock is owned. 
+ */ + thread_stopped(p); + p->p_suspcount++; + PROC_UNLOCK(p); + thread_lock(td); + TD_SET_SUSPENDED(td); + PROC_SUNLOCK(p); + DROP_GIANT(); + mi_switch(SW_VOL, NULL); + thread_unlock(td); + PICKUP_GIANT(); + PROC_LOCK(p); + PROC_SLOCK(p); +} + +void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; TD_SET_SUSPENDED(td); @@ -848,8 +877,8 @@ { struct proc *p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); p->p_suspcount--; @@ -864,13 +893,15 @@ { struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (TD_IS_SUSPENDED(td)) { thread_unsuspend_one(td); } + thread_unlock(td); } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { @@ -879,7 +910,9 @@ * threading request. Now we've downgraded to single-threaded, * let it continue. 
*/ + thread_lock(p->p_singlethread); thread_unsuspend_one(p->p_singlethread); + thread_unlock(p->p_singlethread); } } @@ -896,7 +929,7 @@ p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_singlethread = NULL; /* * If there are other threads they mey now run, @@ -906,12 +939,14 @@ */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (TD_IS_SUSPENDED(td)) { thread_unsuspend_one(td); } + thread_unlock(td); } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } struct thread * @@ -920,11 +955,11 @@ struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (td); } Index: kern/kern_umtx.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_umtx.c,v retrieving revision 1.58 diff -u -r1.58 kern_umtx.c --- kern/kern_umtx.c 20 Dec 2006 04:40:39 -0000 1.58 +++ kern/kern_umtx.c 26 Feb 2007 07:21:20 -0000 @@ -124,8 +124,8 @@ /* * Blocked on PI mutex. read can use chain lock - * or sched_lock, write must have both chain lock and - * sched_lock being hold. + * or umtx_lock, write must have both chain lock and + * umtx_lock being hold. 
*/ struct umtx_pi *uq_pi_blocked; @@ -225,6 +225,8 @@ struct image_params *imgp __unused); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); +static struct mtx umtx_lock; + static void umtxq_sysinit(void *arg __unused) { @@ -240,6 +242,7 @@ umtxq_chains[i].uc_busy = 0; umtxq_chains[i].uc_waiters = 0; } + mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN); EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL, EVENTHANDLER_PRI_ANY); } @@ -1270,7 +1273,7 @@ struct umtx_q *uq, *uq1, *uq2; struct thread *td1; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); @@ -1316,7 +1319,7 @@ struct umtx_pi *pi; int pri; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; @@ -1334,7 +1337,9 @@ if (UPRI(td) <= pri) return; + thread_lock(td); sched_lend_user_prio(td, pri); + thread_unlock(td); /* * Pick up the lock that td is blocked on. @@ -1358,7 +1363,7 @@ struct umtx_pi *pi2; int pri; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; @@ -1374,7 +1379,9 @@ if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; + thread_lock(pi->pi_owner); sched_unlend_user_prio(pi->pi_owner, pri); + thread_unlock(pi->pi_owner); pi = uq_owner->uq_pi_blocked; } } @@ -1388,7 +1395,7 @@ struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); if (pi->pi_owner != NULL) panic("pi_ower != NULL"); pi->pi_owner = owner; @@ -1404,9 +1411,9 @@ struct umtx_q *uq, *uq_owner; uq_owner = owner->td_umtxq; - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (pi->pi_owner == owner) { - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); return (0); } @@ -1414,7 +1421,7 @@ /* * userland may have already messed the mutex, sigh. 
*/ - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); return (EPERM); } umtx_pi_setowner(pi, owner); @@ -1423,10 +1430,12 @@ int pri; pri = UPRI(uq->uq_thread); + thread_lock(owner); if (pri < UPRI(owner)) sched_lend_user_prio(owner, pri); + thread_unlock(owner); } - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); return (0); } @@ -1442,7 +1451,7 @@ uq = td->td_umtxq; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); MPASS(TD_ON_UPILOCK(td)); /* @@ -1493,14 +1502,14 @@ */ PROC_LOCK(curproc); td1 = thread_find(curproc, owner); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (td1 != NULL && pi->pi_owner == NULL) { uq1 = td1->td_umtxq; umtx_pi_setowner(pi, td1); } PROC_UNLOCK(curproc); } else { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); } TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { @@ -1516,12 +1525,12 @@ uq->uq_pi_blocked = pi; td->td_flags |= TDF_UPIBLOCKED; - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); umtxq_unlock(&uq->uq_key); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); umtx_propagate_priority(td); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); umtxq_lock(&uq->uq_key); if (uq->uq_flags & UQF_UMTXQ) { @@ -1536,12 +1545,12 @@ } umtxq_unlock(&uq->uq_key); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_pi_blocked = NULL; td->td_flags &= ~TDF_UPIBLOCKED; TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); umtx_unpropagate_priority(pi); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); umtxq_lock(&uq->uq_key); @@ -1575,7 +1584,7 @@ UMTXQ_LOCKED_ASSERT(uc); KASSERT(pi->pi_refcount > 0, ("invalid reference count")); if (--pi->pi_refcount == 0) { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (pi->pi_owner != NULL) { TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); @@ -1583,7 +1592,7 @@ } KASSERT(TAILQ_EMPTY(&pi->pi_blocked), ("blocked queue not empty")); - 
mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink); free = 1; } @@ -1822,7 +1831,7 @@ return (EPERM); } uq_me = curthread->td_umtxq; - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); pi->pi_owner = NULL; TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link); uq_first = TAILQ_FIRST(&pi->pi_blocked); @@ -1834,8 +1843,10 @@ pri = UPRI(uq_first2->uq_thread); } } + thread_lock(curthread); sched_unlend_user_prio(curthread, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); + mtx_unlock_spin(&umtx_lock); } umtxq_unlock(&key); @@ -1891,18 +1902,20 @@ goto out; } - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; + thread_lock(td); if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED); @@ -1943,7 +1956,7 @@ umtxq_remove(uq); umtxq_unlock(&uq->uq_key); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { @@ -1955,12 +1968,14 @@ } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; + thread_lock(td); sched_unlend_user_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + mtx_unlock_spin(&umtx_lock); } if (error != 0) { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { @@ -1972,8 +1987,10 @@ } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; + thread_lock(td); sched_unlend_user_prio(td, pri); - 
mtx_unlock_spin(&sched_lock); + thread_unlock(td); + mtx_unlock_spin(&umtx_lock); } out: @@ -2048,7 +2065,7 @@ if (error == -1) error = EFAULT; else { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; @@ -2061,8 +2078,10 @@ } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; + thread_lock(td); sched_unlend_user_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + mtx_unlock_spin(&umtx_lock); } umtx_key_release(&key); return (error); @@ -2749,12 +2768,12 @@ if ((uq = td->td_umtxq) == NULL) return; - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } td->td_flags &= ~TDF_UBORROWING; - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); } Index: kern/ksched.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/ksched.c,v retrieving revision 1.35 diff -u -r1.35 ksched.c --- kern/ksched.c 6 Dec 2006 06:34:55 -0000 1.35 +++ kern/ksched.c 26 Feb 2007 07:21:20 -0000 @@ -104,9 +104,7 @@ struct rtprio rtp; int e = 0; - mtx_lock_spin(&sched_lock); pri_to_rtp(td, &rtp); - mtx_unlock_spin(&sched_lock); switch (rtp.type) { case RTP_PRIO_FIFO: @@ -151,9 +149,7 @@ { struct rtprio rtp; - mtx_lock_spin(&sched_lock); pri_to_rtp(td, &rtp); - mtx_unlock_spin(&sched_lock); if (RTP_PRIO_IS_REALTIME(rtp.type)) param->sched_priority = rtpprio_to_p4prio(rtp.prio); @@ -186,9 +182,7 @@ rtp.type = (policy == SCHED_FIFO) ? 
RTP_PRIO_FIFO : RTP_PRIO_REALTIME; - mtx_lock_spin(&sched_lock); rtp_to_pri(&rtp, td); - mtx_unlock_spin(&sched_lock); } else e = EPERM; @@ -200,9 +194,7 @@ { rtp.type = RTP_PRIO_NORMAL; rtp.prio = p4prio_to_rtpprio(param->sched_priority); - mtx_lock_spin(&sched_lock); rtp_to_pri(&rtp, td); - mtx_unlock_spin(&sched_lock); } break; Index: kern/sched_4bsd.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sched_4bsd.c,v retrieving revision 1.96 diff -u -r1.96 sched_4bsd.c --- kern/sched_4bsd.c 2 Feb 2007 05:14:21 -0000 1.96 +++ kern/sched_4bsd.c 26 Feb 2007 07:21:20 -0000 @@ -248,7 +248,7 @@ maybe_resched(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } @@ -377,10 +377,7 @@ realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { - /* - * Prevent state changes and protect run queue. - */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * Increment time in/out of memory. We ignore overflow; with * 16-bit int's (remember them?) overflow takes 45 days. @@ -388,6 +385,7 @@ p->p_swtime++; FOREACH_THREAD_IN_PROC(p, td) { awake = 0; + thread_lock(td); ts = td->td_sched; /* * Increment sleep time (if sleeping). 
We @@ -456,13 +454,16 @@ td->td_slptime = 0; } else td->td_slptime++; - if (td->td_slptime > 1) + if (td->td_slptime > 1) { + thread_unlock(td); continue; + } td->td_estcpu = decay_cpu(loadfac, td->td_estcpu); resetpriority(td); resetpriority_thread(td); + thread_unlock(td); } /* end of thread loop */ - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } /* end of process loop */ sx_sunlock(&allproc_lock); } @@ -576,6 +577,7 @@ */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; + thread0.td_lock = &sched_lock; td_sched0.ts_thread = &thread0; } @@ -616,7 +618,7 @@ { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; ts->ts_cpticks++; @@ -643,13 +645,10 @@ void sched_exit_thread(struct thread *td, struct thread *child) { - struct proc *childproc = child->td_proc; CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", - child, childproc->p_comm, child->td_priority); + child, child->td_proc->p_comm, child->td_priority); td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu); - childproc->p_estcpu = ESTCPULIM(childproc->p_estcpu + - child->td_estcpu); if ((child->td_proc->p_flag & P_NOLOAD) == 0) sched_load_rem(); } @@ -664,6 +663,7 @@ sched_fork_thread(struct thread *td, struct thread *childtd) { childtd->td_estcpu = td->td_estcpu; + childtd->td_lock = &sched_lock; sched_newthread(childtd); } @@ -673,18 +673,20 @@ struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); resetpriority(td); resetpriority_thread(td); + thread_unlock(td); } } void sched_class(struct thread *td, int class) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_pri_class = class; } @@ -698,7 +700,7 @@ td, td->td_proc->p_comm, td->td_priority, prio, curthread, curthread->td_proc->p_comm); - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, 
MA_OWNED); if (td->td_priority == prio) return; td->td_priority = prio; @@ -819,7 +821,7 @@ sched_sleep(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptime = 0; } @@ -832,26 +834,18 @@ ts = td->td_sched; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); + /* + * Switch to the sched lock to fix things up and pick + * a new thread. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_unlock(td); + } if ((p->p_flag & P_NOLOAD) == 0) sched_load_rem(); -#if 0 - /* - * We are volunteering to switch out so we get to nominate - * a successor for the rest of our quantum - * First try another thread in our process - * - * this is too expensive to do without per process run queues - * so skip it for now. - * XXX keep this comment as a marker. - */ - if (sched_followon && - (p->p_flag & P_HADTHREADS) && - (flags & SW_VOL) && - newtd == NULL) - newtd = mumble(); -#endif if (newtd) newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED); @@ -897,6 +891,7 @@ } else { newtd = choosethread(); } + MPASS(newtd->td_lock == &sched_lock); if (td != newtd) { #ifdef HWPMC_HOOKS @@ -933,12 +928,13 @@ #endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); + MPASS(td->td_lock == &sched_lock); } void sched_wakeup(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_slptime > 1) { updatepri(td); resetpriority(td); @@ -1080,7 +1076,7 @@ int single_cpu = 0; ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), @@ -1090,6 +1086,14 @@ CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); + /* + * Now that the thread is moving to the run-queue, set the lock + * to 
the scheduler's lock. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } TD_SET_RUNQ(td); if (td->td_pinned != 0) { @@ -1141,7 +1145,7 @@ { struct td_sched *ts; ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), @@ -1151,6 +1155,14 @@ CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } TD_SET_RUNQ(td); CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td); ts->ts_runq = &runq; @@ -1208,6 +1220,7 @@ struct td_sched *ts; struct runq *rq; + mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP struct td_sched *kecpu; @@ -1257,10 +1270,10 @@ KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } @@ -1269,7 +1282,7 @@ { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("sched_bind: cannot bind non-running thread")); @@ -1288,25 +1301,26 @@ void sched_unbind(struct thread* td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_sched->ts_flags &= ~TSF_BOUND; } int sched_is_bound(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_sched->ts_flags & TSF_BOUND); } void sched_relinquish(struct thread *td) { - 
mtx_lock_spin(&sched_lock); + thread_lock(td); if (td->td_pri_class == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); + SCHED_STAT_INC(switch_relinquish); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } int @@ -1364,5 +1378,57 @@ } } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(NULL, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. 
+ */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + #define KERN_SWITCH_INCLUDE 1 #include "kern/kern_switch.c" Index: kern/sched_core.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sched_core.c,v retrieving revision 1.12 diff -u -r1.12 sched_core.c --- kern/sched_core.c 23 Jan 2007 08:46:50 -0000 1.12 +++ kern/sched_core.c 26 Feb 2007 07:21:20 -0000 @@ -1747,5 +1747,33 @@ } } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. 
+ */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(NULL, choosethread()); /* doesn't return */ +} + #define KERN_SWITCH_INCLUDE 1 #include "kern/kern_switch.c" Index: kern/sched_ule.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sched_ule.c,v retrieving revision 1.187 diff -u -r1.187 sched_ule.c --- kern/sched_ule.c 8 Feb 2007 01:52:25 -0000 1.187 +++ kern/sched_ule.c 26 Feb 2007 07:21:20 -0000 @@ -434,7 +434,7 @@ mtx_assert(&sched_lock, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); tdq->tdq_load++; - CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); + CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load); if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) #ifdef SMP @@ -1244,6 +1244,7 @@ */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; + thread0.td_lock = &sched_lock; td_sched0.ts_ltick = ticks; td_sched0.ts_ftick = ticks; td_sched0.ts_thread = &thread0; @@ -1292,7 +1293,7 @@ td, td->td_proc->p_comm, td->td_priority, prio, curthread, curthread->td_proc->p_comm); ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; @@ -1303,9 +1304,10 @@ * queue. This could be optimized to not re-add in some * cases. 
*/ + MPASS(td->td_lock == &sched_lock); sched_rem(td); td->td_priority = prio; - sched_add(td, SRQ_BORROWING); + sched_add(td, SRQ_BORROWING|SRQ_OURSELF); } else td->td_priority = prio; } @@ -1423,7 +1425,7 @@ struct td_sched *ts; int preempt; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); preempt = flags & SW_PREEMPT; tdq = TDQ_SELF(); @@ -1436,24 +1438,33 @@ * If the thread has been assigned it may be in the process of switching * to the new cpu. This is the case in sched_bind(). */ + /* + * Switch to the sched lock to fix things up and pick + * a new thread. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_unlock(td); + } if (td == PCPU_GET(idlethread)) { + MPASS(td->td_lock == &sched_lock); TD_SET_CAN_RUN(td); - } else { + } else if (TD_IS_RUNNING(td)) { + /* + * Don't allow the thread to migrate + * from a preemption. + */ tdq_load_rem(tdq, ts); - if (TD_IS_RUNNING(td)) { - /* - * Don't allow the thread to migrate - * from a preemption. - */ - if (preempt) - sched_pin_td(td); - sched_add(td, preempt ? - SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : - SRQ_OURSELF|SRQ_YIELDING); - if (preempt) - sched_unpin_td(td); - } - } + if (preempt) + sched_pin_td(td); + sched_add(td, preempt ? 
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : + SRQ_OURSELF|SRQ_YIELDING); + if (preempt) + sched_unpin_td(td); + } else + tdq_load_rem(tdq, ts); + mtx_assert(&sched_lock, MA_OWNED); if (newtd != NULL) { /* * If we bring in a thread account for it as if it had been @@ -1477,6 +1488,7 @@ } sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); + MPASS(td->td_lock == &sched_lock); } void @@ -1485,12 +1497,14 @@ struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); + thread_unlock(td); } } @@ -1498,7 +1512,7 @@ sched_sleep(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_sched->ts_slptime = ticks; } @@ -1509,7 +1523,7 @@ struct td_sched *ts; int slptime; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; /* * If we slept for more than a tick update our interactivity and @@ -1538,7 +1552,7 @@ void sched_fork(struct thread *td, struct thread *child) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. @@ -1559,7 +1573,9 @@ /* * Initialize child. 
*/ + THREAD_LOCK_ASSERT(td, MA_OWNED); sched_newthread(child); + child->td_lock = &sched_lock; ts = td->td_sched; ts2 = child->td_sched; ts2->ts_cpu = ts->ts_cpu; @@ -1584,7 +1600,7 @@ sched_class(struct thread *td, int class) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; @@ -1623,8 +1639,10 @@ CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", child, child->td_proc->p_comm, child->td_priority); + mtx_lock_spin(&sched_lock); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); + mtx_unlock_spin(&sched_lock); } void @@ -1669,10 +1687,10 @@ KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } @@ -1801,9 +1819,22 @@ */ MPASS(TD_ON_RUNQ(td)); TD_SET_RUNNING(td); + MPASS(ctd->td_lock == &sched_lock); + MPASS(td->td_lock == &sched_lock); CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, td->td_proc->p_pid, td->td_proc->p_comm); + /* + * We enter the switch with two runnable threads that both have + * the same lock. When we return td may be sleeping so we need + * to switch locks to make sure he's locked correctly. 
+ */ + SCHED_STAT_INC(switch_preempt); mi_switch(SW_INVOL|SW_PREEMPT, td); + critical_enter(); + thread_unlock(ctd); + thread_lock(td); + critical_exit(); + return (1); } @@ -1820,7 +1851,7 @@ #endif ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); @@ -1830,8 +1861,15 @@ ("sched_add: bad thread state")); KASSERT(td->td_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); - KASSERT(ts->ts_runq == NULL, - ("sched_add: thread %p is still assigned to a run queue", td)); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); TD_SET_RUNQ(td); tdq = TDQ_SELF(); class = PRI_BASE(td->td_pri_class); @@ -1917,7 +1955,7 @@ CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); @@ -1939,7 +1977,7 @@ if (ts == NULL) return (0); - mtx_lock_spin(&sched_lock); + thread_lock(td); if (ts->ts_ticks) { int rtick; @@ -1949,7 +1987,7 @@ pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); return (pctcpu); } @@ -1959,7 +1997,7 @@ { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; if (ts->ts_flags & TSF_BOUND) sched_unbind(td); @@ -1979,7 +2017,7 @@ { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; if ((ts->ts_flags & TSF_BOUND) == 0) return; @@ 
-1992,18 +2030,19 @@ int sched_is_bound(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_sched->ts_flags & TSF_BOUND); } void sched_relinquish(struct thread *td) { - mtx_lock_spin(&sched_lock); + thread_lock(td); if (td->td_pri_class == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); + SCHED_STAT_INC(switch_relinquish); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } int @@ -2064,8 +2103,65 @@ p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); /* ULE Relies on preemption for idle interruption. */ - for (;;) - cpu_idle(); + for (;;) { + while (sched_runnable() == 0) + cpu_idle(); + thread_lock(td); + mi_switch(SW_VOL, NULL); + thread_unlock(td); + } +} + +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(NULL, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. 
+ */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. + */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); } static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); Index: kern/subr_prof.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_prof.c,v retrieving revision 1.76 diff -u -r1.76 subr_prof.c --- kern/subr_prof.c 16 Dec 2005 22:08:32 -0000 1.76 +++ kern/subr_prof.c 26 Feb 2007 07:21:21 -0000 @@ -484,9 +484,9 @@ td->td_profil_addr = pc; td->td_profil_ticks = ticks; td->td_pflags |= TDP_OWEUPC; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } Index: kern/subr_sleepqueue.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_sleepqueue.c,v retrieving revision 1.35 diff -u -r1.35 subr_sleepqueue.c --- kern/subr_sleepqueue.c 17 Dec 2006 00:14:20 -0000 1.35 +++ kern/subr_sleepqueue.c 26 Feb 2007 07:21:21 -0000 @@ -327,7 +327,6 @@ } TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq); td->td_sleepqueue = NULL; - mtx_lock_spin(&sched_lock); td->td_sqqueue = queue; td->td_wchan = wchan; td->td_wmesg = wmesg; @@ -335,7 +334,6 @@ td->td_flags |= TDF_SINTR; td->td_flags &= ~TDF_SLEEPABORT; } - mtx_unlock_spin(&sched_lock); } /* @@ -360,7 +358,8 @@ /* * Marks the pending sleep of the current thread as interruptible and * makes an initial check for pending signals before putting a thread - * to sleep. Return with sleep queue and scheduler lock held. + * to sleep. Enters and exits with the thread lock held. 
Thread lock + * may have transitioned from the sleepq lock to a run lock. */ static int sleepq_catch_signals(void *wchan) @@ -399,39 +398,38 @@ ret = ERESTART; mtx_unlock(&ps->ps_mtx); } - + /* + * Lock sleepq chain before unlocking proc + * without this, we could lose a race. + */ + mtx_lock_spin(&sc->sc_lock); + PROC_UNLOCK(p); + thread_lock(td); if (ret == 0) { - mtx_lock_spin(&sc->sc_lock); - /* - * Lock sched_lock before unlocking proc lock, - * without this, we could lose a race. - */ - mtx_lock_spin(&sched_lock); - PROC_UNLOCK(p); - if (!(td->td_flags & TDF_INTERRUPT)) + if (!(td->td_flags & TDF_INTERRUPT)) { + sleepq_switch(wchan); return (0); + } /* KSE threads tried unblocking us. */ ret = td->td_intrval; - mtx_unlock_spin(&sched_lock); - MPASS(ret == EINTR || ret == ERESTART); - } else { - PROC_UNLOCK(p); - mtx_lock_spin(&sc->sc_lock); + MPASS(ret == EINTR || ret == ERESTART || ret == EWOULDBLOCK); } /* * There were pending signals and this thread is still * on the sleep queue, remove it from the sleep queue. */ - sq = sleepq_lookup(wchan); - mtx_lock_spin(&sched_lock); - if (TD_ON_SLEEPQ(td)) + if (TD_ON_SLEEPQ(td)) { + sq = sleepq_lookup(wchan); sleepq_resume_thread(sq, td, -1); + } + mtx_unlock_spin(&sc->sc_lock); + MPASS(td->td_lock != &sc->sc_lock); return (ret); } /* - * Switches to another thread if we are still asleep on a sleep queue and - * drop the lock on the sleep queue chain. Returns with sched_lock held. + * Switches to another thread if we are still asleep on a sleep queue. + * Returns with thread lock. */ static void sleepq_switch(void *wchan) @@ -442,24 +440,18 @@ td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); - - /* - * If we have a sleep queue, then we've already been woken up, so - * just return. 
- */ + THREAD_LOCK_ASSERT(td, MA_OWNED); + /* We were removed */ if (td->td_sleepqueue != NULL) { - MPASS(!TD_ON_SLEEPQ(td)); mtx_unlock_spin(&sc->sc_lock); return; } + thread_lock_set(td, &sc->sc_lock); - /* - * Otherwise, actually go to sleep. - */ - mtx_unlock_spin(&sc->sc_lock); + MPASS(td->td_sleepqueue == NULL); sched_sleep(td); TD_SET_SLEEPING(td); + SCHED_STAT_INC(switch_sleepq); mi_switch(SW_VOL, NULL); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)", @@ -474,8 +466,8 @@ { struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); td = curthread; + THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If TDF_TIMEOUT is set, we timed out. @@ -500,6 +492,7 @@ else if (callout_stop(&td->td_slpcallout) == 0) { td->td_flags |= TDF_TIMEOUT; TD_SET_SLEEPING(td); + SCHED_STAT_INC(switch_sleepqtimo); mi_switch(SW_INVOL, NULL); } return (0); @@ -513,8 +506,8 @@ { struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); td = curthread; + THREAD_LOCK_ASSERT(td, MA_OWNED); /* We are no longer in an interruptible sleep. 
*/ if (td->td_flags & TDF_SINTR) @@ -537,11 +530,13 @@ void sleepq_wait(void *wchan) { + struct thread *td; - MPASS(!(curthread->td_flags & TDF_SINTR)); - mtx_lock_spin(&sched_lock); + td = curthread; + MPASS(!(td->td_flags & TDF_SINTR)); + thread_lock(td); sleepq_switch(wchan); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -555,12 +550,8 @@ int rval; rcatch = sleepq_catch_signals(wchan); - if (rcatch == 0) - sleepq_switch(wchan); - else - sleepq_release(wchan); rval = sleepq_check_signals(); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); if (rcatch) return (rcatch); return (rval); @@ -573,13 +564,16 @@ int sleepq_timedwait(void *wchan) { + struct thread *td; int rval; - MPASS(!(curthread->td_flags & TDF_SINTR)); - mtx_lock_spin(&sched_lock); + td = curthread; + MPASS(!(td->td_flags & TDF_SINTR)); + thread_lock(td); sleepq_switch(wchan); rval = sleepq_check_timeout(); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + return (rval); } @@ -593,13 +587,9 @@ int rcatch, rvalt, rvals; rcatch = sleepq_catch_signals(wchan); - if (rcatch == 0) - sleepq_switch(wchan); - else - sleepq_release(wchan); rvalt = sleepq_check_timeout(); rvals = sleepq_check_signals(); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); if (rcatch) return (rcatch); if (rvals) @@ -620,9 +610,9 @@ MPASS(sq->sq_wchan != NULL); MPASS(td->td_wchan == sq->sq_wchan); MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0); + THREAD_LOCK_ASSERT(td, MA_OWNED); sc = SC_LOOKUP(sq->sq_wchan); mtx_assert(&sc->sc_lock, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); /* Remove the thread from the queue. 
*/ TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq); @@ -679,10 +669,8 @@ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); - if (sq == NULL) { - sleepq_release(wchan); + if (sq == NULL) return; - } KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); @@ -698,10 +686,9 @@ besttd = td; } MPASS(besttd != NULL); - mtx_lock_spin(&sched_lock); + thread_lock(besttd); sleepq_resume_thread(sq, besttd, pri); - mtx_unlock_spin(&sched_lock); - sleepq_release(wchan); + thread_unlock(besttd); } /* @@ -711,6 +698,7 @@ sleepq_broadcast(void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; + struct thread *td; CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); @@ -724,11 +712,12 @@ ("%s: mismatch between sleep/wakeup and cv_*", __func__)); /* Resume all blocked threads on the sleep queue. */ - mtx_lock_spin(&sched_lock); - while (!TAILQ_EMPTY(&sq->sq_blocked[queue])) - sleepq_resume_thread(sq, TAILQ_FIRST(&sq->sq_blocked[queue]), - pri); - mtx_unlock_spin(&sched_lock); + while (!TAILQ_EMPTY(&sq->sq_blocked[queue])) { + td = TAILQ_FIRST(&sq->sq_blocked[queue]); + thread_lock(td); + sleepq_resume_thread(sq, td, pri); + thread_unlock(td); + } sleepq_release(wchan); } @@ -739,6 +728,7 @@ static void sleepq_timeout(void *arg) { + struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; void *wchan; @@ -751,38 +741,29 @@ * First, see if the thread is asleep and get the wait channel if * it is. 
*/ - mtx_lock_spin(&sched_lock); - if (TD_ON_SLEEPQ(td)) { + thread_lock(td); + if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { wchan = td->td_wchan; - mtx_unlock_spin(&sched_lock); - sleepq_lock(wchan); + sc = SC_LOOKUP(wchan); + MPASS(td->td_lock == &sc->sc_lock); sq = sleepq_lookup(wchan); - mtx_lock_spin(&sched_lock); - } else { - wchan = NULL; - sq = NULL; + MPASS(sq != NULL); + td->td_flags |= TDF_TIMEOUT; + sleepq_resume_thread(sq, td, -1); + thread_unlock(td); + return; } - /* - * At this point, if the thread is still on the sleep queue, - * we have that sleep queue locked as it cannot migrate sleep - * queues while we dropped sched_lock. If it had resumed and - * was on another CPU while the lock was dropped, it would have - * seen that TDF_TIMEOUT and TDF_TIMOFAIL are clear and the - * call to callout_stop() to stop this routine would have failed - * meaning that it would have already set TDF_TIMEOUT to - * synchronize with this function. + * If the thread is on the SLEEPQ but not sleeping and we have it + * locked it must be in sleepq_catch_signals(). Let it know we've + * timedout here so it can remove itself. */ if (TD_ON_SLEEPQ(td)) { - MPASS(td->td_wchan == wchan); - MPASS(sq != NULL); - td->td_flags |= TDF_TIMEOUT; - sleepq_resume_thread(sq, td, -1); - mtx_unlock_spin(&sched_lock); - sleepq_release(wchan); + td->td_flags |= TDF_TIMEOUT | TDF_INTERRUPT; + td->td_intrval = EWOULDBLOCK; + thread_unlock(td); return; - } else if (wchan != NULL) - sleepq_release(wchan); + } /* * Now check for the edge cases. First, if TDF_TIMEOUT is set, @@ -800,7 +781,7 @@ setrunnable(td); } else td->td_flags |= TDF_TIMOFAIL; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -820,33 +801,36 @@ MPASS(wchan != NULL); sleepq_lock(wchan); sq = sleepq_lookup(wchan); - mtx_lock_spin(&sched_lock); + /* + * We can not lock the thread here as it may be sleeping on a + * different sleepq. 
However, holding the sleepq lock for this + * wchan can guarantee that we do not miss a wakeup for this + * channel. The asserts below will catch any false positives. + */ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) { - mtx_unlock_spin(&sched_lock); sleepq_release(wchan); return; } - MPASS(sq != NULL); - /* Thread is asleep on sleep queue sq, so wake it up. */ + thread_lock(td); + MPASS(sq != NULL); + MPASS(td->td_wchan == wchan); sleepq_resume_thread(sq, td, -1); + thread_unlock(td); sleepq_release(wchan); - mtx_unlock_spin(&sched_lock); } /* * Abort a thread as if an interrupt had occurred. Only abort * interruptible waits (unfortunately it isn't safe to abort others). - * - * XXX: What in the world does the comment below mean? - * Also, whatever the signal code does... */ void sleepq_abort(struct thread *td, int intrval) { + struct sleepqueue *sq; void *wchan; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_flags & TDF_SINTR); MPASS(intrval == EINTR || intrval == ERESTART); @@ -860,14 +844,22 @@ CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_proc->p_comm); + td->td_intrval = intrval; + td->td_flags |= TDF_SLEEPABORT; + /* + * If the thread has not slept yet it will find the signal in + * sleepq_catch_signals() and call sleepq_resume_thread. Otherwise + * we have to do it here. + */ + if (!TD_IS_SLEEPING(td)) + return; wchan = td->td_wchan; - if (wchan != NULL) { - td->td_intrval = intrval; - td->td_flags |= TDF_SLEEPABORT; - } - mtx_unlock_spin(&sched_lock); - sleepq_remove(td, wchan); - mtx_lock_spin(&sched_lock); + MPASS(wchan != NULL); + sq = sleepq_lookup(wchan); + MPASS(sq != NULL); + + /* Thread is asleep on sleep queue sq, so wake it up. 
*/ + sleepq_resume_thread(sq, td, -1); } #ifdef DDB Index: kern/subr_smp.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_smp.c,v retrieving revision 1.197 diff -u -r1.197 subr_smp.c --- kern/subr_smp.c 24 Oct 2005 21:04:17 -0000 1.197 +++ kern/subr_smp.c 26 Feb 2007 07:21:21 -0000 @@ -159,7 +159,7 @@ * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); Index: kern/subr_taskqueue.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_taskqueue.c,v retrieving revision 1.38 diff -u -r1.38 subr_taskqueue.c --- kern/subr_taskqueue.c 23 Jan 2007 08:46:50 -0000 1.38 +++ kern/subr_taskqueue.c 26 Feb 2007 07:21:21 -0000 @@ -349,15 +349,15 @@ } else tq->tq_pcount++; } - mtx_lock_spin(&sched_lock); for (i = 0; i < count; i++) { if (tq->tq_pproc[i] == NULL) continue; td = FIRST_THREAD_IN_PROC(tq->tq_pproc[i]); + thread_lock(td); sched_prio(td, pri); sched_add(td, SRQ_BORING); + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); return (0); } Index: kern/subr_trap.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_trap.c,v retrieving revision 1.290 diff -u -r1.290 subr_trap.c --- kern/subr_trap.c 6 Dec 2006 06:34:55 -0000 1.290 +++ kern/subr_trap.c 26 Feb 2007 07:21:21 -0000 @@ -84,11 +84,11 @@ #ifdef DIAGNOSTIC /* Check that we called signotify() enough. 
*/ PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + thread_lock(td); if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 || (td->td_flags & TDF_ASTPENDING) == 0)) printf("failed to set signal flags properly for ast()\n"); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_UNLOCK(p); #endif @@ -167,6 +167,7 @@ WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); mtx_assert(&sched_lock, MA_NOTOWNED); + THREAD_LOCK_ASSERT(td, MA_NOTOWNED); td->td_frame = framep; td->td_pticks = 0; @@ -182,7 +183,7 @@ * AST's saved in sflag, the astpending flag will be set and * ast() will be called again. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); flags = td->td_flags; sflag = p->p_sflag; if (p->p_sflag & (PS_ALRMPEND | PS_PROFPEND | PS_XCPU)) @@ -191,10 +192,13 @@ if (p->p_sflag & PS_MACPEND) p->p_sflag &= ~PS_MACPEND; #endif + PROC_SUNLOCK(p); + thread_lock(td); td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDRESCHED | TDF_INTERRUPT); + thread_unlock(td); + /* XXX Needs atomic */ cnt.v_trap++; - mtx_unlock_spin(&sched_lock); /* * XXXKSE While the fact that we owe a user profiling @@ -236,14 +240,14 @@ if (sflag & PS_XCPU) { PROC_LOCK(p); lim_rlimit(p, RLIMIT_CPU, &rlim); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); killproc(p, "exceeded maximum CPU limit"); } else { if (p->p_cpulimit < rlim.rlim_max) p->p_cpulimit += 5; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); psignal(p, SIGXCPU); } PROC_UNLOCK(p); @@ -257,10 +261,11 @@ if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 1); #endif - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_prio(td, td->td_user_pri); + SCHED_STAT_INC(switch_needresched); mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 1); Index: kern/subr_turnstile.c 
=================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_turnstile.c,v retrieving revision 1.166 diff -u -r1.166 subr_turnstile.c --- kern/subr_turnstile.c 23 Jan 2007 08:46:50 -0000 1.166 +++ kern/subr_turnstile.c 26 Feb 2007 07:21:21 -0000 @@ -115,6 +115,7 @@ * q - td_contested lock */ struct turnstile { + struct mtx ts_lock; /* Spin lock for self. */ struct threadqueue ts_blocked[2]; /* (c + q) Blocked threads. */ struct threadqueue ts_pending; /* (c) Pending threads. */ LIST_ENTRY(turnstile) ts_hash; /* (c) Chain and free list. */ @@ -167,13 +168,20 @@ static void propagate_priority(struct thread *td) { - struct turnstile_chain *tc; struct turnstile *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td->td_blocked; + MPASS(td->td_lock == &ts->ts_lock); + /* + * Grab a recursive lock on this turnstile chain so it stays locked + * for the whole operation. The caller expects us to return with + * the original lock held. We only ever lock down the chain so + * the lock order is constant. + */ + mtx_lock_spin(&ts->ts_lock); for (;;) { td = ts->ts_owner; @@ -182,9 +190,12 @@ * This might be a read lock with no owner. There's * not much we can do, so just bail. */ + mtx_unlock_spin(&ts->ts_lock); return; } + thread_lock_flags(td, MTX_DUPOK); + mtx_unlock_spin(&ts->ts_lock); MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); @@ -209,8 +220,10 @@ * If this thread already has higher priority than the * thread that is being blocked, we are finished. */ - if (td->td_priority <= pri) + if (td->td_priority <= pri) { + thread_unlock(td); return; + } /* * Bump this thread's priority. 
@@ -223,6 +236,7 @@ */ if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) { MPASS(td->td_blocked == NULL); + thread_unlock(td); return; } @@ -247,15 +261,13 @@ */ ts = td->td_blocked; MPASS(ts != NULL); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_lock_spin(&tc->tc_lock); - + MPASS(td->td_lock == &ts->ts_lock); /* Resort td on the list if needed. */ if (!turnstile_adjust_thread(ts, td)) { - mtx_unlock_spin(&tc->tc_lock); + mtx_unlock_spin(&ts->ts_lock); return; } - mtx_unlock_spin(&tc->tc_lock); + /* The thread lock is released as ts lock above. */ } } @@ -266,11 +278,10 @@ static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td) { - struct turnstile_chain *tc; struct thread *td1, *td2; int queue; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_LOCK(td)); /* @@ -291,8 +302,7 @@ * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); + MPASS(td->td_lock == &ts->ts_lock); td1 = TAILQ_PREV(td, threadqueue, td_lockq); td2 = TAILQ_NEXT(td, td_lockq); if ((td1 != NULL && td->td_priority < td1->td_priority) || @@ -390,10 +400,8 @@ void turnstile_adjust(struct thread *td, u_char oldpri) { - struct turnstile_chain *tc; struct turnstile *ts; - mtx_assert(&sched_lock, MA_OWNED); MPASS(TD_ON_LOCK(td)); /* @@ -401,15 +409,12 @@ */ ts = td->td_blocked; MPASS(ts != NULL); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_lock_spin(&tc->tc_lock); + MPASS(td->td_lock == &ts->ts_lock); + mtx_assert(&ts->ts_lock, MA_OWNED); /* Resort the turnstile on the list. */ - if (!turnstile_adjust_thread(ts, td)) { - mtx_unlock_spin(&tc->tc_lock); + if (!turnstile_adjust_thread(ts, td)) return; - } - /* * If our priority was lowered and we are at the head of the * turnstile, then propagate our new priority up the chain. 
@@ -420,12 +425,8 @@ td->td_tsqueue == TS_SHARED_QUEUE); if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) && td->td_priority < oldpri) { - mtx_unlock_spin(&tc->tc_lock); - critical_enter(); propagate_priority(td); - critical_exit(); - } else - mtx_unlock_spin(&tc->tc_lock); + } } /* @@ -460,6 +461,7 @@ TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]); TAILQ_INIT(&ts->ts_pending); LIST_INIT(&ts->ts_free); + mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN | MTX_RECURSE); return (ts); } @@ -481,7 +483,7 @@ * Lock the turnstile chain associated with the specified lock. */ void -turnstile_lock(struct lock_object *lock) +turnstile_chain_lock(struct lock_object *lock) { struct turnstile_chain *tc; @@ -489,6 +491,45 @@ mtx_lock_spin(&tc->tc_lock); } +struct turnstile * +turnstile_trywait(struct lock_object *lock) +{ + struct turnstile_chain *tc; + struct turnstile *ts; + + tc = TC_LOOKUP(lock); + mtx_lock_spin(&tc->tc_lock); + LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) + if (ts->ts_lockobj == lock) { + mtx_lock_spin(&ts->ts_lock); + return (ts); + } + + ts = curthread->td_turnstile; + MPASS(ts != NULL); + mtx_lock_spin(&ts->ts_lock); + KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer")); + ts->ts_lockobj = lock; + + return (ts); +} + +void +turnstile_cancel(struct turnstile *ts) +{ + struct turnstile_chain *tc; + struct lock_object *lock; + + mtx_assert(&ts->ts_lock, MA_OWNED); + + mtx_unlock_spin(&ts->ts_lock); + lock = ts->ts_lockobj; + if (ts == curthread->td_turnstile) + ts->ts_lockobj = NULL; + tc = TC_LOOKUP(lock); + mtx_unlock_spin(&tc->tc_lock); +} + /* * Look up the turnstile for a lock in the hash table locking the associated * turnstile chain along the way. 
If no turnstile is found in the hash @@ -503,8 +544,10 @@ tc = TC_LOOKUP(lock); mtx_assert(&tc->tc_lock, MA_OWNED); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) - if (ts->ts_lockobj == lock) + if (ts->ts_lockobj == lock) { + mtx_lock_spin(&ts->ts_lock); return (ts); + } return (NULL); } @@ -512,7 +555,7 @@ * Unlock the turnstile chain associated with a given lock. */ void -turnstile_release(struct lock_object *lock) +turnstile_chain_unlock(struct lock_object *lock) { struct turnstile_chain *tc; @@ -541,16 +584,13 @@ * owner appropriately. */ void -turnstile_claim(struct lock_object *lock) +turnstile_claim(struct turnstile *ts) { - struct turnstile_chain *tc; - struct turnstile *ts; struct thread *td, *owner; + struct turnstile_chain *tc; - tc = TC_LOOKUP(lock); - mtx_assert(&tc->tc_lock, MA_OWNED); - ts = turnstile_lookup(lock); - MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); + MPASS(ts != curthread->td_turnstile); owner = curthread; mtx_lock_spin(&td_contested_lock); @@ -560,15 +600,18 @@ td = turnstile_first_waiter(ts); MPASS(td != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); - mtx_unlock_spin(&tc->tc_lock); + MPASS(td->td_lock == &ts->ts_lock); /* * Update the priority of the new owner if needed. */ - mtx_lock_spin(&sched_lock); + thread_lock(owner); if (td->td_priority < owner->td_priority) sched_lend_prio(owner, td->td_priority); - mtx_unlock_spin(&sched_lock); + thread_unlock(owner); + tc = TC_LOOKUP(ts->ts_lockobj); + mtx_unlock_spin(&ts->ts_lock); + mtx_unlock_spin(&tc->tc_lock); } /* @@ -578,31 +621,28 @@ * turnstile chain locked and will return with it unlocked. 
*/ void -turnstile_wait(struct lock_object *lock, struct thread *owner, int queue) +turnstile_wait(struct turnstile *ts, struct thread *owner, int queue) { struct turnstile_chain *tc; - struct turnstile *ts; struct thread *td, *td1; + struct lock_object *lock; td = curthread; - tc = TC_LOOKUP(lock); - mtx_assert(&tc->tc_lock, MA_OWNED); - MPASS(td->td_turnstile != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); if (queue == TS_SHARED_QUEUE) MPASS(owner != NULL); if (owner) MPASS(owner->td_proc->p_magic == P_MAGIC); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); - /* Look up the turnstile associated with the lock 'lock'. */ - ts = turnstile_lookup(lock); - /* * If the lock does not already have a turnstile, use this thread's * turnstile. Otherwise insert the current thread into the * turnstile already in use by this lock. */ - if (ts == NULL) { + tc = TC_LOOKUP(ts->ts_lockobj); + if (ts == td->td_turnstile) { + mtx_assert(&tc->tc_lock, MA_OWNED); #ifdef TURNSTILE_PROFILING tc->tc_depth++; if (tc->tc_depth > tc->tc_max_depth) { @@ -611,7 +651,7 @@ turnstile_max_depth = tc->tc_max_depth; } #endif - ts = td->td_turnstile; + tc = TC_LOOKUP(ts->ts_lockobj); LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash); KASSERT(TAILQ_EMPTY(&ts->ts_pending), ("thread's turnstile has pending threads")); @@ -621,8 +661,7 @@ ("thread's turnstile has shared waiters")); KASSERT(LIST_EMPTY(&ts->ts_free), ("thread's turnstile has a non-empty free list")); - KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer")); - ts->ts_lockobj = lock; + MPASS(ts->ts_lockobj != NULL); mtx_lock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); turnstile_setowner(ts, owner); @@ -641,58 +680,31 @@ MPASS(td->td_turnstile != NULL); LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash); } + thread_lock(td); + thread_lock_set(td, &ts->ts_lock); td->td_turnstile = NULL; - mtx_unlock_spin(&tc->tc_lock); - - mtx_lock_spin(&sched_lock); - /* - * Handle race 
condition where a thread on another CPU that owns - * lock 'lock' could have woken us in between us dropping the - * turnstile chain lock and acquiring the sched_lock. - */ - if (td->td_flags & TDF_TSNOBLOCK) { - td->td_flags &= ~TDF_TSNOBLOCK; - mtx_unlock_spin(&sched_lock); - return; - } - -#ifdef notyet - /* - * If we're borrowing an interrupted thread's VM context, we - * must clean up before going to sleep. - */ - if (td->td_ithd != NULL) { - struct ithd *it = td->td_ithd; - - if (it->it_interrupted) { - if (LOCK_LOG_TEST(lock, 0)) - CTR3(KTR_LOCK, "%s: %p interrupted %p", - __func__, it, it->it_interrupted); - intr_thd_fixup(it); - } - } -#endif /* Save who we are blocked on and switch. */ + lock = ts->ts_lockobj; td->td_tsqueue = queue; td->td_blocked = ts; td->td_lockname = lock->lo_name; TD_SET_LOCK(td); - critical_enter(); + mtx_unlock_spin(&tc->tc_lock); propagate_priority(td); - critical_exit(); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); + MPASS(td->td_lock == &ts->ts_lock); + SCHED_STAT_INC(switch_turnstile); mi_switch(SW_VOL, NULL); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); - - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -707,11 +719,10 @@ int empty; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL)); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* @@ -733,6 +744,8 @@ empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]); if (empty) { + tc = TC_LOOKUP(ts->ts_lockobj); + mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(LIST_EMPTY(&ts->ts_free)); #ifdef TURNSTILE_PROFILING tc->tc_depth--; @@ -758,9 
+771,14 @@ struct thread *td; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL)); + /* + * We must have the chain locked so that we can remove the empty + * turnstile from the hash queue. + */ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); @@ -800,15 +818,14 @@ turnstile_unpend(struct turnstile *ts, int owner_type) { TAILQ_HEAD( ,thread) pending_threads; - struct turnstile_chain *tc; + struct turnstile *nts; struct thread *td; u_char cp, pri; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread || (owner_type == TS_SHARED_LOCK && ts->ts_owner == NULL)); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(!TAILQ_EMPTY(&ts->ts_pending)); /* @@ -822,7 +839,15 @@ TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])) ts->ts_lockobj = NULL; #endif - + /* + * Adjust the priority of curthread based on other contested + * locks it owns. Don't lower the priority below the base + * priority however. + */ + td = curthread; + pri = PRI_MAX; + thread_lock(td); + mtx_lock_spin(&td_contested_lock); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will @@ -831,31 +856,17 @@ * lock. */ if (ts->ts_owner != NULL) { - mtx_lock_spin(&td_contested_lock); ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); - mtx_unlock_spin(&td_contested_lock); } - critical_enter(); - mtx_unlock_spin(&tc->tc_lock); - - /* - * Adjust the priority of curthread based on other contested - * locks it owns. Don't lower the priority below the base - * priority however. 
- */ - td = curthread; - pri = PRI_MAX; - mtx_lock_spin(&sched_lock); - mtx_lock_spin(&td_contested_lock); - LIST_FOREACH(ts, &td->td_contested, ts_link) { - cp = turnstile_first_waiter(ts)->td_priority; + LIST_FOREACH(nts, &td->td_contested, ts_link) { + cp = turnstile_first_waiter(nts)->td_priority; if (cp < pri) pri = cp; } mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); - + thread_unlock(td); /* * Wake up all the pending threads. If a thread is not blocked * on a lock, then it is currently executing on another CPU in @@ -867,22 +878,20 @@ td = TAILQ_FIRST(&pending_threads); TAILQ_REMOVE(&pending_threads, td, td_lockq); MPASS(td->td_proc->p_magic == P_MAGIC); - if (TD_ON_LOCK(td)) { - td->td_blocked = NULL; - td->td_lockname = NULL; + MPASS(TD_ON_LOCK(td)); + MPASS(td->td_lock == &ts->ts_lock); + td->td_blocked = NULL; + td->td_lockname = NULL; #ifdef INVARIANTS - td->td_tsqueue = 0xff; + td->td_tsqueue = 0xff; #endif - TD_CLR_LOCK(td); - MPASS(TD_CAN_RUN(td)); - sched_add(td, SRQ_BORING); - } else { - td->td_flags |= TDF_TSNOBLOCK; - MPASS(TD_IS_RUNNING(td) || TD_ON_RUNQ(td)); - } + TD_CLR_LOCK(td); + MPASS(TD_CAN_RUN(td)); + thread_lock(td); + sched_add(td, SRQ_BORING); + thread_unlock(td); } - critical_exit(); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&ts->ts_lock); } /* @@ -892,14 +901,12 @@ void turnstile_disown(struct turnstile *ts) { - struct turnstile_chain *tc; struct thread *td; u_char cp, pri; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(TAILQ_EMPTY(&ts->ts_pending)); MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) || !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); @@ -914,7 +921,6 @@ ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); mtx_unlock_spin(&td_contested_lock); - mtx_unlock_spin(&tc->tc_lock); /* * Adjust the priority of curthread based on other contested @@ -923,7 +929,7 @@ */ td = 
curthread; pri = PRI_MAX; - mtx_lock_spin(&sched_lock); + thread_lock(td); mtx_lock_spin(&td_contested_lock); LIST_FOREACH(ts, &td->td_contested, ts_link) { cp = turnstile_first_waiter(ts)->td_priority; @@ -932,7 +938,8 @@ } mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + mtx_unlock_spin(&ts->ts_lock); } /* @@ -942,12 +949,10 @@ turnstile_head(struct turnstile *ts, int queue) { #ifdef INVARIANTS - struct turnstile_chain *tc; MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); + mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_FIRST(&ts->ts_blocked[queue])); } @@ -959,12 +964,10 @@ turnstile_empty(struct turnstile *ts, int queue) { #ifdef INVARIANTS - struct turnstile_chain *tc; MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); + mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_EMPTY(&ts->ts_blocked[queue])); } Index: kern/subr_witness.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_witness.c,v retrieving revision 1.225 diff -u -r1.225 subr_witness.c --- kern/subr_witness.c 20 Feb 2007 23:49:30 -0000 1.225 +++ kern/subr_witness.c 26 Feb 2007 07:21:21 -0000 @@ -394,9 +394,12 @@ { "ng_worklist", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, + { "process slock", &lock_class_mtx_spin }, + { "umtx lock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, - { "sched lock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, + { "turnstile lock", &lock_class_mtx_spin }, + { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", 
&lock_class_mtx_spin }, @@ -418,7 +421,8 @@ #endif { "clk", &lock_class_mtx_spin }, { "mutex profiling lock", &lock_class_mtx_spin }, - { "kse zombie lock", &lock_class_mtx_spin }, + { "kse lock", &lock_class_mtx_spin }, + { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #ifdef __ia64__ { "MCA spin lock", &lock_class_mtx_spin }, Index: kern/sys_generic.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sys_generic.c,v retrieving revision 1.151 diff -u -r1.151 sys_generic.c --- kern/sys_generic.c 14 Oct 2006 19:01:55 -0000 1.151 +++ kern/sys_generic.c 26 Feb 2007 07:21:21 -0000 @@ -767,9 +767,9 @@ mtx_lock(&sellock); retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); error = selscan(td, ibits, obits, nd); @@ -792,12 +792,12 @@ * collisions and rescan the file descriptors if * necessary. */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait_sig(&selwait, &sellock, timo); @@ -809,9 +809,9 @@ done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_nosellock: @@ -947,9 +947,9 @@ mtx_lock(&sellock); retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); error = pollscan(td, bits, nfds); @@ -970,12 +970,12 @@ * sellock, so check TDF_SELECT and the number of collisions * and rescan the file descriptors if necessary. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait_sig(&selwait, &sellock, timo); @@ -987,9 +987,9 @@ done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_nosellock: @@ -1162,9 +1162,9 @@ } TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); sip->si_thread = NULL; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); sleepq_remove(td, &selwait); mtx_unlock(&sellock); } Index: kern/sys_process.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sys_process.c,v retrieving revision 1.141 diff -u -r1.141 sys_process.c --- kern/sys_process.c 26 Oct 2006 21:42:20 -0000 1.141 +++ kern/sys_process.c 26 Feb 2007 07:21:21 -0000 @@ -530,12 +530,12 @@ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_tid == pid) break; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (td2 != NULL) break; /* proc lock held */ PROC_UNLOCK(p); @@ -704,15 +704,15 @@ break; case PT_SUSPEND: - mtx_lock_spin(&sched_lock); + thread_lock(td2); td2->td_flags |= TDF_DBSUSPEND; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); break; case PT_RESUME: - mtx_lock_spin(&sched_lock); + thread_lock(td2); td2->td_flags &= ~TDF_DBSUSPEND; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); break; case PT_STEP: @@ -783,32 +783,35 @@ proctree_locked = 0; } /* deliver or queue signal */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); td2->td_flags &= ~TDF_XSIG; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); 
td2->td_xsig = data; p->p_xstat = data; p->p_xthread = NULL; if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (req == PT_DETACH) { struct thread *td3; - FOREACH_THREAD_IN_PROC(p, td3) + FOREACH_THREAD_IN_PROC(p, td3) { + thread_lock(td3); td3->td_flags &= ~TDF_DBSUSPEND; + thread_unlock(td3); + } } /* * unsuspend all threads, to not let a thread run, * you should use PT_SUSPEND to suspend it before * continuing process. */ - mtx_unlock_spin(&sched_lock); #ifdef KSE + PROC_SUNLOCK(p); thread_continued(p); + PROC_SLOCK(p); #endif p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED); - mtx_lock_spin(&sched_lock); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } if (data) @@ -971,13 +974,13 @@ buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK); tmp = 0; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (tmp >= num) break; buf[tmp++] = td2->td_tid; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); error = copyout(buf, addr, tmp * sizeof(lwpid_t)); free(buf, M_TEMP); Index: kern/tty.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/tty.c,v retrieving revision 1.268 diff -u -r1.268 tty.c --- kern/tty.c 20 Dec 2006 02:49:59 -0000 1.268 +++ kern/tty.c 26 Feb 2007 07:21:21 -0000 @@ -147,7 +147,9 @@ .d_flags = D_TTY | D_NEEDGIANT, }; -static int proc_compare(struct proc *p1, struct proc *p2); +static int proc_sum(struct proc *, int *); +static int proc_compare(struct proc *, struct proc *); +static int thread_compare(struct thread *, struct thread *); static int ttnread(struct tty *tp); static void ttyecho(int c, struct tty *tp); static int ttyoutput(int c, struct tty *tp); @@ -2528,7 +2530,7 @@ { struct timeval utime, stime; struct proc *p, *pick; - struct thread *td; + struct thread *td, *picktd; const char *stateprefix, *state; long rss; int load, pctcpu; @@ 
-2566,21 +2568,25 @@ /* * Pick the most interesting process and copy some of its - * state for printing later. sched_lock must be held for - * most parts of this. Holding it throughout is simplest - * and prevents even unimportant inconsistencies in the - * copy of the state, but may increase interrupt latency - * too much. + * state for printing later. This operation could rely on stale + * data as we can't hold the proc slock or thread locks over the + * whole list. However, we're guaranteed not to reference an exited + * thread or proc since we hold the tty locked. */ pick = NULL; - mtx_lock_spin(&sched_lock); LIST_FOREACH(p, &tp->t_pgrp->pg_members, p_pglist) if (proc_compare(pick, p)) pick = p; - /*^T can only show state for 1 thread. just pick the first. */ + PROC_SLOCK(pick); + picktd = NULL; td = FIRST_THREAD_IN_PROC(pick); + FOREACH_THREAD_IN_PROC(pick, td) + if (thread_compare(picktd, td)) + picktd = td; + td = picktd; stateprefix = ""; + thread_lock(td); if (TD_IS_RUNNING(td)) state = "running"; else if (TD_ON_RUNQ(td) || TD_CAN_RUN(td)) @@ -2601,11 +2607,12 @@ else state = "unknown"; pctcpu = (sched_pctcpu(td) * 10000 + FSCALE / 2) >> FSHIFT; + thread_unlock(td); if (pick->p_state == PRS_NEW || pick->p_state == PRS_ZOMBIE) rss = 0; else rss = pgtok(vmspace_resident_count(pick->p_vmspace)); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(pick); PROC_LOCK(pick); PGRP_UNLOCK(tp->t_pgrp); calcru(pick, &utime, &stime); @@ -2636,18 +2643,6 @@ * we pick out just "short-term" sleepers (P_SINTR == 0). * 4) Further ties are broken by picking the highest pid. 
*/ -#define ISRUN(p, val) \ -do { \ - struct thread *td; \ - val = 0; \ - FOREACH_THREAD_IN_PROC(p, td) { \ - if (TD_ON_RUNQ(td) || \ - TD_IS_RUNNING(td)) { \ - val = 1; \ - break; \ - } \ - } \ -} while (0) #define TESTAB(a, b) ((a)<<1 | (b)) #define ONLYA 2 @@ -2655,69 +2650,134 @@ #define BOTH 3 static int -proc_compare(struct proc *p1, struct proc *p2) +proc_sum(struct proc *p, int *estcpup) { - - int esta, estb; struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); - if (p1 == NULL) + int estcpu; + int val; + + val = 0; + estcpu = 0; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + if (TD_ON_RUNQ(td) || + TD_IS_RUNNING(td)) + val = 1; + estcpu += sched_pctcpu(td); + thread_unlock(td); + } + *estcpup = estcpu; + + return (val); +} + +static int +thread_compare(struct thread *td, struct thread *td2) +{ + int runa, runb; + int slpa, slpb; + fixpt_t esta, estb; + + if (td == NULL) return (1); - ISRUN(p1, esta); - ISRUN(p2, estb); - + /* + * Fetch running stats, pctcpu usage, and interruptable flag. 
+ */ + thread_lock(td); + runa = TD_IS_RUNNING(td) | TD_ON_RUNQ(td); + slpa = td->td_flags & TDF_SINTR; + esta = sched_pctcpu(td); + thread_unlock(td); + thread_lock(td2); + runb = TD_IS_RUNNING(td2) | TD_ON_RUNQ(td2); + estb = sched_pctcpu(td2); + slpb = td2->td_flags & TDF_SINTR; + thread_unlock(td2); /* * see if at least one of them is runnable */ - switch (TESTAB(esta, estb)) { + switch (TESTAB(runa, runb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: - /* - * tie - favor one with highest recent cpu utilization - */ - esta = estb = 0; - FOREACH_THREAD_IN_PROC(p1, td) - esta += td->td_estcpu; - FOREACH_THREAD_IN_PROC(p2, td) - estb += td->td_estcpu; - if (estb > esta) - return (1); - if (esta > estb) - return (0); - return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + break; } /* - * weed out zombies + * favor one with highest recent cpu utilization */ - switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { - case ONLYA: + if (estb > esta) return (1); - case ONLYB: + if (esta > estb) + return (0); + /* + * favor one sleeping in a non-interruptible sleep + */ + switch (TESTAB(slpa, slpb)) { + case ONLYA: return (0); + case ONLYB: + return (1); case BOTH: - return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + break; } -#if 0 /* XXXKSE */ + return (td < td2); +} + +static int +proc_compare(struct proc *p1, struct proc *p2) +{ + + int runa, runb; + fixpt_t esta, estb; + + if (p1 == NULL) + return (1); + /* - * pick the one with the smallest sleep time + * Fetch various stats about these processes. After we drop the + * lock the information could be stale but the race is unimportant. 
*/ - if (p2->p_slptime > p1->p_slptime) + PROC_SLOCK(p1); + runa = proc_sum(p1, &esta); + PROC_SUNLOCK(p1); + PROC_SLOCK(p2); + runb = proc_sum(p2, &estb); + PROC_SUNLOCK(p2); + + /* + * see if at least one of them is runnable + */ + switch (TESTAB(runa, runb)) { + case ONLYA: return (0); - if (p1->p_slptime > p2->p_slptime) + case ONLYB: return (1); + case BOTH: + break; + } /* - * favor one sleeping in a non-interruptible sleep + * favor one with highest recent cpu utilization */ - if (p1->p_sflag & PS_SINTR && (p2->p_sflag & PS_SINTR) == 0) + if (estb > esta) return (1); - if (p2->p_sflag & PS_SINTR && (p1->p_sflag & PS_SINTR) == 0) + if (esta > estb) return (0); -#endif + /* + * weed out zombies + */ + switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { + case ONLYA: + return (1); + case ONLYB: + return (0); + case BOTH: + break; + } + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } Index: netncp/ncp_sock.c =================================================================== RCS file: /usr/home/ncvs/src/sys/netncp/ncp_sock.c,v retrieving revision 1.17 diff -u -r1.17 ncp_sock.c --- netncp/ncp_sock.c 3 Aug 2006 15:31:52 -0000 1.17 +++ netncp/ncp_sock.c 26 Feb 2007 07:21:41 -0000 @@ -190,9 +190,9 @@ /* Fake up enough state to look like we are in poll(2). */ mtx_lock(&sellock); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); TAILQ_INIT(&td->td_selq); @@ -201,9 +201,9 @@ /* Tear down the fake poll(2) state. 
*/ mtx_lock(&sellock); clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); return (revents); @@ -230,9 +230,9 @@ retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); TAILQ_INIT(&td->td_selq); @@ -258,12 +258,12 @@ * the process, test TDF_SELECT and rescan file descriptors if * necessary. */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait(&selwait, &sellock, timo); @@ -275,9 +275,9 @@ done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_noproclock: Index: netsmb/smb_trantcp.c =================================================================== RCS file: /usr/home/ncvs/src/sys/netsmb/smb_trantcp.c,v retrieving revision 1.24 diff -u -r1.24 smb_trantcp.c --- netsmb/smb_trantcp.c 3 Aug 2006 15:31:52 -0000 1.24 +++ netsmb/smb_trantcp.c 26 Feb 2007 07:21:41 -0000 @@ -115,9 +115,9 @@ retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); /* XXX: Should be done when the thread is initialized. */ @@ -144,12 +144,12 @@ * the process, test P_SELECT and rescan file descriptors if * necessary. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait(&selwait, &sellock, timo); @@ -161,9 +161,9 @@ done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_noproclock: Index: pc98/pc98/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/pc98/pc98/machdep.c,v retrieving revision 1.388 diff -u -r1.388 machdep.c --- pc98/pc98/machdep.c 28 Jan 2007 07:19:14 -0000 1.388 +++ pc98/pc98/machdep.c 26 Feb 2007 07:21:43 -0000 @@ -1055,9 +1055,9 @@ #ifdef SMP /* Schedule ourselves on the indicated cpu. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu_id); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. 
*/ @@ -1068,9 +1068,9 @@ intr_restore(reg); #ifdef SMP - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Index: security/mac_lomac/mac_lomac.c =================================================================== RCS file: /usr/home/ncvs/src/sys/security/mac_lomac/mac_lomac.c,v retrieving revision 1.45 diff -u -r1.45 mac_lomac.c --- security/mac_lomac/mac_lomac.c 6 Feb 2007 14:19:24 -0000 1.45 +++ security/mac_lomac/mac_lomac.c 26 Feb 2007 07:21:46 -0000 @@ -535,10 +535,10 @@ subj->mac_lomac.ml_rangelow = objlabel->ml_single; subj->mac_lomac.ml_rangehigh = objlabel->ml_single; subj->mac_lomac.ml_flags |= MAC_LOMAC_FLAG_UPDATE; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); curthread->td_flags |= TDF_ASTPENDING; curthread->td_proc->p_sflag |= PS_MACPEND; - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); /* * Avoid memory allocation while holding a mutex; cache the Index: sparc64/sparc64/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sparc64/sparc64/mp_machdep.c,v retrieving revision 1.33 diff -u -r1.33 mp_machdep.c --- sparc64/sparc64/mp_machdep.c 3 Sep 2006 21:20:21 -0000 1.33 +++ sparc64/sparc64/mp_machdep.c 26 Feb 2007 07:21:48 -0000 @@ -362,12 +362,8 @@ while (csa->csa_count != 0) ; - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - spinlock_exit(); - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - cpu_throw(NULL, choosethread()); /* doesn't return */ + /* ok, now enter the scheduler */ + sched_throw(NULL); } void Index: sun4v/sun4v/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sun4v/sun4v/mp_machdep.c,v retrieving revision 1.6 diff -u -r1.6 mp_machdep.c --- sun4v/sun4v/mp_machdep.c 2 Feb 2007 05:00:21 -0000 1.6 +++ sun4v/sun4v/mp_machdep.c 26 Feb 2007 
07:21:49 -0000 @@ -404,12 +404,7 @@ while (csa->csa_count != 0) ; /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - spinlock_exit(); - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - - cpu_throw(NULL, choosethread()); /* doesn't return */ + sched_throw(NULL); } void Index: sys/mutex.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/mutex.h,v retrieving revision 1.87 diff -u -r1.87 mutex.h --- sys/mutex.h 21 Dec 2006 22:42:18 -0000 1.87 +++ sys/mutex.h 26 Feb 2007 07:21:50 -0000 @@ -343,6 +343,7 @@ */ extern struct mtx sched_lock; extern struct mtx Giant; +extern struct mtx blocked_lock; /* * Giant lock manipulation and clean exit macros. Index: sys/proc.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/proc.h,v retrieving revision 1.471 diff -u -r1.471 proc.h --- sys/proc.h 23 Jan 2007 08:46:50 -0000 1.471 +++ sys/proc.h 26 Feb 2007 07:21:50 -0000 @@ -194,26 +194,13 @@ * other than CPU cycles, which are parceled out to the threads. */ -/*************** - * Threads are the unit of execution - With a single run queue used by all processors: - - RUNQ: --->THREAD---THREAD--... SLEEPQ:[]---THREAD---THREAD---THREAD - []---THREAD - [] - []---THREAD---THREAD - -With PER-CPU run queues: -it gets more complicated. - * - *****************/ - /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { + volatile struct mtx *td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ @@ -293,7 +280,7 @@ TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING - } td_state; + } td_state; /* (j) thread state */ register_t td_retval[2]; /* (k) Syscall aux returns. */ struct callout td_slpcallout; /* (h) Callout for sleep. 
*/ struct trapframe *td_frame; /* (k) */ @@ -309,6 +296,15 @@ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ }; +void thread_lock(struct thread *); +void thread_lock_flags(struct thread *, int); +struct mtx *thread_lock_block(struct thread *); +void thread_lock_unblock(struct thread *, struct mtx *); +void thread_lock_set(struct thread *, struct mtx *); +void thread_unlock(struct thread *); +#define THREAD_LOCK_ASSERT(td, type) \ + mtx_assert(__DEVOLATILE(struct mtx *, (td)->td_lock), (type)) + /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. @@ -320,22 +316,22 @@ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_SELECT 0x00000040 /* Selecting; wakeup/waiting danger. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ -#define TDF_TSNOBLOCK 0x00000100 /* Don't block on a turnstile due to race. */ +#define TDF_UNUSEDx100 0x00000100 /* --available-- */ #define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ #define TDF_INTERRUPT 0x00002000 /* Thread is marked as interrupted. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ -#define TDF_UNUSED15 0x00008000 /* --available -- */ +#define TDF_UNUSED15 0x00008000 /* --available-- */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_XSIG 0x00040000 /* Thread is exchanging signal under trace */ #define TDF_UNUSED19 0x00080000 /* Thread is sleeping on a umtx. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. 
*/ #define TDF_DBSUSPEND 0x00200000 /* Thread is suspended by debugger */ -#define TDF_UNUSED22 0x00400000 /* --available -- */ -#define TDF_UNUSED23 0x00800000 /* --available -- */ +#define TDF_UNUSED22 0x00400000 /* --available-- */ +#define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ @@ -471,7 +467,8 @@ */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ - TAILQ_HEAD(, thread) p_threads; /* (j)(td_plist) Threads. (shortcut) */ + TAILQ_HEAD(, thread) p_threads; /* (j) all threads. */ + struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ @@ -479,7 +476,7 @@ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Process limits. */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ - TAILQ_HEAD(, kse_upcall) p_upcalls; /* All upcalls in the proc. */ + TAILQ_HEAD(, kse_upcall) p_upcalls; /* (j) All upcalls in the proc. */ /* * The following don't make too much sense. @@ -492,7 +489,6 @@ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) S* process status. */ - pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ @@ -536,8 +532,6 @@ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ /* from ksegrp */ - u_int p_estcpu; /* (j) Sum of the field in threads. */ - u_int p_slptime; /* (j) How long completely blocked. */ int p_numupcalls; /* (j) Num upcalls. */ int p_upsleeps; /* (c) Num threads in kse_release(). 
*/ struct kse_thr_mailbox *p_completed; /* (c) Completed thread mboxes. */ @@ -580,6 +574,9 @@ #define NOCPU 0xff /* For when we aren't on a CPU. */ +#define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) +#define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) +#define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ @@ -844,8 +841,8 @@ void threadinit(void); void cpu_idle(void); extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */ -void cpu_switch(struct thread *old, struct thread *new); -void cpu_throw(struct thread *old, struct thread *new) __dead2; +void cpu_switch(struct thread *, struct thread *); +void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); @@ -855,6 +852,7 @@ void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); /* New in KSE. */ +void kse_unlink(struct thread *); void kse_GC(void); void kseinit(void); void cpu_set_upcall(struct thread *td, struct thread *td0); @@ -883,6 +881,7 @@ void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); +void thread_suspend_switch(struct thread *); void thread_suspend_one(struct thread *td); struct thread *thread_switchout(struct thread *td, int flags, struct thread *newtd); Index: sys/sched.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/sched.h,v retrieving revision 1.31 diff -u -r1.31 sched.h --- sys/sched.h 23 Jan 2007 08:46:50 -0000 1.31 +++ sys/sched.h 26 Feb 2007 07:21:50 -0000 @@ -81,6 +81,7 @@ */ void sched_exit(struct proc *p, struct thread *childtd); void sched_fork(struct thread *td, struct thread *childtd); +void sched_fork_exit(struct thread *td); /* * KSE Groups contain scheduling priority information. 
They record the @@ -101,6 +102,7 @@ void sched_prio(struct thread *td, u_char prio); void sched_sleep(struct thread *td); void sched_switch(struct thread *td, struct thread *newtd, int flags); +void sched_throw(struct thread *td); void sched_unlend_prio(struct thread *td, u_char prio); void sched_unlend_user_prio(struct thread *td, u_char pri); void sched_user_prio(struct thread *td, u_char prio); @@ -155,6 +157,20 @@ #define SRQ_PREEMPTED 0x0008 /* has been preempted.. be kind */ #define SRQ_BORROWING 0x0010 /* Priority updated due to prio_lend */ +#define SCHED_STATS +/* Switch stats. */ +#ifdef SCHED_STATS +extern long switch_preempt; +extern long switch_owepreempt; +extern long switch_turnstile; +extern long switch_sleepq; +extern long switch_sleepqtimo; +extern long switch_relinquish; +extern long switch_needresched; +#define SCHED_STAT_INC(var) atomic_add_long(&(var), 1) +#else +#define SCHED_STAT_INC(var) +#endif /* temporarily here */ void schedinit(void); @@ -162,7 +178,6 @@ void sched_set_concurrency(struct proc *p, int cuncurrency); void sched_schedinit(void); void sched_newproc(struct proc *p, struct thread *td); -void sched_thread_exit(struct thread *td); void sched_newthread(struct thread *td); #endif /* _KERNEL */ Index: sys/turnstile.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/turnstile.h,v retrieving revision 1.11 diff -u -r1.11 turnstile.h --- sys/turnstile.h 18 Apr 2006 18:21:38 -0000 1.11 +++ sys/turnstile.h 26 Feb 2007 07:21:51 -0000 @@ -91,17 +91,19 @@ void turnstile_adjust(struct thread *, u_char); struct turnstile *turnstile_alloc(void); void turnstile_broadcast(struct turnstile *, int); -void turnstile_claim(struct lock_object *); +void turnstile_cancel(struct turnstile *); +void turnstile_chain_lock(struct lock_object *); +void turnstile_chain_unlock(struct lock_object *); +void turnstile_claim(struct turnstile *); void turnstile_disown(struct turnstile *); int 
turnstile_empty(struct turnstile *ts, int queue); void turnstile_free(struct turnstile *); struct thread *turnstile_head(struct turnstile *, int); -void turnstile_lock(struct lock_object *); struct turnstile *turnstile_lookup(struct lock_object *); -void turnstile_release(struct lock_object *); int turnstile_signal(struct turnstile *, int); +struct turnstile *turnstile_trywait(struct lock_object *); void turnstile_unpend(struct turnstile *, int); -void turnstile_wait(struct lock_object *, struct thread *, int); +void turnstile_wait(struct turnstile *, struct thread *, int); #endif /* _KERNEL */ #endif /* _SYS_TURNSTILE_H_ */ Index: ufs/ffs/ffs_snapshot.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ufs/ffs/ffs_snapshot.c,v retrieving revision 1.133 diff -u -r1.133 ffs_snapshot.c --- ufs/ffs/ffs_snapshot.c 23 Jan 2007 10:01:18 -0000 1.133 +++ ufs/ffs/ffs_snapshot.c 26 Feb 2007 07:21:51 -0000 @@ -389,12 +389,15 @@ * Recind nice scheduling while running with the filesystem suspended. */ if (td->td_proc->p_nice > 0) { - PROC_LOCK(td->td_proc); - mtx_lock_spin(&sched_lock); - saved_nice = td->td_proc->p_nice; - sched_nice(td->td_proc, 0); - mtx_unlock_spin(&sched_lock); - PROC_UNLOCK(td->td_proc); + struct proc *p; + + p = td->td_proc; + PROC_LOCK(p); + PROC_SLOCK(p); + saved_nice = p->p_nice; + sched_nice(p, 0); + PROC_SUNLOCK(p); + PROC_UNLOCK(p); } /* * Suspend operation on filesystem. 
@@ -808,10 +811,13 @@ bawrite(sbp); out: if (saved_nice > 0) { - PROC_LOCK(td->td_proc); - mtx_lock_spin(&sched_lock); + struct proc *p; + + p = td->td_proc; + PROC_LOCK(p); + PROC_SLOCK(p); sched_nice(td->td_proc, saved_nice); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); Index: vm/vm_glue.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_glue.c,v retrieving revision 1.219 diff -u -r1.219 vm_glue.c --- vm/vm_glue.c 23 Jan 2007 08:46:51 -0000 1.219 +++ vm/vm_glue.c 26 Feb 2007 07:21:53 -0000 @@ -619,24 +619,26 @@ * busy swapping it in. */ ++p->p_lock; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag |= PS_SWAPPINGIN; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag &= ~PS_SWAPPINGIN; p->p_sflag |= PS_INMEM; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) setrunnable(td); + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); wakeup(&p->p_sflag); @@ -672,9 +674,9 @@ loop: if (vm_page_count_min()) { VM_WAIT; - mtx_lock_spin(&sched_lock); + thread_lock(&thread0); proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } @@ -685,13 +687,14 @@ if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { continue; } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { /* * An otherwise runnable thread of a process * swapped out has only the TDI_SWAPPED bit set. * */ + thread_lock(td); if (td->td_inhibitors == TDI_SWAPPED) { pri = p->p_swtime + td->td_slptime; if ((p->p_sflag & PS_SWAPINREQ) == 0) { @@ -709,8 +712,9 @@ ppri = pri; } } + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } sx_sunlock(&allproc_lock); @@ -718,13 +722,13 @@ * Nothing to do, back to sleep. 
*/ if ((p = pp) == NULL) { - mtx_lock_spin(&sched_lock); + thread_lock(&thread0); if (!proc0_rescan) { TD_SET_IWAIT(&thread0); mi_switch(SW_VOL, NULL); } proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } PROC_LOCK(p); @@ -736,15 +740,15 @@ */ if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { PROC_UNLOCK(p); - mtx_lock_spin(&sched_lock); + thread_lock(&thread0); proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag &= ~PS_SWAPINREQ; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); /* * We would like to bring someone in. (only if there is space). @@ -752,10 +756,12 @@ */ faultin(p); PROC_UNLOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_swtime = 0; + PROC_SUNLOCK(p); + thread_lock(&thread0); proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } @@ -763,7 +769,8 @@ { struct thread *td = &thread0; - + /* XXX This will probably cause a LOR in some cases */ + thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR2(KTR_INTR, "%s: sched_add %d", __func__, 0); TD_CLR_IWAIT(td); @@ -773,6 +780,7 @@ CTR2(KTR_INTR, "%s: state %d", __func__, td->td_state); } + thread_unlock(td); } @@ -821,12 +829,12 @@ * creation. It may have no * address space or lock yet. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_state == PRS_NEW) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); continue; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); /* * An aio daemon switches its @@ -876,7 +884,7 @@ break; case PRS_NORMAL: - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * do not swapout a realtime process * Check all the thread groups.. 
@@ -929,7 +937,7 @@ (minslptime > swap_idle_threshold2))) { swapout(p); didswap++; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); vmspace_free(vm); @@ -937,7 +945,7 @@ goto retry; } nextproc: - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } nextproc2: PROC_UNLOCK(p); @@ -962,7 +970,7 @@ struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); + mtx_assert(&p->p_slock, MA_OWNED | MA_NOTRECURSED); #if defined(SWAP_DEBUG) printf("swapping out %d\n", p->p_pid); #endif @@ -996,15 +1004,18 @@ p->p_sflag &= ~PS_INMEM; p->p_sflag |= PS_SWAPPINGOUT; PROC_UNLOCK(p); - FOREACH_THREAD_IN_PROC(p, td) + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); TD_SET_SWAPPED(td); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + } + PROC_SUNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapout(td); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag &= ~PS_SWAPPINGOUT; p->p_swtime = 0; } Index: vm/vm_meter.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_meter.c,v retrieving revision 1.91 diff -u -r1.91 vm_meter.c --- vm/vm_meter.c 20 Nov 2006 08:33:55 -0000 1.91 +++ vm/vm_meter.c 26 Feb 2007 07:21:53 -0000 @@ -131,17 +131,21 @@ FOREACH_PROC_IN_SYSTEM(p) { if (p->p_flag & P_SYSTEM) continue; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); switch (p->p_state) { case PRS_NEW: - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); continue; break; default: FOREACH_THREAD_IN_PROC(p, td) { /* Need new statistics XXX */ + thread_lock(td); switch (td->td_state) { case TDS_INHIBITED: + /* + * XXX stats no longer synchronized. + */ if (TD_ON_LOCK(td) || (td->td_inhibitors == TDI_SWAPPED)) { @@ -162,13 +166,15 @@ case TDS_RUNQ: case TDS_RUNNING: total.t_rq++; + thread_unlock(td); continue; default: break; } + thread_unlock(td); } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); /* * Note active objects. 
*/ Index: vm/vm_pageout.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_pageout.c,v retrieving revision 1.279 diff -u -r1.279 vm_pageout.c --- vm/vm_pageout.c 7 Feb 2007 06:37:30 -0000 1.279 +++ vm/vm_pageout.c 26 Feb 2007 07:21:53 -0000 @@ -1246,22 +1246,24 @@ * If the process is in a non-running type state, * don't touch it. Check all the threads individually. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { + thread_unlock(td); breakout = 1; break; } + thread_unlock(td); } + PROC_SUNLOCK(p); if (breakout) { - mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); continue; } - mtx_unlock_spin(&sched_lock); /* * get the process size */ @@ -1287,9 +1289,9 @@ sx_sunlock(&allproc_lock); if (bigproc != NULL) { killproc(bigproc, "out of swap space"); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(bigproc); sched_nice(bigproc, PRIO_MIN); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(bigproc); PROC_UNLOCK(bigproc); wakeup(&cnt.v_free_count); } @@ -1594,17 +1596,20 @@ * if the process is in a non-running type state, * don't touch it. 
*/ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { + thread_unlock(td); breakout = 1; break; } + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (breakout) { PROC_UNLOCK(p); continue; Index: vm/vm_zeroidle.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_zeroidle.c,v retrieving revision 1.44 diff -u -r1.44 vm_zeroidle.c --- vm/vm_zeroidle.c 11 Feb 2007 05:18:40 -0000 1.44 +++ vm/vm_zeroidle.c 26 Feb 2007 07:21:53 -0000 @@ -145,9 +145,9 @@ vm_page_zero_idle(); #ifndef PREEMPTION if (sched_runnable()) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } #endif } else { @@ -176,11 +176,11 @@ PROC_LOCK(pagezero_proc); pagezero_proc->p_flag |= P_NOLOAD; PROC_UNLOCK(pagezero_proc); - mtx_lock_spin(&sched_lock); td = FIRST_THREAD_IN_PROC(pagezero_proc); + thread_lock(td); sched_class(td, PRI_IDLE); sched_prio(td, PRI_MAX_IDLE); sched_add(td, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } SYSINIT(pagezero, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, pagezero_start, NULL)