Index: amd64/amd64/cpu_switch.S =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/cpu_switch.S,v retrieving revision 1.156 diff -u -p -r1.156 cpu_switch.S --- amd64/amd64/cpu_switch.S 30 Mar 2007 00:06:20 -0000 1.156 +++ amd64/amd64/cpu_switch.S 31 May 2007 23:15:57 -0000 @@ -73,19 +73,16 @@ ENTRY(cpu_throw) movq TD_PCB(%rsi),%rdx /* newtd->td_proc */ movq PCB_CR3(%rdx),%rdx movq %rdx,%cr3 /* new address space */ - /* set bit in new pm_active */ - movq TD_PROC(%rsi),%rdx - movq P_VMSPACE(%rdx), %rdx - LK btsl %eax, VM_PMAP+PM_ACTIVE(%rdx) /* set new */ - jmp sw1 + jmp swact /* - * cpu_switch(old, new) + * cpu_switch(old, new, mtx) * * Save the current thread state, then select the next thread to run * and load its state. * %rdi = oldtd * %rsi = newtd + * %rdx = mtx */ ENTRY(cpu_switch) /* Switch to new thread. First, save context. */ @@ -147,17 +144,33 @@ ENTRY(cpu_switch) movq TD_PCB(%rsi),%r8 /* switch address space */ - movq PCB_CR3(%r8),%rdx + movq PCB_CR3(%r8),%rcx movq %cr3,%rax - cmpq %rdx,%rax /* Same address space? */ - je sw1 - movq %rdx,%cr3 /* new address space */ - + cmpq %rcx,%rax /* Same address space? */ + jne swinact + movq %rdx, TD_LOCK(%rdi) /* Release the old thread */ + /* Wait for the new thread to become unblocked */ + movq $blocked_lock, %rdx +1: + movq TD_LOCK(%rsi),%rcx + cmpq %rcx, %rdx + je 1b + jmp sw1 +swinact: + movq %rcx,%cr3 /* new address space */ movl PCPU(CPUID), %eax /* Release bit from old pmap->pm_active */ - movq TD_PROC(%rdi), %rdx /* oldproc */ - movq P_VMSPACE(%rdx), %rdx - LK btrl %eax, VM_PMAP+PM_ACTIVE(%rdx) /* clear old */ + movq TD_PROC(%rdi), %rcx /* oldproc */ + movq P_VMSPACE(%rcx), %rcx + LK btrl %eax, VM_PMAP+PM_ACTIVE(%rcx) /* clear old */ + movq %rdx, TD_LOCK(%rdi) /* Release the old thread */ +swact: + /* Wait for the new thread to become unblocked */ + movq $blocked_lock, %rdx +1: + movq TD_LOCK(%rsi),%rcx + cmpq %rcx, %rdx + je 1b /* Set bit in new pmap->pm_active */ movq TD_PROC(%rsi),%rdx /* newproc */ Index: amd64/amd64/genassym.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/genassym.c,v retrieving revision 1.161 diff -u -p -r1.161 genassym.c --- amd64/amd64/genassym.c 30 Mar 2007 00:06:20 -0000 1.161 +++ amd64/amd64/genassym.c 18 May 2007 10:37:00 -0000 @@ -76,6 +76,7 @@ ASSYM(VM_PMAP, offsetof(struct vmspace, ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); +ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); Index: amd64/amd64/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/machdep.c,v retrieving revision 1.672 diff -u -p -r1.672 machdep.c --- amd64/amd64/machdep.c 31 May 2007 22:52:10 -0000 1.672 +++ amd64/amd64/machdep.c 31 May 2007 20:40:12 -0000 @@ -460,9 +460,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * #ifdef SMP /* Schedule ourselves on the indicated cpu. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu_id); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. 
*/ @@ -473,9 +473,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * intr_restore(reg); #ifdef SMP - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Index: amd64/amd64/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/mp_machdep.c,v retrieving revision 1.285 diff -u -p -r1.285 mp_machdep.c --- amd64/amd64/mp_machdep.c 19 May 2007 05:03:59 -0000 1.285 +++ amd64/amd64/mp_machdep.c 20 May 2007 11:40:23 -0000 @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD: src/sys/amd64/amd64/ #include #include #include +#include #include #include @@ -590,25 +591,7 @@ init_secondary(void) while (smp_started == 0) ia32_pause(); - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - - /* - * Correct spinlock nesting. The idle thread context that we are - * borrowing was created so that it would start out with a single - * spin lock (sched_lock) held in fork_trampoline(). Since we've - * explicitly acquired locks in this function, the nesting count - * is now 2 rather than 1. Since we are nested, calling - * spinlock_exit() will simply adjust the counts without allowing - * spin lock using code to interrupt us. - */ - spinlock_exit(); - KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); - - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - - cpu_throw(NULL, choosethread()); /* doesn't return */ + sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ @@ -988,12 +971,12 @@ ipi_bitmap_handler(struct trapframe fram if (ipi_bitmap & (1 << IPI_PREEMPT)) { struct thread *running_thread = curthread; - mtx_lock_spin(&sched_lock); + thread_lock(running_thread); if (running_thread->td_critnest > 1) running_thread->td_owepreempt = 1; else mi_switch(SW_INVOL | SW_PREEMPT, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(running_thread); } /* Nothing to do for AST */ @@ -1177,11 +1160,9 @@ release_aps(void *dummy __unused) if (mp_ncpus == 1) return; - mtx_lock_spin(&sched_lock); atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); - mtx_unlock_spin(&sched_lock); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); Index: amd64/amd64/mp_watchdog.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/mp_watchdog.c,v retrieving revision 1.4 diff -u -p -r1.4 mp_watchdog.c --- amd64/amd64/mp_watchdog.c 28 Feb 2005 08:55:53 -0000 1.4 +++ amd64/amd64/mp_watchdog.c 31 May 2007 21:22:05 -0000 @@ -105,9 +105,7 @@ watchdog_function(void *arg) * locks to make sure. Then reset the timer. */ mtx_lock(&Giant); - mtx_lock_spin(&sched_lock); watchdog_timer = WATCHDOG_THRESHOLD; - mtx_unlock_spin(&sched_lock); mtx_unlock(&Giant); callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL); } @@ -156,34 +154,6 @@ SYSCTL_PROC(_debug, OID_AUTO, watchdog, sysctl_watchdog, "I", ""); /* - * A badly behaved sysctl that leaks the sched lock when written to. Then - * spin holding it just to make matters worse. This can be used to test the - * effectiveness of the watchdog by generating a fairly hard and nast hang. - * Note that Giant is also held in the current world order when we get here. 
- */ -static int -sysctl_leak_schedlock(SYSCTL_HANDLER_ARGS) -{ - int error, temp; - - temp = 0; - error = sysctl_handle_int(oidp, &temp, 0, req); - if (error) - return (error); - - if (req->newptr != NULL) { - if (temp) { - printf("Leaking the sched lock...\n"); - mtx_lock_spin(&sched_lock); - while (1); - } - } - return (0); -} -SYSCTL_PROC(_debug, OID_AUTO, leak_schedlock, CTLTYPE_INT|CTLFLAG_RW, 0, 0, - sysctl_leak_schedlock, "IU", ""); - -/* * Drop into the debugger by sending an IPI NMI to the boot processor. */ static void Index: amd64/amd64/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/amd64/vm_machdep.c,v retrieving revision 1.254 diff -u -p -r1.254 vm_machdep.c --- amd64/amd64/vm_machdep.c 24 Apr 2007 21:17:45 -0000 1.254 +++ amd64/amd64/vm_machdep.c 31 May 2007 21:23:21 -0000 @@ -170,7 +170,7 @@ cpu_fork(td1, p2, td2, flags) * pcb2->pcb_[fg]sbase: cloned above */ - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; @@ -304,7 +304,7 @@ cpu_set_upcall(struct thread *td, struct * pcb2->pcb_[fg]sbase: cloned above */ - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = PSL_KERNEL | PSL_I; } Index: amd64/linux32/linux32_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/amd64/linux32/linux32_machdep.c,v retrieving revision 1.43 diff -u -p -r1.43 linux32_machdep.c --- amd64/linux32/linux32_machdep.c 11 May 2007 01:25:50 -0000 1.43 +++ amd64/linux32/linux32_machdep.c 18 May 2007 10:37:01 -0000 @@ -486,10 +486,10 @@ linux_fork(struct thread *td, struct lin /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); return (0); } @@ -529,10 +529,10 @@ linux_vfork(struct thread *td, struct li /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); /* wait for the children to exit, ie. emulate vfork */ PROC_LOCK(p2); @@ -715,10 +715,10 @@ linux_clone(struct thread *td, struct li /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; Index: arm/arm/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/arm/arm/vm_machdep.c,v retrieving revision 1.31 diff -u -p -r1.31 vm_machdep.c --- arm/arm/vm_machdep.c 23 May 2007 13:19:00 -0000 1.31 +++ arm/arm/vm_machdep.c 31 May 2007 21:23:48 -0000 @@ -143,7 +143,7 @@ cpu_fork(register struct thread *td1, re tf->tf_r1 = 0; pcb2->un_32.pcb32_sp = (u_int)sf; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). 
*/ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_cspr = 0; td2->td_md.md_tp = *(uint32_t **)ARM_TP_ADDRESS; @@ -288,7 +288,7 @@ cpu_set_upcall(struct thread *td, struct td->td_pcb->un_32.pcb32_sp = (u_int)sf; td->td_pcb->un_32.pcb32_und_sp = td->td_kstack + USPACE_UNDEF_STACK_TOP; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_cspr = 0; } Index: compat/linprocfs/linprocfs.c =================================================================== RCS file: /usr/home/ncvs/src/sys/compat/linprocfs/linprocfs.c,v retrieving revision 1.114 diff -u -p -r1.114 linprocfs.c --- compat/linprocfs/linprocfs.c 31 May 2007 22:52:11 -0000 1.114 +++ compat/linprocfs/linprocfs.c 31 May 2007 20:40:13 -0000 @@ -636,7 +636,7 @@ linprocfs_doprocstatus(PFS_FILL_ARGS) if (P_SHOULDSTOP(p)) { state = "T (stopped)"; } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); switch(p->p_state) { case PRS_NEW: state = "I (idle)"; @@ -666,7 +666,7 @@ linprocfs_doprocstatus(PFS_FILL_ARGS) state = "? (unknown)"; break; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } fill_kinfo_proc(p, &kp); Index: compat/ndis/subr_ntoskrnl.c =================================================================== RCS file: /usr/home/ncvs/src/sys/compat/ndis/subr_ntoskrnl.c,v retrieving revision 1.88 diff -u -p -r1.88 subr_ntoskrnl.c --- compat/ndis/subr_ntoskrnl.c 25 Dec 2006 17:04:41 -0000 1.88 +++ compat/ndis/subr_ntoskrnl.c 18 May 2007 10:37:01 -0000 @@ -3824,7 +3824,7 @@ ntoskrnl_dpc_thread(arg) * once scheduled by an ISR. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); #ifdef NTOSKRNL_MULTIPLE_DPCS #if __FreeBSD_version >= 502102 sched_bind(curthread, kq->kq_cpu); @@ -3834,7 +3834,7 @@ ntoskrnl_dpc_thread(arg) #if __FreeBSD_version < 600000 curthread->td_base_pri = PRI_MIN_KERN; #endif - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); while (1) { KeWaitForSingleObject(&kq->kq_proc, 0, 0, TRUE, NULL); Index: compat/svr4/svr4_misc.c =================================================================== RCS file: /usr/home/ncvs/src/sys/compat/svr4/svr4_misc.c,v retrieving revision 1.93 diff -u -p -r1.93 svr4_misc.c --- compat/svr4/svr4_misc.c 31 May 2007 22:52:11 -0000 1.93 +++ compat/svr4/svr4_misc.c 31 May 2007 20:40:13 -0000 @@ -1253,12 +1253,12 @@ loop: * See if we have a stopped or continued process. * XXX: This duplicates the same code in kern_wait(). 
*/ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount == p->p_numthreads) && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || uap->options & SVR4_WSTOPPED)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (((uap->options & SVR4_WNOWAIT)) == 0) p->p_flag |= P_WAITED; sx_sunlock(&proctree_lock); @@ -1278,7 +1278,7 @@ loop: DPRINTF(("jobcontrol %d\n", pid)); return (svr4_setinfo(pid, &ru, status, uap->info)); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (uap->options & SVR4_WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_sunlock(&proctree_lock); Index: conf/files =================================================================== RCS file: /usr/home/ncvs/src/sys/conf/files,v retrieving revision 1.1213 diff -u -p -r1.1213 files --- conf/files 31 May 2007 19:47:39 -0000 1.1213 +++ conf/files 31 May 2007 20:40:13 -0000 @@ -1430,6 +1430,7 @@ kern/posix4_mib.c standard kern/sched_4bsd.c optional sched_4bsd kern/sched_core.c optional sched_core kern/sched_ule.c optional sched_ule +kern/sched_smp.c optional sched_smp kern/serdev_if.m standard kern/subr_acl_posix1e.c standard kern/subr_autoconf.c standard Index: conf/options =================================================================== RCS file: /usr/home/ncvs/src/sys/conf/options,v retrieving revision 1.589 diff -u -p -r1.589 options --- conf/options 30 May 2007 17:39:44 -0000 1.589 +++ conf/options 31 May 2007 20:40:13 -0000 @@ -137,6 +137,7 @@ QUOTA SCHED_4BSD opt_sched.h SCHED_CORE opt_sched.h SCHED_ULE opt_sched.h +SCHED_SMP opt_sched.h SHOW_BUSYBUFS SLEEPQUEUE_PROFILING SLHCI_DEBUG opt_slhci.h Index: dev/hwpmc/hwpmc_mod.c =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/hwpmc/hwpmc_mod.c,v retrieving revision 1.28 diff -u -p -r1.28 hwpmc_mod.c --- dev/hwpmc/hwpmc_mod.c 19 Apr 2007 08:02:51 -0000 1.28 +++ dev/hwpmc/hwpmc_mod.c 18 May 2007 10:37:01 -0000 @@ -591,10 +591,10 @@ static void pmc_save_cpu_binding(struct pmc_binding *pb) { PMCDBG(CPU,BND,2, "%s", "save-cpu"); - mtx_lock_spin(&sched_lock); + thread_lock(curthread); pb->pb_bound = sched_is_bound(curthread); pb->pb_cpu = curthread->td_oncpu; - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); PMCDBG(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu); } @@ -607,12 +607,12 @@ pmc_restore_cpu_binding(struct pmc_bindi { PMCDBG(CPU,BND,2, "restore-cpu curcpu=%d restore=%d", curthread->td_oncpu, pb->pb_cpu); - mtx_lock_spin(&sched_lock); + thread_lock(curthread); if (pb->pb_bound) sched_bind(curthread, pb->pb_cpu); else sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); PMCDBG(CPU,BND,2, "%s", "restore-cpu done"); } @@ -631,9 +631,9 @@ pmc_select_cpu(int cpu) "disabled CPU %d", __LINE__, cpu)); PMCDBG(CPU,SEL,2, "select-cpu cpu=%d", cpu); - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); KASSERT(curthread->td_oncpu == cpu, ("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__, Index: dev/md/md.c =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/md/md.c,v retrieving revision 1.168 diff -u -p -r1.168 md.c --- dev/md/md.c 31 May 2007 11:51:49 -0000 1.168 +++ dev/md/md.c 31 May 2007 20:40:15 -0000 @@ -690,9 +690,9 @@ md_kthread(void *arg) int error; sc = arg; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + 
thread_unlock(curthread); if (sc->type == MD_VNODE) curthread->td_pflags |= TDP_NORUNNINGBUF; Index: dev/syscons/syscons.c =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/syscons/syscons.c,v retrieving revision 1.450 diff -u -p -r1.450 syscons.c --- dev/syscons/syscons.c 16 Nov 2006 12:27:51 -0000 1.450 +++ dev/syscons/syscons.c 31 May 2007 21:24:31 -0000 @@ -2326,8 +2326,8 @@ sc_switch_scr(sc_softc_t *sc, u_int next if (sc->new_scp == sc->old_scp) { sc->switch_in_progress = 0; /* - * XXX wakeup() calls mtx_lock(&sched_lock) which will hang if - * sched_lock is in an in-between state, e.g., when we stop at + * XXX wakeup() locks the scheduler lock which will hang if + * the lock is in an in-between state, e.g., when we stop at * a breakpoint at fork_exit. It has always been wrong to call * wakeup() when the debugger is active. In RELENG_4, wakeup() * is supposed to be locked by splhigh(), but the debugger may Index: dev/syscons/syscons.h =================================================================== RCS file: /usr/home/ncvs/src/sys/dev/syscons/syscons.h,v retrieving revision 1.87 diff -u -p -r1.87 syscons.h --- dev/syscons/syscons.h 13 Sep 2006 15:48:15 -0000 1.87 +++ dev/syscons/syscons.h 18 May 2007 10:37:01 -0000 @@ -536,7 +536,7 @@ typedef struct { (*kbdsw[(kbd)->kb_index]->poll)((kbd), (on)) #define SC_VIDEO_LOCKINIT(sc) \ - mtx_init(&(sc)->video_mtx, "syscons video lock", NULL,MTX_SPIN); + mtx_init(&(sc)->video_mtx, "syscons video lock", NULL,MTX_QUIET|MTX_SPIN); #define SC_VIDEO_LOCK(sc) \ do { \ if (!cold) \ Index: fs/procfs/procfs_ctl.c =================================================================== RCS file: /usr/home/ncvs/src/sys/fs/procfs/procfs_ctl.c,v retrieving revision 1.55 diff -u -p -r1.55 procfs_ctl.c --- fs/procfs/procfs_ctl.c 22 Feb 2006 17:20:37 -0000 1.55 +++ fs/procfs/procfs_ctl.c 18 May 2007 10:37:01 -0000 @@ -286,9 +286,9 @@ out: panic("procfs_control"); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_unsuspend(p); /* If it can run, let it do so. 
*/ - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (0); } @@ -344,9 +344,9 @@ procfs_doprocctl(PFS_FILL_ARGS) #endif /* XXXKSE: */ p->p_flag &= ~P_STOPPED_SIG; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } else psignal(p, nm->nm_val); PROC_UNLOCK(p); Index: fs/procfs/procfs_ioctl.c =================================================================== RCS file: /usr/home/ncvs/src/sys/fs/procfs/procfs_ioctl.c,v retrieving revision 1.17 diff -u -p -r1.17 procfs_ioctl.c --- fs/procfs/procfs_ioctl.c 1 May 2007 12:59:20 -0000 1.17 +++ fs/procfs/procfs_ioctl.c 18 May 2007 10:37:01 -0000 @@ -185,9 +185,9 @@ procfs_ioctl(PFS_IOCTL_ARGS) if (P_SHOULDSTOP(p)) { p->p_xstat = sig; p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } else if (sig) psignal(p, sig); #else Index: fs/procfs/procfs_status.c =================================================================== RCS file: /usr/home/ncvs/src/sys/fs/procfs/procfs_status.c,v retrieving revision 1.59 diff -u -p -r1.59 procfs_status.c --- fs/procfs/procfs_status.c 6 Dec 2006 06:34:54 -0000 1.59 +++ fs/procfs/procfs_status.c 18 May 2007 10:37:01 -0000 @@ -112,7 +112,7 @@ procfs_doprocstatus(PFS_FILL_ARGS) sbuf_printf(sb, "noflags"); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); #ifdef KSE if (p->p_flag & P_SA) wmesg = "-kse- "; @@ -127,7 +127,7 @@ procfs_doprocstatus(PFS_FILL_ARGS) } else wmesg = "nochan"; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (p->p_sflag & PS_INMEM) { struct timeval start, ut, st; Index: geom/geom_kern.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/geom_kern.c,v retrieving revision 1.40 diff -u -p -r1.40 geom_kern.c --- geom/geom_kern.c 25 Nov 2005 10:09:30 -0000 1.40 +++ geom/geom_kern.c 18 May 2007 10:37:01 -0000 @@ -88,9 +88,9 @@ g_up_procbody(void) struct thread *tp = FIRST_THREAD_IN_PROC(p); mtx_assert(&Giant, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); + thread_lock(tp); sched_prio(tp, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(tp); for(;;) { g_io_schedule_up(tp); } @@ -111,9 +111,9 @@ g_down_procbody(void) struct thread *tp = FIRST_THREAD_IN_PROC(p); mtx_assert(&Giant, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); + thread_lock(tp); sched_prio(tp, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(tp); for(;;) { g_io_schedule_down(tp); } @@ -134,9 +134,9 @@ g_event_procbody(void) struct thread *tp = FIRST_THREAD_IN_PROC(p); mtx_assert(&Giant, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); + thread_lock(tp); sched_prio(tp, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(tp); for(;;) { g_run_events(); tsleep(&g_wait_event, PRIBIO, "-", hz/10); Index: geom/eli/g_eli.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/eli/g_eli.c,v retrieving revision 1.37 diff -u -p -r1.37 g_eli.c --- geom/eli/g_eli.c 8 Apr 2007 23:54:23 -0000 1.37 +++ geom/eli/g_eli.c 18 May 2007 10:37:01 -0000 @@ -332,11 +332,11 @@ g_eli_worker(void *arg) tsleep(wr, 0, "geli:smp", hz / 4); } #endif - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); if (sc->sc_crypto == G_ELI_CRYPTO_SW && g_eli_threads == 0) sched_bind(curthread, wr->w_number); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); G_ELI_DEBUG(1, "Thread %s started.", curthread->td_proc->p_comm); Index: 
geom/journal/g_journal.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/journal/g_journal.c,v retrieving revision 1.11 diff -u -p -r1.11 g_journal.c --- geom/journal/g_journal.c 6 Apr 2007 12:53:54 -0000 1.11 +++ geom/journal/g_journal.c 18 May 2007 10:37:01 -0000 @@ -2057,9 +2057,9 @@ g_journal_worker(void *arg) time_t last_write; int type; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); sc = arg; type = 0; /* gcc */ Index: geom/mirror/g_mirror.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/mirror/g_mirror.c,v retrieving revision 1.92 diff -u -p -r1.92 g_mirror.c --- geom/mirror/g_mirror.c 1 Nov 2006 22:51:49 -0000 1.92 +++ geom/mirror/g_mirror.c 18 May 2007 10:37:01 -0000 @@ -1768,9 +1768,9 @@ g_mirror_worker(void *arg) int timeout; sc = arg; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { Index: geom/raid3/g_raid3.c =================================================================== RCS file: /usr/home/ncvs/src/sys/geom/raid3/g_raid3.c,v retrieving revision 1.80 diff -u -p -r1.80 g_raid3.c --- geom/raid3/g_raid3.c 1 Nov 2006 22:51:49 -0000 1.80 +++ geom/raid3/g_raid3.c 18 May 2007 10:37:01 -0000 @@ -2017,9 +2017,9 @@ g_raid3_worker(void *arg) int timeout; sc = arg; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, PRIBIO); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { Index: i386/i386/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/i386/machdep.c,v retrieving revision 1.654 diff -u -p -r1.654 machdep.c --- i386/i386/machdep.c 31 May 2007 22:52:11 -0000 1.654 +++ i386/i386/machdep.c 31 May 2007 20:40:16 -0000 @@ -1058,9 +1058,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * #ifdef SMP /* Schedule ourselves on the indicated cpu. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu_id); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. */ @@ -1071,9 +1071,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * intr_restore(reg); #ifdef SMP - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Index: i386/i386/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/i386/mp_machdep.c,v retrieving revision 1.279 diff -u -p -r1.279 mp_machdep.c --- i386/i386/mp_machdep.c 20 May 2007 22:03:57 -0000 1.279 +++ i386/i386/mp_machdep.c 23 May 2007 15:29:15 -0000 @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD: src/sys/i386/i386/mp #include #include #include +#include #include #include @@ -642,25 +643,8 @@ init_secondary(void) while (smp_started == 0) ia32_pause(); - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - - /* - * Correct spinlock nesting. The idle thread context that we are - * borrowing was created so that it would start out with a single - * spin lock (sched_lock) held in fork_trampoline(). Since we've - * explicitly acquired locks in this function, the nesting count - * is now 2 rather than 1. 
Since we are nested, calling - * spinlock_exit() will simply adjust the counts without allowing - * spin lock using code to interrupt us. - */ - spinlock_exit(); - KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); - - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - - cpu_throw(NULL, choosethread()); /* doesn't return */ + /* enter the scheduler */ + sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ @@ -1194,12 +1178,12 @@ ipi_bitmap_handler(struct trapframe fram #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif - mtx_lock_spin(&sched_lock); + thread_lock(running_thread); if (running_thread->td_critnest > 1) running_thread->td_owepreempt = 1; else mi_switch(SW_INVOL | SW_PREEMPT, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(running_thread); } if (ipi_bitmap & (1 << IPI_AST)) { Index: i386/i386/mp_watchdog.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/i386/mp_watchdog.c,v retrieving revision 1.4 diff -u -p -r1.4 mp_watchdog.c --- i386/i386/mp_watchdog.c 27 Feb 2005 22:34:07 -0000 1.4 +++ i386/i386/mp_watchdog.c 31 May 2007 21:22:34 -0000 @@ -105,9 +105,7 @@ watchdog_function(void *arg) * locks to make sure. Then reset the timer. */ mtx_lock(&Giant); - mtx_lock_spin(&sched_lock); watchdog_timer = WATCHDOG_THRESHOLD; - mtx_unlock_spin(&sched_lock); mtx_unlock(&Giant); callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL); } @@ -156,34 +154,6 @@ SYSCTL_PROC(_debug, OID_AUTO, watchdog, sysctl_watchdog, "I", ""); /* - * A badly behaved sysctl that leaks the sched lock when written to. Then - * spin holding it just to make matters worse. This can be used to test the - * effectiveness of the watchdog by generating a fairly hard and nast hang. - * Note that Giant is also held in the current world order when we get here. - */ -static int -sysctl_leak_schedlock(SYSCTL_HANDLER_ARGS) -{ - int error, temp; - - temp = 0; - error = sysctl_handle_int(oidp, &temp, 0, req); - if (error) - return (error); - - if (req->newptr != NULL) { - if (temp) { - printf("Leaking the sched lock...\n"); - mtx_lock_spin(&sched_lock); - while (1); - } - } - return (0); -} -SYSCTL_PROC(_debug, OID_AUTO, leak_schedlock, CTLTYPE_INT|CTLFLAG_RW, 0, 0, - sysctl_leak_schedlock, "IU", ""); - -/* * Drop into the debugger by sending an IPI NMI to the boot processor. */ static void Index: i386/i386/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/i386/vm_machdep.c,v retrieving revision 1.281 diff -u -p -r1.281 vm_machdep.c --- i386/i386/vm_machdep.c 29 May 2007 18:55:41 -0000 1.281 +++ i386/i386/vm_machdep.c 31 May 2007 21:23:06 -0000 @@ -264,7 +264,7 @@ cpu_fork(td1, p2, td2, flags) } mtx_unlock_spin(&dt_lock); - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; @@ -438,7 +438,7 @@ cpu_set_upcall(struct thread *td, struct */ pcb2->pcb_ext = NULL; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). 
*/ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = PSL_KERNEL | PSL_I; } Index: i386/isa/npx.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/isa/npx.c,v retrieving revision 1.171 diff -u -p -r1.171 npx.c --- i386/isa/npx.c 23 Feb 2007 12:19:00 -0000 1.171 +++ i386/isa/npx.c 18 May 2007 10:37:01 -0000 @@ -230,9 +230,9 @@ npx_intr(dummy) td = PCPU_GET(fpcurthread); if (td != NULL) { td->td_pcb->pcb_flags |= PCB_NPXTRAP; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } return (FILTER_HANDLED); } Index: i386/linux/linux_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/i386/linux/linux_machdep.c,v retrieving revision 1.75 diff -u -p -r1.75 linux_machdep.c --- i386/linux/linux_machdep.c 11 May 2007 01:25:51 -0000 1.75 +++ i386/linux/linux_machdep.c 18 May 2007 10:37:01 -0000 @@ -325,10 +325,10 @@ linux_fork(struct thread *td, struct lin /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); return (0); } @@ -368,10 +368,10 @@ linux_vfork(struct thread *td, struct li /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); /* wait for the children to exit, ie. emulate vfork */ PROC_LOCK(p2); @@ -569,10 +569,10 @@ linux_clone(struct thread *td, struct li /* * Make this runnable after we are finished with it. */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; Index: ia64/ia64/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ia64/ia64/machdep.c,v retrieving revision 1.217 diff -u -p -r1.217 machdep.c --- ia64/ia64/machdep.c 31 May 2007 22:52:12 -0000 1.217 +++ ia64/ia64/machdep.c 31 May 2007 22:31:00 -0000 @@ -356,7 +356,7 @@ cpu_reset() } void -cpu_switch(struct thread *old, struct thread *new) +cpu_switch(struct thread *old, struct thread *new, struct mtx *mtx) { struct pcb *oldpcb, *newpcb; Index: ia64/ia64/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ia64/ia64/mp_machdep.c,v retrieving revision 1.62 diff -u -p -r1.62 mp_machdep.c --- ia64/ia64/mp_machdep.c 18 Nov 2006 21:52:26 -0000 1.62 +++ ia64/ia64/mp_machdep.c 23 May 2007 20:17:13 -0000 @@ -111,16 +111,6 @@ ia64_ap_startup(void) PCPU_SET(curthread, PCPU_GET(idlethread)); /* - * Correct spinlock nesting. The idle thread context that we are - * borrowing was created so that it would start out with a single - * spin lock (sched_lock) held in fork_trampoline(). Since we - * don't have any locks and explicitly acquire locks when we need - * to, the nesting count will be off by 1. - */ - curthread->td_md.md_spinlock_count = 0; - critical_exit(); - - /* * Get and save the CPU specific MCA records. Should we get the * MCA state for each processor, or just the CMC state? 
*/ @@ -133,17 +123,12 @@ ia64_ap_startup(void) CTR1(KTR_SMP, "SMP: cpu%d launched", PCPU_GET(cpuid)); - mtx_lock_spin(&sched_lock); - - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - ia64_set_tpr(0); /* kick off the clock on this AP */ pcpu_initclock(); - cpu_throw(NULL, choosethread()); + sched_throw(NULL); /* NOTREACHED */ } Index: ia64/ia64/pmap.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ia64/ia64/pmap.c,v retrieving revision 1.187 diff -u -p -r1.187 pmap.c --- ia64/ia64/pmap.c 31 May 2007 22:52:12 -0000 1.187 +++ ia64/ia64/pmap.c 31 May 2007 20:40:16 -0000 @@ -2235,8 +2235,7 @@ pmap_switch(pmap_t pm) pmap_t prevpm; int i; - mtx_assert(&sched_lock, MA_OWNED); - + THREAD_LOCK_ASSERT(curthread, MA_OWNED); prevpm = PCPU_GET(current_pmap); if (prevpm == pm) return (prevpm); @@ -2263,10 +2262,13 @@ static pmap_t pmap_install(pmap_t pm) { pmap_t prevpm; + struct thread *td; - mtx_lock_spin(&sched_lock); + td = curthread; + thread_lock(td); prevpm = pmap_switch(pm); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + return (prevpm); } Index: ia64/ia64/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ia64/ia64/vm_machdep.c,v retrieving revision 1.93 diff -u -p -r1.93 vm_machdep.c --- ia64/ia64/vm_machdep.c 16 May 2006 14:32:15 -0000 1.93 +++ ia64/ia64/vm_machdep.c 31 May 2007 21:24:52 -0000 @@ -159,7 +159,7 @@ cpu_set_upcall(struct thread *td, struct pcb->pcb_special.rp = FDESC_FUNC(fork_trampoline); cpu_set_fork_handler(td, (void (*)(void*))fork_return, td); - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release the spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_intr = 1; } @@ -284,7 +284,7 @@ cpu_fork(struct thread *td1, struct proc td2->td_pcb->pcb_special.rp = FDESC_FUNC(fork_trampoline); cpu_set_fork_handler(td2, (void (*)(void*))fork_return, td2); - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release the spin count in fork_exit(). 
*/ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_intr = 1; } Index: kern/init_main.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/init_main.c,v retrieving revision 1.277 diff -u -p -r1.277 init_main.c --- kern/init_main.c 1 Jun 2007 01:12:43 -0000 1.277 +++ kern/init_main.c 31 May 2007 20:40:16 -0000 @@ -713,9 +713,9 @@ create_init(const void *udata __unused) PROC_UNLOCK(initproc); crfree(oldcred); cred_update_thread(FIRST_THREAD_IN_PROC(initproc)); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(initproc); initproc->p_sflag |= PS_INMEM; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(initproc); cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) @@ -729,9 +729,9 @@ kick_init(const void *udata __unused) struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); - mtx_lock_spin(&sched_lock); + thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) Index: kern/kern_acct.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_acct.c,v retrieving revision 1.91 diff -u -p -r1.91 kern_acct.c --- kern/kern_acct.c 1 Jun 2007 01:12:43 -0000 1.91 +++ kern/kern_acct.c 31 May 2007 20:40:16 -0000 @@ -612,9 +612,9 @@ acct_thread(void *dummy) /* This is a low-priority kernel thread. */ pri = PRI_MAX_KERN; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_prio(curthread, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); /* If another accounting kthread is already running, just die. */ sx_xlock(&acct_sx); Index: kern/kern_clock.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_clock.c,v retrieving revision 1.199 diff -u -p -r1.199 kern_clock.c --- kern/kern_clock.c 1 Jun 2007 01:12:43 -0000 1.199 +++ kern/kern_clock.c 31 May 2007 21:59:43 -0000 @@ -201,32 +201,35 @@ hardclock_cpu(int usermode) struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; + int ast; /* * Run current process's virtual and profile time, as needed. */ - mtx_lock_spin_flags(&sched_lock, MTX_QUIET); - sched_tick(); -#ifdef KSE -#if 0 /* for now do nothing */ - if (p->p_flag & P_SA) { - /* XXXKSE What to do? Should do more. 
*/ - } -#endif -#endif pstats = p->p_stats; + ast = 0; if (usermode && - timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && - itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { - p->p_sflag |= PS_ALRMPEND; - td->td_flags |= TDF_ASTPENDING; + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) { + PROC_SLOCK(p); + if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { + p->p_sflag |= PS_ALRMPEND; + ast = 1; + } + PROC_SUNLOCK(p); } - if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && - itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { - p->p_sflag |= PS_PROFPEND; - td->td_flags |= TDF_ASTPENDING; + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) { + PROC_SLOCK(p); + if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { + p->p_sflag |= PS_PROFPEND; + ast = 1; + } + PROC_SUNLOCK(p); } - mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); + thread_lock(td); + sched_tick(); + if (ast) + td->td_flags |= TDF_ASTPENDING; + thread_unlock(td); #ifdef HWPMC_HOOKS if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid))) @@ -272,8 +275,8 @@ hardclock(int usermode, uintfptr_t pc) mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); /* - * swi_sched acquires sched_lock, so we don't want to call it with - * callout_lock held; incorrect locking order. + * swi_sched acquires the thread lock, so we don't want to call it + * with callout_lock held; incorrect locking order. */ if (need_softclock) swi_sched(softclock_ih, 0); @@ -411,6 +414,7 @@ statclock(int usermode) td = curthread; p = td->td_proc; + thread_lock_flags(td, MTX_QUIET); if (usermode) { /* * Charge the time as appropriate. @@ -420,11 +424,10 @@ statclock(int usermode) thread_statclock(1); #endif td->td_uticks++; - mtx_lock_spin_flags(&time_lock, MTX_QUIET); if (p->p_nice > NZERO) - cp_time[CP_NICE]++; + atomic_add_long(&cp_time[CP_NICE], 1); else - cp_time[CP_USER]++; + atomic_add_long(&cp_time[CP_USER], 1); } else { /* * Came from kernel mode, so we were: @@ -441,8 +444,7 @@ statclock(int usermode) if ((td->td_pflags & TDP_ITHREAD) || td->td_intr_nesting_level >= 2) { td->td_iticks++; - mtx_lock_spin_flags(&time_lock, MTX_QUIET); - cp_time[CP_INTR]++; + atomic_add_long(&cp_time[CP_INTR], 1); } else { #ifdef KSE if (p->p_flag & P_SA) @@ -450,19 +452,12 @@ statclock(int usermode) #endif td->td_pticks++; td->td_sticks++; - mtx_lock_spin_flags(&time_lock, MTX_QUIET); if (!TD_IS_IDLETHREAD(td)) - cp_time[CP_SYS]++; + atomic_add_long(&cp_time[CP_SYS], 1); else - cp_time[CP_IDLE]++; + atomic_add_long(&cp_time[CP_IDLE], 1); } } - mtx_unlock_spin_flags(&time_lock, MTX_QUIET); - CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d", - td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz); - - mtx_lock_spin_flags(&sched_lock, MTX_QUIET); - sched_clock(td); /* Update resource usage integrals and maximums. 
*/ MPASS(p->p_vmspace != NULL); @@ -474,7 +469,10 @@ statclock(int usermode) rss = pgtok(vmspace_resident_count(vm)); if (ru->ru_maxrss < rss) ru->ru_maxrss = rss; - mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); + CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d", + td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz); + sched_clock(td); + thread_unlock(td); } void Index: kern/kern_condvar.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_condvar.c,v retrieving revision 1.61 diff -u -p -r1.61 kern_condvar.c --- kern/kern_condvar.c 8 May 2007 21:49:59 -0000 1.61 +++ kern/kern_condvar.c 18 May 2007 10:37:01 -0000 @@ -394,8 +394,8 @@ cv_signal(struct cv *cvp) if (cvp->cv_waiters > 0) { cvp->cv_waiters--; sleepq_signal(cvp, SLEEPQ_CONDVAR, -1, 0); - } else - sleepq_release(cvp); + } + sleepq_release(cvp); } /* Index: kern/kern_cpu.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_cpu.c,v retrieving revision 1.24 diff -u -p -r1.24 kern_cpu.c --- kern/kern_cpu.c 26 Mar 2007 18:03:29 -0000 1.24 +++ kern/kern_cpu.c 18 May 2007 10:37:01 -0000 @@ -298,17 +298,17 @@ cf_set_method(device_t dev, const struct cpu_id = PCPU_GET(cpuid); pc = cpu_get_pcpu(set->dev); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq, device_get_nameunit(set->dev), PCPU_GET(cpuid)); error = CPUFREQ_DRV_SET(set->dev, set); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } if (error) { goto out; @@ -327,17 +327,17 @@ cf_set_method(device_t dev, const struct cpu_id = PCPU_GET(cpuid); pc = cpu_get_pcpu(set->dev); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq, device_get_nameunit(set->dev), PCPU_GET(cpuid)); error = CPUFREQ_DRV_SET(set->dev, set); if (cpu_id != pc->pc_cpuid) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } if (error) { /* XXX Back out any successful setting? */ Index: kern/kern_exit.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_exit.c,v retrieving revision 1.299 diff -u -p -r1.299 kern_exit.c --- kern/kern_exit.c 1 Jun 2007 01:12:43 -0000 1.299 +++ kern/kern_exit.c 31 May 2007 21:02:00 -0000 @@ -523,12 +523,13 @@ retry: * proc lock. */ wakeup(p->p_pptr); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p->p_pptr); + sched_exit(p->p_pptr, td); + PROC_SUNLOCK(p->p_pptr); + PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); - sched_exit(p->p_pptr, td); - /* * Hopefully no one will try to deliver a signal to the process this * late in the game. @@ -718,12 +719,13 @@ loop: * in thread_exit() after having dropped the process * lock via PROC_UNLOCK() but before it has completed * cpu_throw(). 
In that case, the other thread must - * still hold sched_lock, so simply by acquiring - * sched_lock once we will wait long enough for the + * still hold the proc slock, so simply by acquiring + * proc slock once we will wait long enough for the * thread to exit in that case. + * XXX This is questionable. */ - mtx_lock_spin(&sched_lock); - mtx_unlock_spin(&sched_lock); + PROC_SLOCK(p); + PROC_SUNLOCK(p); td->td_retval[0] = p->p_pid; if (status) @@ -820,12 +822,12 @@ loop: sx_xunlock(&allproc_lock); return (0); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount == p->p_numthreads) && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || options & WUNTRACED)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; @@ -839,7 +841,7 @@ loop: return (0); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; Index: kern/kern_fork.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_fork.c,v retrieving revision 1.274 diff -u -p -r1.274 kern_fork.c --- kern/kern_fork.c 1 Jun 2007 01:12:43 -0000 1.274 +++ kern/kern_fork.c 31 May 2007 21:44:45 -0000 @@ -407,8 +407,15 @@ again: lastpid = trypid; p2 = newproc; + td2 = FIRST_THREAD_IN_PROC(newproc); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; + /* + * Allow the scheduler to initialize the child. + */ + thread_lock(td); + sched_fork(td, td2); + thread_unlock(td); AUDIT_ARG(pid, p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); @@ -476,8 +483,6 @@ again: * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ - td2 = FIRST_THREAD_IN_PROC(p2); - /* Allocate and switch to an alternate kstack if specified. */ if (pages != 0) vm_thread_new_altkstack(td2, pages); @@ -501,15 +506,9 @@ again: p2->p_flag = 0; if (p1->p_flag & P_PROFIL) startprofclock(p2); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p2); p2->p_sflag = PS_INMEM; - /* - * Allow the scheduler to adjust the priority of the child and - * parent while we hold the sched_lock. - */ - sched_fork(td, td2); - - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p2); td2->td_ucred = crhold(p2->p_ucred); #ifdef AUDIT audit_proc_fork(p1, p2); @@ -693,18 +692,20 @@ again: * Set the child start time and mark the process as being complete. */ microuptime(&p2->p_stats->p_start); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; + PROC_SUNLOCK(p2); /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ if ((flags & RFSTOPPED) == 0) { + thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); + thread_unlock(td2); } - mtx_unlock_spin(&sched_lock); /* * Now can be swapped. @@ -778,31 +779,14 @@ fork_exit(callout, arg, frame) struct proc *p; struct thread *td; - /* - * Finish setting up thread glue so that it begins execution in a - * non-nested critical section with sched_lock held but not recursed. 
- */ td = curthread; p = td->td_proc; - td->td_oncpu = PCPU_GET(cpuid); KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); - sched_lock.mtx_lock = (uintptr_t)td; - mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)", td, td->td_sched, p->p_pid, p->p_comm); - /* - * Processes normally resume in mi_switch() after being - * cpu_switch()'ed to, but when children start up they arrive here - * instead, so we must do much the same things as mi_switch() would. - */ - if ((td = PCPU_GET(deadthread))) { - PCPU_SET(deadthread, NULL); - thread_stash(td); - } - mtx_unlock_spin(&sched_lock); - + sched_fork_exit(td); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. Index: kern/kern_idle.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_idle.c,v retrieving revision 1.47 diff -u -p -r1.47 kern_idle.c --- kern/kern_idle.c 23 Jan 2007 08:46:50 -0000 1.47 +++ kern/kern_idle.c 18 May 2007 10:37:01 -0000 @@ -73,13 +73,13 @@ idle_setup(void *dummy) PROC_LOCK(p); p->p_flag |= P_NOLOAD; - mtx_lock_spin(&sched_lock); td = FIRST_THREAD_IN_PROC(p); + thread_lock(td); TD_SET_CAN_RUN(td); td->td_flags |= TDF_IDLETD; sched_class(td, PRI_IDLE); sched_prio(td, PRI_MAX_IDLE); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_UNLOCK(p); #ifdef SMP } Index: kern/kern_intr.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_intr.c,v retrieving revision 1.145 diff -u -p -r1.145 kern_intr.c --- kern/kern_intr.c 31 May 2007 19:25:33 -0000 1.145 +++ kern/kern_intr.c 31 May 2007 21:02:57 -0000 @@ -173,9 +173,9 @@ ithread_update(struct intr_thread *ithd) /* Update name and priority. */ strlcpy(td->td_proc->p_comm, ie->ie_fullname, sizeof(td->td_proc->p_comm)); - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -342,10 +342,10 @@ ithread_create(const char *name) if (error) panic("kthread_create() failed with %d", error); td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); @@ -367,10 +367,10 @@ ithread_create(const char *name, struct if (error) panic("kthread_create() failed with %d", error); td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); @@ -385,13 +385,13 @@ ithread_destroy(struct intr_thread *ithr CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name); td = ithread->it_thread; - mtx_lock_spin(&sched_lock); + thread_lock(td); ithread->it_flags |= IT_DEAD; if (TD_AWAITING_INTR(td)) { TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } #ifndef INTR_FILTER @@ -622,7 +622,7 @@ ok: * so we have to remove the handler here rather than letting the * thread do it. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(ie->ie_thread->it_thread); if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; @@ -634,7 +634,7 @@ ok: ie->ie_thread->it_need = 1; } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); - mtx_unlock_spin(&sched_lock); + thread_unlock(ie->ie_thread->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); intr_event_update(ie); @@ -699,11 +699,11 @@ intr_event_schedule_thread(struct intr_e /* * Set it_need to tell the thread to keep running if it is already - * running. Then, grab sched_lock and see if we actually need to - * put this thread on the runqueue. + * running. Then, lock the thread and see if we actually need to + * put it on the runqueue. */ it->it_need = 1; - mtx_lock_spin(&sched_lock); + thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, p->p_comm); @@ -713,7 +713,7 @@ intr_event_schedule_thread(struct intr_e CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, p->p_comm, it->it_need, td->td_state); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); return (0); } @@ -771,7 +771,7 @@ ok: * so we have to remove the handler here rather than letting the * thread do it. */ - mtx_lock_spin(&sched_lock); + thread_lock(it->it_thread); if (!TD_AWAITING_INTR(it->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; @@ -783,7 +783,7 @@ ok: it->it_need = 1; } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); - mtx_unlock_spin(&sched_lock); + thread_unlock(it->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); /* @@ -853,11 +853,11 @@ intr_event_schedule_thread(struct intr_e /* * Set it_need to tell the thread to keep running if it is already - * running. Then, grab sched_lock and see if we actually need to - * put this thread on the runqueue. + * running. Then, lock the thread and see if we actually need to + * put it on the runqueue. */ it->it_need = 1; - mtx_lock_spin(&sched_lock); + thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, p->p_comm); @@ -867,7 +867,7 @@ intr_event_schedule_thread(struct intr_e CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, p->p_comm, it->it_need, td->td_state); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); return (0); } @@ -1128,13 +1128,13 @@ ithread_loop(void *arg) * lock. This may take a while and it_need may get * set again, so we have to check it again. */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL, NULL); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } #else @@ -1202,13 +1202,13 @@ ithread_loop(void *arg) * lock. This may take a while and it_need may get * set again, so we have to check it again. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(td); if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL, NULL); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } Index: kern/kern_kse.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_kse.c,v retrieving revision 1.229 diff -u -p -r1.229 kern_kse.c --- kern/kern_kse.c 21 Mar 2007 21:20:50 -0000 1.229 +++ kern/kern_kse.c 20 May 2007 11:38:23 -0000 @@ -57,7 +57,7 @@ extern int thread_debug; extern int max_threads_per_proc; extern int max_groups_per_proc; extern int max_threads_hits; -extern struct mtx kse_zombie_lock; +extern struct mtx kse_lock; TAILQ_HEAD(, kse_upcall) zombie_upcalls = @@ -66,6 +66,9 @@ TAILQ_HEAD(, kse_upcall) zombie_upcalls static int thread_update_usr_ticks(struct thread *td); static void thread_alloc_spare(struct thread *td); +struct mtx kse_lock; +MTX_SYSINIT(kse_lock, &kse_lock, "kse lock", MTX_SPIN); + struct kse_upcall * upcall_alloc(void) { @@ -86,7 +89,7 @@ void upcall_link(struct kse_upcall *ku, struct proc *p) { - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); TAILQ_INSERT_TAIL(&p->p_upcalls, ku, ku_link); ku->ku_proc = p; } @@ -96,7 +99,7 @@ upcall_unlink(struct kse_upcall *ku) { struct proc *p = ku->ku_proc; - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); KASSERT(ku->ku_owner == NULL, ("%s: have owner", __func__)); TAILQ_REMOVE(&p->p_upcalls, ku, ku_link); upcall_stash(ku); @@ -106,7 +109,7 @@ void upcall_remove(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED); if (td->td_upcall != NULL) { /* * If we are not a bound thread then decrement the count of @@ -128,6 +131,16 @@ struct kse_switchin_args { }; #endif +#ifdef KSE +void +kse_unlink(struct thread *td) +{ + mtx_lock_spin(&kse_lock); + thread_unlink(td); + mtx_unlock_spin(&kse_lock); +} +#endif + int kse_switchin(struct thread *td, struct kse_switchin_args *uap) { @@ -160,11 +173,11 @@ kse_switchin(struct thread *td, struct k else ptrace_clear_single_step(td); if (tmbx.tm_dflags & TMDF_SUSPEND) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(td->td_proc); /* fuword can block, check again */ if (td->td_upcall) ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(td->td_proc); } _PRELE(td->td_proc); } @@ -208,23 +221,25 @@ kse_thr_interrupt(struct thread *td, str case KSE_INTR_INTERRUPT: case KSE_INTR_RESTART: PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_mailbox == uap->tmbx) break; } if (td2 == NULL) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (ESRCH); } + thread_lock(td2); + PROC_SUNLOCK(p); if (uap->cmd == KSE_INTR_SENDSIG) { if (uap->data > 0) { td2->td_flags &= ~TDF_INTERRUPT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); tdsignal(p, td2, (int)uap->data, NULL); } else { - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); } } else { td2->td_flags |= TDF_INTERRUPT | TDF_ASTPENDING; @@ -236,7 +251,7 @@ kse_thr_interrupt(struct thread *td, str td2->td_intrval = ERESTART; if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) sleepq_abort(td2, td2->td_intrval); - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); } PROC_UNLOCK(p); break; @@ -261,12 +276,14 @@ kse_thr_interrupt(struct thread *td, str if (!(flags & TMDF_SUSPEND)) break; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_stopped(p); - 
thread_suspend_one(td); PROC_UNLOCK(p); + thread_lock(td); + thread_suspend_one(td); + PROC_SUNLOCK(p); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } return (0); @@ -331,18 +348,18 @@ kse_exit(struct thread *td, struct kse_e */ count = 0; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_UPCALL_IN_PROC(p, ku2) { if ((ku2->ku_flags & KUF_EXITING) == 0) count++; } if (count == 1 && (p->p_numthreads > 1)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (EDEADLK); } ku->ku_flags |= KUF_EXITING; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); /* @@ -358,7 +375,7 @@ kse_exit(struct thread *td, struct kse_e if (error) psignal(p, SIGSEGV); sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); upcall_remove(td); if (p->p_numthreads != 1) { thread_stopped(p); @@ -376,7 +393,7 @@ kse_exit(struct thread *td, struct kse_e * The other possibility would be to let the process exit. */ thread_unthread(td); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); #if 0 return (0); @@ -458,9 +475,9 @@ kse_release(struct thread *td, struct ks PROC_UNLOCK(p); } if (ku->ku_flags & KUF_DOUPCALL) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); ku->ku_flags &= ~KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } return (0); #else /* !KSE */ @@ -486,7 +503,7 @@ kse_wakeup(struct thread *td, struct kse if (!(p->p_flag & P_SA)) return (EINVAL); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->mbx) { FOREACH_UPCALL_IN_PROC(p, ku) { if (ku->ku_mailbox == uap->mbx) @@ -494,7 +511,7 @@ kse_wakeup(struct thread *td, struct kse } } else { if (p->p_upsleeps) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); wakeup(&p->p_completed); PROC_UNLOCK(p); return (0); @@ -502,15 +519,14 @@ kse_wakeup(struct thread *td, struct kse ku = TAILQ_FIRST(&p->p_upcalls); } if (ku == NULL) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (ESRCH); } if ((td2 = ku->ku_owner) == NULL) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); panic("%s: no owner", __func__); } else if (td2->td_kflags & (TDK_KSEREL | TDK_KSERELSIG)) { - mtx_unlock_spin(&sched_lock); if (!(td2->td_kflags & TDK_WAKEUP)) { td2->td_kflags |= TDK_WAKEUP; if (td2->td_kflags & TDK_KSEREL) @@ -520,8 +536,8 @@ kse_wakeup(struct thread *td, struct kse } } else { ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); } + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (0); #else /* !KSE */ @@ -621,7 +637,7 @@ kse_create(struct thread *td, struct kse if (td->td_standin == NULL) thread_alloc_spare(td); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * If we are the first time, and a normal thread, * then transfer all the signals back to the 'process'. @@ -648,6 +664,7 @@ kse_create(struct thread *td, struct kse * Each upcall structure has an owner thread, find which * one owns it. */ + thread_lock(td); if (uap->newgroup) { /* * The newgroup parameter now means @@ -674,7 +691,8 @@ kse_create(struct thread *td, struct kse newtd = thread_schedule_upcall(td, newku); } } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); /* * Let the UTS instance know its LWPID. @@ -699,9 +717,9 @@ kse_create(struct thread *td, struct kse * If we are starting a new thread, kick it off. 
*/ if (newtd != td) { - mtx_lock_spin(&sched_lock); + thread_lock(newtd); sched_add(newtd, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(newtd); } } else { newtd->td_pflags &= ~TDP_SA; @@ -734,9 +752,9 @@ kse_create(struct thread *td, struct kse _PRELE(p); } PROC_UNLOCK(p); - mtx_lock_spin(&sched_lock); + thread_lock(newtd); sched_add(newtd, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(newtd); } } return (0); @@ -764,9 +782,9 @@ kseinit(void) void upcall_stash(struct kse_upcall *ku) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&kse_lock); TAILQ_INSERT_HEAD(&zombie_upcalls, ku, ku_link); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&kse_lock); } /* @@ -782,11 +800,11 @@ kse_GC(void) * we really don't care about the next instant.. */ if (!TAILQ_EMPTY(&zombie_upcalls)) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&kse_lock); ku_first = TAILQ_FIRST(&zombie_upcalls); if (ku_first) TAILQ_INIT(&zombie_upcalls); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&kse_lock); while (ku_first) { ku_next = TAILQ_NEXT(ku_first, ku_link); upcall_free(ku_first); @@ -818,9 +836,9 @@ thread_export_context(struct thread *td, */ PROC_LOCK(p); if (td->td_flags & TDF_NEEDSIGCHK) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_NEEDSIGCHK; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) postsig(sig); @@ -921,9 +939,9 @@ thread_statclock(int user) return (0); if (user) { /* Current always do via ast() */ - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_uuticks++; } else if (td->td_mailbox != NULL) td->td_usticks++; @@ -966,7 +984,7 @@ error: /* * This function is intended to be used to initialize a spare thread - * for upcall. Initialize thread's large data area outside sched_lock + * for upcall. Initialize thread's large data area outside the thread lock * for thread_schedule_upcall(). The crhold is also here to get it out * from the schedlock as it has a mutex op itself. * XXX BUG.. we need to get the cr ref after the thread has @@ -996,7 +1014,7 @@ thread_schedule_upcall(struct thread *td { struct thread *td2; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Schedule an upcall thread on specified kse_upcall, @@ -1018,7 +1036,10 @@ thread_schedule_upcall(struct thread *td */ bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); + sched_fork_thread(td, td2); + mtx_lock_spin(&kse_lock); thread_link(td2, ku->ku_proc); + mtx_unlock_spin(&kse_lock); /* inherit parts of blocked thread's context as a good template */ cpu_set_upcall(td2, td); /* Let the new thread become owner of the upcall */ @@ -1030,7 +1051,6 @@ thread_schedule_upcall(struct thread *td td2->td_inhibitors = 0; SIGFILLSET(td2->td_sigmask); SIG_CANTMASK(td2->td_sigmask); - sched_fork_thread(td, td2); return (td2); /* bogus.. 
should be a void function */ } @@ -1069,7 +1089,7 @@ thread_switchout(struct thread *td, int struct kse_upcall *ku; struct thread *td2; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If the outgoing thread is in threaded group and has never @@ -1101,7 +1121,9 @@ thread_switchout(struct thread *td, int td->td_pflags &= ~TDP_CAN_UNBIND; td2 = thread_schedule_upcall(td, ku); if (flags & SW_INVOL || nextthread) { + thread_lock(td2); sched_add(td2, SRQ_YIELDING); + thread_unlock(td2); } else { /* Keep up with reality.. we have one extra thread * in the picture.. and it's 'running'. @@ -1171,11 +1193,11 @@ thread_user_enter(struct thread *td) if (__predict_false(p->p_flag & P_TRACED)) { flags = fuword32(&tmbx->tm_dflags); if (flags & TMDF_SUSPEND) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(td->td_proc); /* fuword can block, check again */ if (td->td_upcall) ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(td->td_proc); } } } @@ -1256,7 +1278,7 @@ thread_userret(struct thread *td, struct WITNESS_WARN(WARN_PANIC, &p->p_mtx.lock_object, "thread exiting in userret"); sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_stopped(p); thread_exit(); /* NOTREACHED */ @@ -1268,22 +1290,22 @@ thread_userret(struct thread *td, struct if (p->p_numthreads > max_threads_per_proc) { max_threads_hits++; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_maxthrwaits++; while (p->p_numthreads > max_threads_per_proc) { if (p->p_numupcalls >= max_threads_per_proc) break; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH, "maxthreads", hz/10) != EWOULDBLOCK) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); break; } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); } } p->p_maxthrwaits--; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); } @@ -1300,9 +1322,9 @@ thread_userret(struct thread *td, struct td->td_pflags &= ~TDP_UPCALLING; if (ku->ku_flags & KUF_DOUPCALL) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); ku->ku_flags &= ~KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } /* * Set user context to the UTS @@ -1390,9 +1412,9 @@ thread_continued(struct proc *p) td = TAILQ_FIRST(&p->p_threads); if (td && (td->td_pflags & TDP_SA)) { FOREACH_UPCALL_IN_PROC(p, ku) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); ku->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); wakeup(&p->p_completed); } } Index: kern/kern_kthread.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_kthread.c,v retrieving revision 1.37 diff -u -p -r1.37 kern_kthread.c --- kern/kern_kthread.c 23 Jan 2007 08:46:50 -0000 1.37 +++ kern/kern_kthread.c 18 May 2007 10:37:01 -0000 @@ -113,9 +113,9 @@ kthread_create(void (*func)(void *), voi /* Delay putting it on the run queue until now. 
*/ if (!(flags & RFSTOPPED)) { - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_add(td, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } return 0; Index: kern/kern_lockf.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_lockf.c,v retrieving revision 1.54 diff -u -p -r1.54 kern_lockf.c --- kern/kern_lockf.c 29 Mar 2005 08:13:01 -0000 1.54 +++ kern/kern_lockf.c 18 May 2007 10:37:01 -0000 @@ -266,16 +266,19 @@ lf_setlock(lock) */ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) { - register struct proc *wproc; + struct proc *wproc; + struct proc *nproc; struct thread *td; - register struct lockf *waitblock; + struct lockf *waitblock; int i = 0; /* The block is waiting on something */ - /* XXXKSE this is not complete under threads */ wproc = (struct proc *)block->lf_id; - mtx_lock_spin(&sched_lock); +restart: + nproc = NULL; + PROC_SLOCK(wproc); FOREACH_THREAD_IN_PROC(wproc, td) { + thread_lock(td); while (td->td_wchan && (td->td_wmesg == lockstr) && (i++ < maxlockdepth)) { @@ -284,15 +287,20 @@ lf_setlock(lock) waitblock = waitblock->lf_next; if ((waitblock->lf_flags & F_POSIX) == 0) break; - wproc = (struct proc *)waitblock->lf_id; - if (wproc == (struct proc *)lock->lf_id) { - mtx_unlock_spin(&sched_lock); + nproc = (struct proc *)waitblock->lf_id; + if (nproc == (struct proc *)lock->lf_id) { + PROC_SUNLOCK(wproc); + thread_unlock(td); free(lock, M_LOCKF); return (EDEADLK); } } + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(wproc); + wproc = nproc; + if (wproc) + goto restart; } /* * For flock type locks, we must first remove Index: kern/kern_mutex.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_mutex.c,v retrieving revision 1.190 diff -u -p -r1.190 kern_mutex.c --- kern/kern_mutex.c 18 May 2007 15:04:59 -0000 1.190 +++ kern/kern_mutex.c 31 May 2007 22:07:57 -0000 @@ -127,6 +127,7 @@ struct lock_class lock_class_mtx_spin = /* * System-wide mutexes */ +struct mtx blocked_lock; struct mtx sched_lock; struct mtx Giant; @@ -305,6 +306,7 @@ void _mtx_lock_sleep(struct mtx *m, uintptr_t tid, int opts, const char *file, int line) { + struct turnstile *ts; #ifdef ADAPTIVE_MUTEXES volatile struct thread *owner; #endif @@ -334,7 +336,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t m->lock_object.lo_name, (void *)m->mtx_lock, file, line); while (!_obtain_lock(m, tid)) { - turnstile_lock(&m->lock_object); + ts = turnstile_trywait(&m->lock_object); v = m->mtx_lock; /* @@ -342,7 +344,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t * the turnstile chain lock. 
*/ if (v == MTX_UNOWNED) { - turnstile_release(&m->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -358,7 +360,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t */ if (v == MTX_CONTESTED) { m->mtx_lock = tid | MTX_CONTESTED; - turnstile_claim(&m->lock_object); + turnstile_claim(ts); break; } #endif @@ -370,7 +372,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t */ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) { - turnstile_release(&m->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -387,7 +389,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t if (m != &Giant && TD_IS_RUNNING(owner)) #endif { - turnstile_release(&m->lock_object); + turnstile_cancel(ts); while (mtx_owner(m) == owner && TD_IS_RUNNING(owner)) { cpu_spinwait(); } @@ -414,8 +416,7 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t /* * Block on the turnstile. */ - turnstile_wait(&m->lock_object, mtx_owner(m), - TS_EXCLUSIVE_QUEUE); + turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE); } #ifdef KTR if (cont_logged) { @@ -428,7 +429,25 @@ _mtx_lock_sleep(struct mtx *m, uintptr_t waittime, (file), (line)); } +static void +_mtx_lock_spin_failed(struct mtx *m) +{ + struct thread *td; + + td = mtx_owner(m); + + /* If the mutex is unlocked, try again. */ + if (td == NULL) + return; #ifdef SMP + printf( "spin lock %p (%s) held by %p (tid %d) too long\n", + m, m->lock_object.lo_name, td, td->td_tid); +#ifdef WITNESS + witness_display_spinlock(&m->lock_object, td); +#endif + panic("spin lock held too long"); +} + /* * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. * @@ -440,7 +459,6 @@ _mtx_lock_spin(struct mtx *m, uintptr_t int line) { int i = 0, contested = 0; - struct thread *td; uint64_t waittime = 0; if (LOCK_LOG_TEST(&m->lock_object, opts)) @@ -458,20 +476,8 @@ _mtx_lock_spin(struct mtx *m, uintptr_t } if (i < 60000000 || kdb_active || panicstr != NULL) DELAY(1); - else { - td = mtx_owner(m); - - /* If the mutex is unlocked, try again. */ - if (td == NULL) - continue; - printf( - "spin lock %p (%s) held by %p (tid %d) too long\n", - m, m->lock_object.lo_name, td, td->td_tid); -#ifdef WITNESS - witness_display_spinlock(&m->lock_object, td); -#endif - panic("spin lock held too long"); - } + else + _mtx_lock_spin_failed(m); cpu_spinwait(); } spinlock_enter(); @@ -482,10 +488,87 @@ _mtx_lock_spin(struct mtx *m, uintptr_t lock_profile_obtain_lock_success(&m->lock_object, contested, waittime, (file), (line)); - } #endif /* SMP */ +void +_thread_lock_flags(struct thread *td, int opts, const char *file, int line) +{ + struct mtx *m; + uintptr_t tid; + int i; + + i = 0; + tid = (uintptr_t)curthread; + for (;;) { +retry: + spinlock_enter(); + m = __DEVOLATILE(struct mtx *, td->td_lock); + WITNESS_CHECKORDER(&m->lock_object, + opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line); + while (!_obtain_lock(m, tid)) { + if (m->mtx_lock == tid) { + m->mtx_recurse++; + break; + } + /* Give interrupts a chance while we spin. 
*/ + spinlock_exit(); + while (m->mtx_lock != MTX_UNOWNED) { + if (i++ < 10000000) + cpu_spinwait(); + else if (i < 60000000 || + kdb_active || panicstr != NULL) + DELAY(1); + else + _mtx_lock_spin_failed(m); + cpu_spinwait(); + if (m != td->td_lock) + goto retry; + } + spinlock_enter(); + } + if (m == td->td_lock) + break; + _rel_spin_lock(m); /* does spinlock_exit() */ + } + WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); +} + +struct mtx * +thread_lock_block(struct thread *td) +{ + struct mtx *lock; + + spinlock_enter(); + THREAD_LOCK_ASSERT(td, MA_OWNED); + lock = __DEVOLATILE(struct mtx *, td->td_lock); + td->td_lock = &blocked_lock; + mtx_unlock_spin(lock); + + return (lock); +} + +void +thread_lock_unblock(struct thread *td, struct mtx *new) +{ + mtx_assert(new, MA_OWNED); + MPASS(td->td_lock == &blocked_lock); + atomic_store_rel_ptr((void *)&td->td_lock, (uintptr_t)new); + spinlock_exit(); +} + +void +thread_lock_set(struct thread *td, struct mtx *new) +{ + struct mtx *lock; + + mtx_assert(new, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); + lock = __DEVOLATILE(struct mtx *, td->td_lock); + td->td_lock = new; + mtx_unlock_spin(lock); +} + /* * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * @@ -508,7 +591,11 @@ _mtx_unlock_sleep(struct mtx *m, int opt return; } - turnstile_lock(&m->lock_object); + /* + * We have to lock the chain before the turnstile so this turnstile + * can be removed from the hash list if it is empty. + */ + turnstile_chain_lock(&m->lock_object); ts = turnstile_lookup(&m->lock_object); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); @@ -518,7 +605,7 @@ _mtx_unlock_sleep(struct mtx *m, int opt _release_lock_quick(m); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m); - turnstile_release(&m->lock_object); + turnstile_chain_unlock(&m->lock_object); return; } #else @@ -543,7 +630,12 @@ _mtx_unlock_sleep(struct mtx *m, int opt m); } #endif + /* + * This turnstile is now no longer associated with the mutex. We can + * unlock the chain lock so a new turnstile may take it's place. + */ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); + turnstile_chain_unlock(&m->lock_object); #ifndef PREEMPTION /* @@ -557,7 +649,7 @@ _mtx_unlock_sleep(struct mtx *m, int opt if (td->td_critnest > 0 || td1->td_priority >= td->td_priority) return; - mtx_lock_spin(&sched_lock); + thread_lock(td1); if (!TD_IS_RUNNING(td1)) { #ifdef notyet if (td->td_ithd != NULL) { @@ -582,7 +674,7 @@ _mtx_unlock_sleep(struct mtx *m, int opt CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", m, (void *)m->mtx_lock); } - mtx_unlock_spin(&sched_lock); + thread_unlock(td1); #endif } @@ -761,7 +853,10 @@ mutex_init(void) */ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); + mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN); + blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. 
*/ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE); mtx_init(&devmtx, "cdev", NULL, MTX_DEF); mtx_lock(&Giant); Index: kern/kern_poll.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_poll.c,v retrieving revision 1.28 diff -u -p -r1.28 kern_poll.c --- kern/kern_poll.c 6 Dec 2006 06:34:55 -0000 1.28 +++ kern/kern_poll.c 18 May 2007 10:37:01 -0000 @@ -580,17 +580,17 @@ poll_idle(void) rtp.prio = RTP_PRIO_MAX; /* lowest priority */ rtp.type = RTP_PRIO_IDLE; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(td->td_proc); rtp_to_pri(&rtp, td); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(td->td_proc); for (;;) { if (poll_in_idle_loop && poll_handlers > 0) { idlepoll_sleeping = 0; ether_poll(poll_each_burst); - mtx_lock_spin(&sched_lock); + thread_lock(td); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } else { idlepoll_sleeping = 1; tsleep(&idlepoll_sleeping, 0, "pollid", hz * 3); Index: kern/kern_proc.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_proc.c,v retrieving revision 1.248 diff -u -p -r1.248 kern_proc.c --- kern/kern_proc.c 1 Jun 2007 01:12:43 -0000 1.248 +++ kern/kern_proc.c 31 May 2007 20:40:17 -0000 @@ -177,6 +177,7 @@ proc_init(void *mem, int size, int flags td = thread_alloc(); bzero(&p->p_mtx, sizeof(struct mtx)); mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE); p->p_stats = pstats_alloc(); proc_linkup(p, td); sched_newproc(p, td); @@ -669,7 +670,7 @@ fill_kinfo_proc_only(struct proc *p, str kp->ki_sigcatch = ps->ps_sigcatch; mtx_unlock(&ps->ps_mtx); } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_state != PRS_NEW && p->p_state != PRS_ZOMBIE && p->p_vmspace != NULL) { @@ -695,7 +696,7 @@ fill_kinfo_proc_only(struct proc *p, str kp->ki_nice = p->p_nice; rufetch(p, &kp->ki_rusage); kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if ((p->p_sflag & PS_INMEM) && p->p_stats != NULL) { kp->ki_start = p->p_stats->p_start; timevaladd(&kp->ki_start, &boottime); @@ -747,7 +748,7 @@ fill_kinfo_proc_only(struct proc *p, str /* * Fill in information that is thread specific. - * Must be called with sched_lock locked. + * Must be called with p_slock locked. 
*/ static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp) @@ -755,7 +756,9 @@ fill_kinfo_thread(struct thread *td, str struct proc *p; p = td->td_proc; + PROC_SLOCK_ASSERT(p, MA_OWNED); + thread_lock(td); if (td->td_wmesg != NULL) strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg)); else @@ -813,6 +816,7 @@ fill_kinfo_thread(struct thread *td, str SIGSETOR(kp->ki_siglist, td->td_siglist); kp->ki_sigmask = td->td_sigmask; + thread_unlock(td); } /* @@ -824,10 +828,10 @@ fill_kinfo_proc(struct proc *p, struct k { fill_kinfo_proc_only(p, kp); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } struct pstats * @@ -894,14 +898,14 @@ sysctl_out_proc(struct proc *p, struct s fill_kinfo_proc_only(p, &kinfo_proc); if (flags & KERN_PROC_NOTHREADS) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), &kinfo_proc); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) FOREACH_THREAD_IN_PROC(p, td) { fill_kinfo_thread(td, &kinfo_proc); @@ -913,7 +917,7 @@ sysctl_out_proc(struct proc *p, struct s else error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } PROC_UNLOCK(p); if (error) @@ -1003,12 +1007,12 @@ sysctl_kern_proc(SYSCTL_HANDLER_ARGS) /* * Skip embryonic processes. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_state == PRS_NEW) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); continue; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_LOCK(p); KASSERT(p->p_ucred != NULL, ("process credential is NULL for non-NEW proc")); Index: kern/kern_resource.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_resource.c,v retrieving revision 1.173 diff -u -p -r1.173 kern_resource.c --- kern/kern_resource.c 1 Jun 2007 01:20:11 -0000 1.173 +++ kern/kern_resource.c 1 Jun 2007 02:10:51 -0000 @@ -263,9 +263,9 @@ donice(struct thread *td, struct proc *p n = PRIO_MIN; if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0) return (EACCES); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); sched_nice(p, n); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (0); } @@ -306,7 +306,7 @@ rtprio_thread(struct thread *td, struct case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->lwpid == 0 || uap->lwpid == td->td_tid) td1 = td; else @@ -315,7 +315,7 @@ rtprio_thread(struct thread *td, struct pri_to_rtp(td1, &rtp); else error = ESRCH; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: @@ -342,7 +342,7 @@ rtprio_thread(struct thread *td, struct } } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->lwpid == 0 || uap->lwpid == td->td_tid) td1 = td; else @@ -351,7 +351,7 @@ rtprio_thread(struct thread *td, struct error = rtp_to_pri(&rtp, td1); else error = ESRCH; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); break; default: error = EINVAL; @@ -402,7 +402,7 @@ rtprio(td, uap) case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * Return OUR priority if no pid specified, * or if 
one is, report the highest priority @@ -430,7 +430,7 @@ rtprio(td, uap) } } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: @@ -468,7 +468,7 @@ rtprio(td, uap) * do all the threads on that process. If we * specify our own pid we do the latter. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (uap->pid == 0) { error = rtp_to_pri(&rtp, td); } else { @@ -477,7 +477,7 @@ rtprio(td, uap) break; } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); break; default: error = EINVAL; @@ -492,9 +492,9 @@ rtp_to_pri(struct rtprio *rtp, struct th { u_char newpri; - mtx_assert(&sched_lock, MA_OWNED); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); + thread_lock(td); switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: newpri = PRI_MIN_REALTIME + rtp->prio; @@ -506,12 +506,14 @@ rtp_to_pri(struct rtprio *rtp, struct th newpri = PRI_MIN_IDLE + rtp->prio; break; default: + thread_unlock(td); return (EINVAL); } sched_class(td, rtp->type); /* XXX fix */ sched_user_prio(td, newpri); if (curthread == td) sched_prio(curthread, td->td_user_pri); /* XXX dubious */ + thread_unlock(td); return (0); } @@ -519,7 +521,7 @@ void pri_to_rtp(struct thread *td, struct rtprio *rtp) { - mtx_assert(&sched_lock, MA_OWNED); + thread_lock(td); switch (PRI_BASE(td->td_pri_class)) { case PRI_REALTIME: rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME; @@ -534,6 +536,7 @@ pri_to_rtp(struct thread *td, struct rtp break; } rtp->type = td->td_pri_class; + thread_unlock(td); } #if defined(COMPAT_43) @@ -634,10 +637,13 @@ lim_cb(void *arg) */ if (p->p_cpulimit == RLIM_INFINITY) return; - mtx_lock_spin(&sched_lock); - FOREACH_THREAD_IN_PROC(p, td) + PROC_SLOCK(p); + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); ruxagg(&p->p_rux, td); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + } + PROC_SUNLOCK(p); if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) { lim_rlimit(p, RLIMIT_CPU, &rlim); if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) { @@ -699,9 +705,9 @@ kern_setrlimit(td, which, limp) if (limp->rlim_cur != RLIM_INFINITY && p->p_cpulimit == RLIM_INFINITY) callout_reset(&p->p_limco, hz, lim_cb, p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_cpulimit = limp->rlim_cur; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) @@ -828,9 +834,7 @@ calcru(struct proc *p, struct timeval *u uint64_t u; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_NOTOWNED); - mtx_lock_spin(&sched_lock); - + PROC_SLOCK(p); /* * If we are getting stats for the current process, then add in the * stats that this thread has accumulated in its current time slice. 
@@ -843,9 +847,9 @@ calcru(struct proc *p, struct timeval *u p->p_rux.rux_runtime += u - PCPU_GET(switchtime); PCPU_SET(switchtime, u); } - /* Work on a copy of p_rux so we can let go of sched_lock */ + /* Work on a copy of p_rux so we can let go of p_slock */ rux = p->p_rux; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); calcru1(p, &rux, up, sp); /* Update the result from the p_rux copy */ p->p_rux.rux_uu = rux.rux_uu; @@ -1013,6 +1017,9 @@ ruadd(struct rusage *ru, struct rusage_e void ruxagg(struct rusage_ext *rux, struct thread *td) { + + THREAD_LOCK_ASSERT(td, MA_OWNED); + PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED); rux->rux_runtime += td->td_runtime; rux->rux_uticks += td->td_uticks; rux->rux_sticks += td->td_sticks; @@ -1033,17 +1040,19 @@ rufetch(struct proc *p, struct rusage *r struct thread *td; memset(ru, 0, sizeof(*ru)); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_ru == NULL) { KASSERT(p->p_numthreads > 0, ("rufetch: No threads or ru in proc %p", p)); FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); ruxagg(&p->p_rux, td); + thread_unlock(td); rucollect(ru, &td->td_ru); } } else *ru = *p->p_ru; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } /* Index: kern/kern_rwlock.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_rwlock.c,v retrieving revision 1.25 diff -u -p -r1.25 kern_rwlock.c --- kern/kern_rwlock.c 18 May 2007 15:04:59 -0000 1.25 +++ kern/kern_rwlock.c 20 May 2007 11:40:27 -0000 @@ -187,6 +187,7 @@ _rw_wunlock(struct rwlock *rw, const cha void _rw_rlock(struct rwlock *rw, const char *file, int line) { + struct turnstile *ts; #ifdef ADAPTIVE_RWLOCKS volatile struct thread *owner; #endif @@ -256,7 +257,7 @@ _rw_rlock(struct rwlock *rw, const char * has a write lock, so acquire the turnstile lock so we can * begin the process of blocking. */ - turnstile_lock(&rw->lock_object); + ts = turnstile_trywait(&rw->lock_object); /* * The lock might have been released while we spun, so @@ -265,7 +266,7 @@ _rw_rlock(struct rwlock *rw, const char */ x = rw->rw_lock; if (x & RW_LOCK_READ) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -279,7 +280,7 @@ _rw_rlock(struct rwlock *rw, const char if (!(x & RW_LOCK_READ_WAITERS)) { if (!atomic_cmpset_ptr(&rw->rw_lock, x, x | RW_LOCK_READ_WAITERS)) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -296,7 +297,7 @@ _rw_rlock(struct rwlock *rw, const char */ owner = (struct thread *)RW_OWNER(x); if (TD_IS_RUNNING(owner)) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); @@ -314,7 +315,7 @@ _rw_rlock(struct rwlock *rw, const char if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); - turnstile_wait(&rw->lock_object, rw_owner(rw), TS_SHARED_QUEUE); + turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); @@ -407,7 +408,7 @@ _rw_runlock(struct rwlock *rw, const cha * Ok, we know we have a waiting writer and we think we * are the last reader, so grab the turnstile lock. 
*/ - turnstile_lock(&rw->lock_object); + turnstile_chain_lock(&rw->lock_object); /* * Try to drop our lock leaving the lock in a unlocked @@ -427,7 +428,7 @@ _rw_runlock(struct rwlock *rw, const cha */ if (!atomic_cmpset_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | RW_LOCK_WRITE_WAITERS, RW_UNLOCKED)) { - turnstile_release(&rw->lock_object); + turnstile_chain_unlock(&rw->lock_object); continue; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) @@ -445,6 +446,7 @@ _rw_runlock(struct rwlock *rw, const cha MPASS(ts != NULL); turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE); turnstile_unpend(ts, TS_SHARED_LOCK); + turnstile_chain_unlock(&rw->lock_object); break; } lock_profile_release_lock(&rw->lock_object); @@ -458,6 +460,7 @@ _rw_runlock(struct rwlock *rw, const cha void _rw_wlock_hard(struct rwlock *rw, uintptr_t tid, const char *file, int line) { + struct turnstile *ts; #ifdef ADAPTIVE_RWLOCKS volatile struct thread *owner; #endif @@ -468,7 +471,7 @@ _rw_wlock_hard(struct rwlock *rw, uintpt rw->lock_object.lo_name, (void *)rw->rw_lock, file, line); while (!_rw_write_lock(rw, tid)) { - turnstile_lock(&rw->lock_object); + ts = turnstile_trywait(&rw->lock_object); v = rw->rw_lock; /* @@ -476,7 +479,7 @@ _rw_wlock_hard(struct rwlock *rw, uintpt * turnstile chain lock, try again. */ if (v == RW_UNLOCKED) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -495,12 +498,12 @@ _rw_wlock_hard(struct rwlock *rw, uintpt if (atomic_cmpset_acq_ptr(&rw->rw_lock, RW_UNLOCKED | RW_LOCK_WRITE_WAITERS, tid | RW_LOCK_WRITE_WAITERS)) { - turnstile_claim(&rw->lock_object); + turnstile_claim(ts); CTR2(KTR_LOCK, "%s: %p claimed by new writer", __func__, rw); break; } - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -513,7 +516,7 @@ _rw_wlock_hard(struct rwlock *rw, uintpt if (!(v & RW_LOCK_WRITE_WAITERS)) { if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_WRITE_WAITERS)) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); cpu_spinwait(); continue; } @@ -530,7 +533,7 @@ _rw_wlock_hard(struct rwlock *rw, uintpt */ owner = (struct thread *)RW_OWNER(v); if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) { - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); @@ -548,8 +551,7 @@ _rw_wlock_hard(struct rwlock *rw, uintpt if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); - turnstile_wait(&rw->lock_object, rw_owner(rw), - TS_EXCLUSIVE_QUEUE); + turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); @@ -574,7 +576,7 @@ _rw_wunlock_hard(struct rwlock *rw, uint if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, rw); - turnstile_lock(&rw->lock_object); + turnstile_chain_lock(&rw->lock_object); ts = turnstile_lookup(&rw->lock_object); #ifdef ADAPTIVE_RWLOCKS @@ -587,7 +589,7 @@ _rw_wunlock_hard(struct rwlock *rw, uint atomic_store_rel_ptr(&rw->rw_lock, RW_UNLOCKED); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p no sleepers", __func__, rw); - turnstile_release(&rw->lock_object); + turnstile_chain_unlock(&rw->lock_object); return; } #else @@ -640,6 +642,7 @@ _rw_wunlock_hard(struct rwlock *rw, uint CTR2(KTR_LOCK, "%s: %p no sleepers 2", __func__, rw); atomic_store_rel_ptr(&rw->rw_lock, v); turnstile_disown(ts); + 
turnstile_chain_unlock(&rw->lock_object); return; } #endif @@ -651,6 +654,7 @@ _rw_wunlock_hard(struct rwlock *rw, uint turnstile_broadcast(ts, queue); atomic_store_rel_ptr(&rw->rw_lock, v); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); + turnstile_chain_unlock(&rw->lock_object); } /* @@ -662,6 +666,7 @@ int _rw_try_upgrade(struct rwlock *rw, const char *file, int line) { uintptr_t v, tid; + struct turnstile *ts; int success; KASSERT(rw->rw_lock != RW_DESTROYED, @@ -686,7 +691,7 @@ _rw_try_upgrade(struct rwlock *rw, const * Ok, we think we have write waiters, so lock the * turnstile. */ - turnstile_lock(&rw->lock_object); + ts = turnstile_trywait(&rw->lock_object); /* * Try to switch from one reader to a writer again. This time @@ -705,9 +710,9 @@ _rw_try_upgrade(struct rwlock *rw, const #else if (success && v) #endif - turnstile_claim(&rw->lock_object); + turnstile_claim(ts); else - turnstile_release(&rw->lock_object); + turnstile_cancel(ts); out: LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line); if (success) @@ -745,7 +750,7 @@ _rw_downgrade(struct rwlock *rw, const c * Ok, we think we have waiters, so lock the turnstile so we can * read the waiter flags without any races. */ - turnstile_lock(&rw->lock_object); + turnstile_chain_lock(&rw->lock_object); v = rw->rw_lock; MPASS(v & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)); @@ -779,12 +784,9 @@ _rw_downgrade(struct rwlock *rw, const c (v & RW_LOCK_WRITE_WAITERS)); if (v & RW_LOCK_READ_WAITERS) turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); -#ifdef ADAPTIVE_RWLOCKS - else if (ts == NULL) - turnstile_release(&rw->lock_object); -#endif - else + else if (ts) turnstile_disown(ts); + turnstile_chain_unlock(&rw->lock_object); out: LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line); } Index: kern/kern_shutdown.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_shutdown.c,v retrieving revision 1.181 diff -u -p -r1.181 kern_shutdown.c --- kern/kern_shutdown.c 4 Mar 2007 22:36:46 -0000 1.181 +++ kern/kern_shutdown.c 18 May 2007 10:37:02 -0000 @@ -267,9 +267,9 @@ boot(int howto) * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, 0); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0")); #endif /* We're in the process of rebooting. */ @@ -340,9 +340,9 @@ boot(int howto) */ DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); DELAY(1000); } PICKUP_GIANT(); @@ -555,9 +555,9 @@ panic(const char *fmt, ...) 
} #endif #endif - mtx_lock_spin(&sched_lock); + /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; - mtx_unlock_spin(&sched_lock); + /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; boot(bootopt); Index: kern/kern_sig.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_sig.c,v retrieving revision 1.345 diff -u -p -r1.345 kern_sig.c --- kern/kern_sig.c 1 Jun 2007 01:12:43 -0000 1.345 +++ kern/kern_sig.c 31 May 2007 20:40:17 -0000 @@ -511,10 +511,10 @@ sigqueue_delete_set_proc(struct proc *p, sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); sigqueue_flush(&worklist); } @@ -552,7 +552,7 @@ cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); - mtx_assert(&sched_lock, MA_NOTOWNED); + THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } @@ -588,9 +588,9 @@ signotify(struct thread *td) if (! SIGISEMPTY(set)) sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set); if (SIGPENDING(td)) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } #ifdef KSE if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { @@ -758,7 +758,9 @@ kern_sigaction(td, sig, act, oact, flags } #endif /* never to be seen again */ + PROC_SLOCK(p); sigqueue_delete_proc(p, sig); + PROC_SUNLOCK(p); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); @@ -954,7 +956,9 @@ execsigs(struct proc *p) if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); + PROC_SLOCK(p); sigqueue_delete_proc(p, sig); + PROC_SUNLOCK(p); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } @@ -1849,7 +1853,7 @@ trapsignal(struct thread *td, ksiginfo_t thread_user_enter(td); PROC_LOCK(p); SIGDELSET(td->td_sigmask, sig); - mtx_lock_spin(&sched_lock); + thread_lock(td); /* * Force scheduling an upcall, so UTS has chance to * process the signal before thread runs again in @@ -1857,7 +1861,7 @@ trapsignal(struct thread *td, ksiginfo_t */ if (td->td_upcall) td->td_upcall->ku_flags |= KUF_DOUPCALL; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } else { PROC_LOCK(p); } @@ -1952,7 +1956,7 @@ sigtd(struct proc *p, int sig, int prop) if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; @@ -1961,7 +1965,7 @@ sigtd(struct proc *p, int sig, int prop) } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (signal_td); } @@ -2128,7 +2132,9 @@ do_tdsignal(struct proc *p, struct threa ksiginfo_tryfree(ksi); return (ret); } + PROC_SLOCK(p); sigqueue_delete_proc(p, SIGCONT); + PROC_SUNLOCK(p); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); @@ -2166,6 +2172,7 @@ do_tdsignal(struct proc *p, struct threa * waking up threads so that they can cross the user boundary. * We try do the per-process part here. */ + PROC_SLOCK(p); if (P_SHOULDSTOP(p)) { /* * The process is in stopped mode. 
All the threads should be @@ -2177,6 +2184,7 @@ do_tdsignal(struct proc *p, struct threa * so no further action is necessary. * No signal can restart us. */ + PROC_SUNLOCK(p); goto out; } @@ -2203,15 +2211,21 @@ do_tdsignal(struct proc *p, struct threa */ p->p_flag &= ~P_STOPPED_SIG; if (p->p_numthreads == p->p_suspcount) { + PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xstat = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); + PROC_SLOCK(p); } if (action == SIG_DFL) { + thread_unsuspend(p); + PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); - } else if (action == SIG_CATCH) { + goto out; + } + if (action == SIG_CATCH) { #ifdef KSE /* * The process wants to catch it so it needs @@ -2223,20 +2237,18 @@ do_tdsignal(struct proc *p, struct threa * single thread is runnable asap. * XXXKSE for now however, make them all run. */ -#else +#endif /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ -#endif goto runfast; } /* * The signal is not ignored or caught. */ - mtx_lock_spin(&sched_lock); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); goto out; } @@ -2246,6 +2258,7 @@ do_tdsignal(struct proc *p, struct threa * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ + PROC_SUNLOCK(p); p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; @@ -2259,10 +2272,11 @@ do_tdsignal(struct proc *p, struct threa * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) sleepq_abort(td, intrval); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. Threads waiting on them will @@ -2270,9 +2284,10 @@ do_tdsignal(struct proc *p, struct threa */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { - mtx_lock_spin(&sched_lock); + thread_lock(td); tdsigwakeup(td, sig, action, intrval); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); goto out; } @@ -2283,7 +2298,6 @@ do_tdsignal(struct proc *p, struct threa goto out; p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; - mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* @@ -2294,10 +2308,10 @@ do_tdsignal(struct proc *p, struct threa * should never be equal to p_suspcount. */ thread_stopped(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xstat); } else - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); goto out; } else @@ -2305,6 +2319,7 @@ do_tdsignal(struct proc *p, struct threa /* NOTREACHED */ } else { /* Not in "NORMAL" state. discard the signal. */ + PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } @@ -2315,13 +2330,14 @@ do_tdsignal(struct proc *p, struct threa */ runfast: - mtx_lock_spin(&sched_lock); + thread_lock(td); tdsigwakeup(td, sig, action, intrval); + thread_unlock(td); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); out: - /* If we jump here, sched_lock should not be owned. */ - mtx_assert(&sched_lock, MA_NOTOWNED); + /* If we jump here, proc slock should not be owned. 
*/ + PROC_SLOCK_ASSERT(p, MA_NOTOWNED); return (ret); } @@ -2337,7 +2353,8 @@ tdsigwakeup(struct thread *td, int sig, register int prop; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); prop = sigprop(sig); /* @@ -2366,14 +2383,16 @@ tdsigwakeup(struct thread *td, int sig, * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); + thread_lock(td); return; } @@ -2403,9 +2422,10 @@ sig_suspend_threads(struct thread *td, s struct thread *td2; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td2) { + thread_lock(td2); if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR) && !TD_IS_SUSPENDED(td2)) { @@ -2418,6 +2438,7 @@ sig_suspend_threads(struct thread *td, s forward_signal(td2); #endif } + thread_unlock(td2); } } @@ -2430,15 +2451,17 @@ ptracestop(struct thread *td, int sig) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_XSIG; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); td->td_xsig = sig; + PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) { if (p->p_flag & P_SINGLE_EXIT) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_XSIG; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + PROC_SUNLOCK(p); return (sig); } /* @@ -2448,26 +2471,19 @@ ptracestop(struct thread *td, int sig) p->p_xstat = sig; p->p_xthread = td; p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE); - mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 0); stopme: - thread_stopped(p); - thread_suspend_one(td); - PROC_UNLOCK(p); - DROP_GIANT(); - mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); - PICKUP_GIANT(); - PROC_LOCK(p); - if (!(p->p_flag & P_TRACED)) + thread_suspend_switch(td); + if (!(p->p_flag & P_TRACED)) { break; + } if (td->td_flags & TDF_DBSUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; - mtx_lock_spin(&sched_lock); goto stopme; } } + PROC_SUNLOCK(p); return (td->td_xsig); } @@ -2621,16 +2637,10 @@ issignal(td) &p->p_mtx.lock_object, "Catching SIGSTOP"); p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); sig_suspend_threads(td, p, 0); - thread_stopped(p); - thread_suspend_one(td); - PROC_UNLOCK(p); - DROP_GIANT(); - mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); - PICKUP_GIANT(); - PROC_LOCK(p); + thread_suspend_switch(td); + PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); break; } else if (prop & SA_IGNORE) { @@ -2672,18 +2682,18 @@ thread_stopped(struct proc *p) int n; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? 
CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); } } Index: kern/kern_subr.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_subr.c,v retrieving revision 1.102 diff -u -p -r1.102 kern_subr.c --- kern/kern_subr.c 16 Jan 2007 11:40:55 -0000 1.102 +++ kern/kern_subr.c 18 May 2007 10:37:02 -0000 @@ -453,11 +453,11 @@ uio_yield(void) struct thread *td; td = curthread; - mtx_lock_spin(&sched_lock); DROP_GIANT(); + thread_lock(td); sched_prio(td, td->td_user_pri); mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PICKUP_GIANT(); } Index: kern/kern_switch.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_switch.c,v retrieving revision 1.129 diff -u -p -r1.129 kern_switch.c --- kern/kern_switch.c 8 Feb 2007 01:52:25 -0000 1.129 +++ kern/kern_switch.c 31 May 2007 21:08:40 -0000 @@ -49,6 +49,8 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_sw #include #endif +#include + /* Uncomment this to enable logging of critical_enter/exit. */ #if 0 #define KTR_CRITICAL KTR_SCHED @@ -77,6 +79,49 @@ static int kern_sched_preemption = 0; SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD, &kern_sched_preemption, 0, "Kernel preemption enabled"); +#ifdef SCHED_STATS +long switch_preempt; +long switch_owepreempt; +long switch_turnstile; +long switch_sleepq; +long switch_sleepqtimo; +long switch_relinquish; +long switch_needresched; +static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats"); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, ""); +static int +sysctl_stats_reset(SYSCTL_HANDLER_ARGS) +{ + int error; + int val; + + val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (val == 0) + return (0); + switch_preempt = 0; + switch_owepreempt = 0; + switch_turnstile = 0; + switch_sleepq = 0; + switch_sleepqtimo = 0; + switch_relinquish = 0; + switch_needresched = 0; + + return (0); +} + +SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL, + 0, sysctl_stats_reset, "I", "Reset scheduler statistics"); +#endif + /************************************************************************ * Functions that manipulate runnability from a thread perspective. 
* ************************************************************************/ @@ -142,13 +187,13 @@ critical_exit(void) #ifdef PREEMPTION if (td->td_critnest == 1) { td->td_critnest = 0; - mtx_assert(&sched_lock, MA_NOTOWNED); if (td->td_owepreempt) { td->td_critnest = 1; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_critnest--; + SCHED_STAT_INC(switch_owepreempt); mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } else #endif @@ -173,7 +218,6 @@ maybe_preempt(struct thread *td) int cpri, pri; #endif - mtx_assert(&sched_lock, MA_OWNED); #ifdef PREEMPTION /* * The new thread should not preempt the current thread if any of the @@ -199,6 +243,7 @@ maybe_preempt(struct thread *td) * to the new thread. */ ctd = curthread; + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT ((ctd->td_sched != NULL && ctd->td_sched->ts_thread == ctd), ("thread has no (or wrong) sched-private part.")); KASSERT((td->td_inhibitors == 0), @@ -219,15 +264,25 @@ maybe_preempt(struct thread *td) ctd->td_owepreempt = 1; return (0); } - /* * Thread is runnable but not yet put on system run queue. */ + MPASS(ctd->td_lock == &sched_lock); + MPASS(td->td_lock == &sched_lock); MPASS(TD_ON_RUNQ(td)); TD_SET_RUNNING(td); CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, td->td_proc->p_pid, td->td_proc->p_comm); + SCHED_STAT_INC(switch_preempt); mi_switch(SW_INVOL|SW_PREEMPT, td); + /* + * td's lock pointer may have changed. We have to return with it + * locked. + */ + spinlock_enter(); + thread_unlock(ctd); + thread_lock(td); + spinlock_exit(); return (1); #else return (0); @@ -442,7 +497,6 @@ runq_choose(struct runq *rq) struct td_sched *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; #if defined(SMP) && defined(SCHED_4BSD) @@ -484,7 +538,6 @@ runq_choose_from(struct runq *rq, u_char struct td_sched *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); if ((pri = runq_findbit_from(rq, idx)) != -1) { rqh = &rq->rq_queues[pri]; ts = TAILQ_FIRST(rqh); @@ -519,9 +572,20 @@ runq_remove_idx(struct runq *rq, struct KASSERT(ts->ts_thread->td_proc->p_sflag & PS_INMEM, ("runq_remove_idx: process swapped out")); pri = ts->ts_rqindex; + KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri)); rqh = &rq->rq_queues[pri]; CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p", ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh); + { + struct td_sched *nts; + + TAILQ_FOREACH(nts, rqh, ts_procq) + if (nts == ts) + break; + if (ts != nts) + panic("runq_remove_idx: ts %p not on rqindex %d", + ts, pri); + } TAILQ_REMOVE(rqh, ts, ts_procq); if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_remove_idx: empty"); @@ -589,18 +653,4 @@ sched_set_concurrency(struct proc *p, in { } -/* - * Called from thread_exit() for all exiting thread - * - * Not to be confused with sched_exit_thread() - * that is only called from thread_exit() for threads exiting - * without the rest of the process exiting because it is also called from - * sched_exit() and we wouldn't want to call it twice. - * XXX This can probably be fixed. 
- */ -void -sched_thread_exit(struct thread *td) -{ -} - #endif /* KERN_SWITCH_INCLUDE */ Index: kern/kern_synch.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_synch.c,v retrieving revision 1.298 diff -u -p -r1.298 kern_synch.c --- kern/kern_synch.c 1 Jun 2007 01:20:11 -0000 1.298 +++ kern/kern_synch.c 31 May 2007 21:03:27 -0000 @@ -213,9 +213,9 @@ _sleep(ident, lock, priority, wmesg, tim */ pri = priority & PRIMASK; if (pri != 0 && pri != td->td_priority) { - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } if (timo && catch) @@ -362,6 +362,7 @@ wakeup_one(ident) sleepq_lock(ident); sleepq_signal(ident, SLEEPQ_SLEEP, -1, 0); + sleepq_release(ident); } /* @@ -374,8 +375,8 @@ mi_switch(int flags, struct thread *newt struct thread *td; struct proc *p; - mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); td = curthread; /* XXX */ + THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS @@ -394,12 +395,15 @@ mi_switch(int flags, struct thread *newt * Don't perform context switches from the debugger. */ if (kdb_active) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } + /* + * XXX Need proc lock for stats! + */ if (flags & SW_VOL) td->td_ru.ru_nvcsw++; else @@ -466,7 +470,7 @@ setrunnable(struct thread *td) struct proc *p; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); switch (p->p_state) { case PRS_ZOMBIE: panic("setrunnable(1)"); @@ -495,7 +499,7 @@ setrunnable(struct thread *td) if ((p->p_sflag & PS_SWAPPINGIN) == 0) { p->p_sflag |= PS_SWAPINREQ; /* - * due to a LOR between sched_lock and + * due to a LOR between the thread lock and * the sleepqueue chain locks, use * lower level scheduling functions. */ Index: kern/kern_thr.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_thr.c,v retrieving revision 1.59 diff -u -p -r1.59 kern_thr.c --- kern/kern_thr.c 23 Jan 2007 08:46:50 -0000 1.59 +++ kern/kern_thr.c 18 May 2007 10:37:02 -0000 @@ -226,12 +226,15 @@ create_thread(struct thread *td, mcontex PROC_LOCK(td->td_proc); td->td_proc->p_flag |= P_HADTHREADS; newtd->td_sigmask = td->td_sigmask; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_link(newtd, p); - PROC_UNLOCK(p); - + thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); + thread_unlock(td); + PROC_SUNLOCK(p); + PROC_UNLOCK(p); + thread_lock(newtd); if (rtp != NULL) { if (!(td->td_pri_class == PRI_TIMESHARE && rtp->type == RTP_PRIO_NORMAL)) { @@ -242,7 +245,7 @@ create_thread(struct thread *td, mcontex TD_SET_CAN_RUN(newtd); /* if ((flags & THR_SUSPENDED) == 0) */ sched_add(newtd, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(newtd); return (error); } @@ -275,7 +278,7 @@ thr_exit(struct thread *td, struct thr_e PROC_LOCK(p); sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * Shutting down last thread in the proc. 
This will actually @@ -286,7 +289,7 @@ thr_exit(struct thread *td, struct thr_e thread_exit(); /* NOTREACHED */ } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); return (0); } @@ -379,9 +382,9 @@ kern_thr_suspend(struct thread *td, stru error = msleep((void *)td, &td->td_proc->p_mtx, PCATCH, "lthr", hz); if (td->td_flags & TDF_THRWAKEUP) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_THRWAKEUP; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_UNLOCK(td->td_proc); return (0); } @@ -414,9 +417,9 @@ thr_wake(struct thread *td, struct thr_w PROC_UNLOCK(p); return (ESRCH); } - mtx_lock_spin(&sched_lock); + thread_lock(ttd); ttd->td_flags |= TDF_THRWAKEUP; - mtx_unlock_spin(&sched_lock); + thread_unlock(ttd); wakeup((void *)ttd); PROC_UNLOCK(p); return (0); Index: kern/kern_thread.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_thread.c,v retrieving revision 1.245 diff -u -p -r1.245 kern_thread.c --- kern/kern_thread.c 1 Jun 2007 01:12:43 -0000 1.245 +++ kern/kern_thread.c 1 Jun 2007 02:09:25 -0000 @@ -70,8 +70,8 @@ int virtual_cpu; #endif TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); -struct mtx kse_zombie_lock; -MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN); +struct mtx zombie_lock; +MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); #ifdef KSE static int @@ -121,14 +121,7 @@ thread_ctor(void *mem, int size, void *a /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving - * end of a context switch. A context switch must occur inside a - * critical section, and in fact, includes hand-off of the sched_lock. - * After a context switch to a newly created thread, it will release - * sched_lock for the first time, and its td_critnest will hit 0 for - * the first time. This happens on the far end of a context switch, - * and when it context switches away from itself, it will in fact go - * back into a critical section, and hand off the sched lock to the - * next thread. + * end of a context switch. */ td->td_critnest = 1; @@ -222,6 +215,7 @@ thread_fini(void *mem, int size) void proc_linkup(struct proc *p, struct thread *td) { + TAILQ_INIT(&p->p_threads); /* all threads in proc */ TAILQ_INIT(&p->p_upcalls); /* upcall list */ sigqueue_init(&p->p_sigqueue, p); @@ -260,9 +254,9 @@ threadinit(void) void thread_stash(struct thread *td) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&zombie_lock); } /* @@ -278,11 +272,11 @@ thread_reap(void) * we really don't care about the next instant.. 
*/ if (!TAILQ_EMPTY(&zombie_threads)) { - mtx_lock_spin(&kse_zombie_lock); + mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); - mtx_unlock_spin(&kse_zombie_lock); + mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); if (td_first->td_ucred) @@ -358,8 +352,9 @@ thread_exit(void) td = curthread; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); + PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, @@ -390,28 +385,13 @@ thread_exit(void) */ cpu_thread_exit(td); /* XXXSMP */ -#ifdef KSE - /* - * The thread is exiting. scheduler can release its stuff - * and collect stats etc. - * XXX this is not very right, since PROC_UNLOCK may still - * need scheduler stuff. - */ - sched_thread_exit(td); -#endif - /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime)); PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); cnt.v_swtch++; - /* - * Aggregate this thread's tick stats in the parent so they are not - * lost. Also add the child usage to our own when the final thread - * exits. - */ - ruxagg(&p->p_rux, td); + /* Add the child usage to our own when the final thread exits. */ if (p->p_numthreads == 1) ruadd(p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); /* @@ -424,7 +404,13 @@ thread_exit(void) */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { + thread_lock(td); +#ifdef KSE + kse_unlink(td); +#else thread_unlink(td); +#endif + thread_unlock(td); /* Impart our resource usage on another thread */ td2 = FIRST_THREAD_IN_PROC(p); rucollect(&td2->td_ru, &td->td_ru); @@ -437,7 +423,9 @@ thread_exit(void) */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { + thread_lock(p->p_singlethread); thread_unsuspend_one(p->p_singlethread); + thread_unlock(p->p_singlethread); } } @@ -454,8 +442,6 @@ thread_exit(void) */ upcall_remove(td); #endif - - PROC_UNLOCK(p); PCPU_SET(deadthread, td); } else { /* @@ -473,17 +459,15 @@ thread_exit(void) */ panic ("thread_exit: Last thread exiting on its own"); } - } else { - /* - * non threaded process comes here. - * This includes an EX threaded process that is coming - * here via exit1(). (exit1 dethreads the proc first). - */ - PROC_UNLOCK(p); - } + } + PROC_UNLOCK(p); + thread_lock(td); + /* Aggregate our tick statistics into our parents rux. */ + ruxagg(&p->p_rux, td); + PROC_SUNLOCK(p); td->td_state = TDS_INACTIVE; CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); - cpu_throw(td, choosethread()); + sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } @@ -532,6 +516,11 @@ void thread_link(struct thread *td, struct proc *p) { + /* + * XXX This can't be enabled because it's called for proc0 before + * it's spinlock has been created. 
+ * PROC_SLOCK_ASSERT(p, MA_OWNED); + */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = 0; @@ -579,7 +568,7 @@ thread_unlink(struct thread *td) { struct proc *p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ @@ -631,7 +620,7 @@ thread_single(int mode) p->p_flag &= ~P_SINGLE_BOUNDARY; } p->p_flag |= P_STOPPED_SINGLE; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_singlethread = td; if (mode == SINGLE_EXIT) remaining = p->p_numthreads; @@ -645,6 +634,7 @@ thread_single(int mode) FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; + thread_lock(td2); td2->td_flags |= TDF_ASTPENDING; if (TD_IS_INHIBITED(td2)) { switch (mode) { @@ -666,8 +656,10 @@ thread_single(int mode) sleepq_abort(td2, ERESTART); break; default: - if (TD_IS_SUSPENDED(td2)) + if (TD_IS_SUSPENDED(td2)) { + thread_unlock(td2); continue; + } /* * maybe other inhibited states too? */ @@ -683,6 +675,7 @@ thread_single(int mode) forward_signal(td2); } #endif + thread_unlock(td2); } if (mode == SINGLE_EXIT) remaining = p->p_numthreads; @@ -702,13 +695,7 @@ stopme: * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ - thread_stopped(p); - thread_suspend_one(td); - PROC_UNLOCK(p); - mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); - PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + thread_suspend_switch(td); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) @@ -727,7 +714,7 @@ stopme: p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT); thread_unthread(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (0); } @@ -800,7 +787,7 @@ thread_suspend_check(int return_instead) if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) sigqueue_flush(&td->td_sigqueue); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); thread_stopped(p); /* * If the process is waiting for us to exit, @@ -809,7 +796,15 @@ thread_suspend_check(int return_instead) */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) thread_exit(); - + if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { + if (p->p_numthreads == p->p_suspcount + 1) { + thread_lock(p->p_singlethread); + thread_unsuspend_one(p->p_singlethread); + thread_unlock(p->p_singlethread); + } + } + PROC_UNLOCK(p); + thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. @@ -819,29 +814,52 @@ thread_suspend_check(int return_instead) p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } - if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { - if (p->p_numthreads == p->p_suspcount) - thread_unsuspend_one(p->p_singlethread); - } - PROC_UNLOCK(p); + PROC_SUNLOCK(p); mi_switch(SW_INVOL, NULL); - if (return_instead == 0) { - p->p_boundary_count--; + if (return_instead == 0) td->td_flags &= ~TDF_BOUNDARY; - } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_LOCK(p); + if (return_instead == 0) + p->p_boundary_count--; } return (0); } void +thread_suspend_switch(struct thread *td) +{ + struct proc *p; + + p = td->td_proc; + KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); + PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + /* + * We implement thread_suspend_one in stages here to avoid + * dropping the proc lock while the thread lock is owned. 
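/*
 * A single-threaded trace of the lock staging described for
 * thread_suspend_switch(): the proc (sleep) lock is dropped before the
 * per-thread spin lock is taken, and re-taken only after the thread lock
 * has been released again.  The routines below are printf/assert stubs that
 * only record what is held; they are not the kernel primitives, and Giant
 * and mi_switch() are elided.
 */
#include <assert.h>
#include <stdio.h>

static int proc_held, proc_sheld, td_held;

static void xproc_lock(void)    { assert(!td_held); proc_held = 1; puts("PROC_LOCK"); }
static void xproc_unlock(void)  { assert(!td_held); proc_held = 0; puts("PROC_UNLOCK"); }
static void xproc_slock(void)   { proc_sheld = 1; puts("PROC_SLOCK"); }
static void xproc_sunlock(void) { proc_sheld = 0; puts("PROC_SUNLOCK"); }
static void xthread_lock(void)  { td_held = 1; puts("thread_lock"); }
static void xthread_unlock(void){ td_held = 0; puts("thread_unlock"); }

int
main(void)
{
	xproc_lock();		/* thread_suspend_switch() is entered with */
	xproc_slock();		/* both proc locks held (see its asserts) */
	/* thread_stopped(p); p->p_suspcount++; */
	xproc_unlock();		/* drop the sleep lock first... */
	xthread_lock();		/* ...then take the per-thread spin lock */
	/* TD_SET_SUSPENDED(td); */
	xproc_sunlock();
	/* DROP_GIANT(); mi_switch(SW_VOL, NULL); */
	xthread_unlock();
	/* PICKUP_GIANT(); */
	xproc_lock();		/* legal again: no spin lock is held */
	xproc_slock();
	return (0);
}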
+ */ + thread_stopped(p); + p->p_suspcount++; + PROC_UNLOCK(p); + thread_lock(td); + TD_SET_SUSPENDED(td); + PROC_SUNLOCK(p); + DROP_GIANT(); + mi_switch(SW_VOL, NULL); + thread_unlock(td); + PICKUP_GIANT(); + PROC_LOCK(p); + PROC_SLOCK(p); +} + +void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); - PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; TD_SET_SUSPENDED(td); @@ -852,8 +870,8 @@ thread_unsuspend_one(struct thread *td) { struct proc *p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); - PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); p->p_suspcount--; @@ -868,13 +886,15 @@ thread_unsuspend(struct proc *p) { struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (TD_IS_SUSPENDED(td)) { thread_unsuspend_one(td); } + thread_unlock(td); } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { @@ -883,7 +903,9 @@ thread_unsuspend(struct proc *p) * threading request. Now we've downgraded to single-threaded, * let it continue. */ + thread_lock(p->p_singlethread); thread_unsuspend_one(p->p_singlethread); + thread_unlock(p->p_singlethread); } } @@ -900,7 +922,7 @@ thread_single_end(void) p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_singlethread = NULL; /* * If there are other threads they mey now run, @@ -910,12 +932,14 @@ thread_single_end(void) */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (TD_IS_SUSPENDED(td)) { thread_unsuspend_one(td); } + thread_unlock(td); } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } struct thread * @@ -924,11 +948,11 @@ thread_find(struct proc *p, lwpid_t tid) struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); return (td); } Index: kern/kern_time.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_time.c,v retrieving revision 1.140 diff -u -p -r1.140 kern_time.c --- kern/kern_time.c 22 Apr 2007 15:31:21 -0000 1.140 +++ kern/kern_time.c 31 May 2007 21:04:07 -0000 @@ -552,9 +552,9 @@ kern_getitimer(struct thread *td, u_int timevalsub(&aitv->it_value, &ctv); } } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); *aitv = p->p_stats->p_timer[which]; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } return (0); } @@ -623,10 +623,10 @@ kern_setitimer(struct thread *td, u_int timevalsub(&oitv->it_value, &ctv); } } else { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); *oitv = p->p_stats->p_timer[which]; p->p_stats->p_timer[which] = *aitv; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } return (0); } Index: kern/kern_umtx.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/kern_umtx.c,v retrieving revision 1.59 diff -u -p -r1.59 kern_umtx.c --- kern/kern_umtx.c 5 Mar 2007 13:10:57 -0000 1.59 +++ 
kern/kern_umtx.c 18 May 2007 10:37:02 -0000 @@ -124,8 +124,8 @@ struct umtx_q { /* * Blocked on PI mutex. read can use chain lock - * or sched_lock, write must have both chain lock and - * sched_lock being hold. + * or umtx_lock, write must have both chain lock and + * umtx_lock being hold. */ struct umtx_pi *uq_pi_blocked; @@ -225,6 +225,8 @@ static void umtx_exec_hook(void *arg __u struct image_params *imgp __unused); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); +static struct mtx umtx_lock; + static void umtxq_sysinit(void *arg __unused) { @@ -240,6 +242,7 @@ umtxq_sysinit(void *arg __unused) umtxq_chains[i].uc_busy = 0; umtxq_chains[i].uc_waiters = 0; } + mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN); EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL, EVENTHANDLER_PRI_ANY); } @@ -1270,7 +1273,7 @@ umtx_pi_adjust_thread(struct umtx_pi *pi struct umtx_q *uq, *uq1, *uq2; struct thread *td1; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); @@ -1316,7 +1319,7 @@ umtx_propagate_priority(struct thread *t struct umtx_pi *pi; int pri; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; @@ -1334,7 +1337,9 @@ umtx_propagate_priority(struct thread *t if (UPRI(td) <= pri) return; + thread_lock(td); sched_lend_user_prio(td, pri); + thread_unlock(td); /* * Pick up the lock that td is blocked on. @@ -1358,7 +1363,7 @@ umtx_unpropagate_priority(struct umtx_pi struct umtx_pi *pi2; int pri; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; @@ -1374,7 +1379,9 @@ umtx_unpropagate_priority(struct umtx_pi if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; + thread_lock(pi->pi_owner); sched_unlend_user_prio(pi->pi_owner, pri); + thread_unlock(pi->pi_owner); pi = uq_owner->uq_pi_blocked; } } @@ -1388,7 +1395,7 @@ umtx_pi_setowner(struct umtx_pi *pi, str struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); if (pi->pi_owner != NULL) panic("pi_ower != NULL"); pi->pi_owner = owner; @@ -1404,9 +1411,9 @@ umtx_pi_claim(struct umtx_pi *pi, struct struct umtx_q *uq, *uq_owner; uq_owner = owner->td_umtxq; - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (pi->pi_owner == owner) { - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); return (0); } @@ -1414,7 +1421,7 @@ umtx_pi_claim(struct umtx_pi *pi, struct /* * userland may have already messed the mutex, sigh. 
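/*
 * A userland sketch of the chain walk umtx_propagate_priority() performs
 * above (lower numbers are better priorities, as in the kernel): a waiter
 * lends its priority to the owner of the PI mutex it blocks on, and keeps
 * walking while owners are themselves blocked.  The structures here are
 * illustrative; the real code walks struct umtx_pi / struct umtx_q under
 * umtx_lock and calls sched_lend_user_prio() with the owner's thread lock
 * held.
 */
#include <stdio.h>

struct xpi;				/* a PI-aware userland mutex */
struct xtd {				/* a thread */
	int		pri;		/* current (possibly lent) priority */
	struct xpi	*blocked_on;	/* PI mutex this thread sleeps on */
};
struct xpi {
	struct xtd	*owner;
};

static void
propagate(struct xtd *waiter)
{
	int pri = waiter->pri;
	struct xpi *pi = waiter->blocked_on;

	while (pi != NULL && pi->owner != NULL) {
		struct xtd *owner = pi->owner;

		if (owner->pri <= pri)	/* owner already runs at least this well */
			break;
		owner->pri = pri;	/* sched_lend_user_prio(owner, pri) */
		pi = owner->blocked_on;	/* keep going if the owner is blocked too */
	}
}

int
main(void)
{
	struct xtd c = { 200, NULL };	/* owns pi1, not blocked */
	struct xpi pi1 = { &c };
	struct xtd b = { 180, &pi1 };	/* owns pi0, blocked on pi1 */
	struct xpi pi0 = { &b };
	struct xtd a = { 140, &pi0 };	/* high-priority waiter */

	propagate(&a);
	printf("b.pri=%d c.pri=%d\n", b.pri, c.pri);	/* both lent 140 */
	return (0);
}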
*/ - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); return (EPERM); } umtx_pi_setowner(pi, owner); @@ -1423,10 +1430,12 @@ umtx_pi_claim(struct umtx_pi *pi, struct int pri; pri = UPRI(uq->uq_thread); + thread_lock(owner); if (pri < UPRI(owner)) sched_lend_user_prio(owner, pri); + thread_unlock(owner); } - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); return (0); } @@ -1442,7 +1451,7 @@ umtx_pi_adjust(struct thread *td, u_char uq = td->td_umtxq; - mtx_assert(&sched_lock, MA_OWNED); + mtx_assert(&umtx_lock, MA_OWNED); MPASS(TD_ON_UPILOCK(td)); /* @@ -1493,14 +1502,14 @@ umtxq_sleep_pi(struct umtx_q *uq, struct */ PROC_LOCK(curproc); td1 = thread_find(curproc, owner); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (td1 != NULL && pi->pi_owner == NULL) { uq1 = td1->td_umtxq; umtx_pi_setowner(pi, td1); } PROC_UNLOCK(curproc); } else { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); } TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { @@ -1516,12 +1525,12 @@ umtxq_sleep_pi(struct umtx_q *uq, struct uq->uq_pi_blocked = pi; td->td_flags |= TDF_UPIBLOCKED; - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); umtxq_unlock(&uq->uq_key); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); umtx_propagate_priority(td); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); umtxq_lock(&uq->uq_key); if (uq->uq_flags & UQF_UMTXQ) { @@ -1536,12 +1545,12 @@ umtxq_sleep_pi(struct umtx_q *uq, struct } umtxq_unlock(&uq->uq_key); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_pi_blocked = NULL; td->td_flags &= ~TDF_UPIBLOCKED; TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); umtx_unpropagate_priority(pi); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); umtxq_lock(&uq->uq_key); @@ -1575,7 +1584,7 @@ umtx_pi_unref(struct umtx_pi *pi) UMTXQ_LOCKED_ASSERT(uc); KASSERT(pi->pi_refcount > 0, ("invalid reference count")); if (--pi->pi_refcount == 0) { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (pi->pi_owner != NULL) { TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); @@ -1583,7 +1592,7 @@ umtx_pi_unref(struct umtx_pi *pi) } KASSERT(TAILQ_EMPTY(&pi->pi_blocked), ("blocked queue not empty")); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink); free = 1; } @@ -1822,7 +1831,7 @@ do_unlock_pi(struct thread *td, struct u return (EPERM); } uq_me = curthread->td_umtxq; - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); pi->pi_owner = NULL; TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link); uq_first = TAILQ_FIRST(&pi->pi_blocked); @@ -1834,8 +1843,10 @@ do_unlock_pi(struct thread *td, struct u pri = UPRI(uq_first2->uq_thread); } } + thread_lock(curthread); sched_unlend_user_prio(curthread, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); + mtx_unlock_spin(&umtx_lock); } umtxq_unlock(&key); @@ -1891,18 +1902,20 @@ _do_lock_pp(struct thread *td, struct um goto out; } - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; + thread_lock(td); if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, 
id | UMUTEX_CONTESTED); @@ -1943,7 +1956,7 @@ _do_lock_pp(struct thread *td, struct um umtxq_remove(uq); umtxq_unlock(&uq->uq_key); - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { @@ -1955,12 +1968,14 @@ _do_lock_pp(struct thread *td, struct um } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; + thread_lock(td); sched_unlend_user_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + mtx_unlock_spin(&umtx_lock); } if (error != 0) { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { @@ -1972,8 +1987,10 @@ _do_lock_pp(struct thread *td, struct um } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; + thread_lock(td); sched_unlend_user_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + mtx_unlock_spin(&umtx_lock); } out: @@ -2048,7 +2065,7 @@ do_unlock_pp(struct thread *td, struct u if (error == -1) error = EFAULT; else { - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; @@ -2061,8 +2078,10 @@ do_unlock_pp(struct thread *td, struct u } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; + thread_lock(td); sched_unlend_user_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + mtx_unlock_spin(&umtx_lock); } umtx_key_release(&key); return (error); @@ -2748,12 +2767,12 @@ umtx_thread_cleanup(struct thread *td) if ((uq = td->td_umtxq) == NULL) return; - mtx_lock_spin(&sched_lock); + mtx_lock_spin(&umtx_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } td->td_flags &= ~TDF_UBORROWING; - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&umtx_lock); } Index: kern/ksched.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/ksched.c,v retrieving revision 1.35 diff -u -p -r1.35 ksched.c --- kern/ksched.c 6 Dec 2006 06:34:55 -0000 1.35 +++ kern/ksched.c 18 May 2007 10:37:02 -0000 @@ -104,9 +104,7 @@ getscheduler(struct ksched *ksched, stru struct rtprio rtp; int e = 0; - mtx_lock_spin(&sched_lock); pri_to_rtp(td, &rtp); - mtx_unlock_spin(&sched_lock); switch (rtp.type) { case RTP_PRIO_FIFO: @@ -151,9 +149,7 @@ ksched_getparam(struct ksched *ksched, { struct rtprio rtp; - mtx_lock_spin(&sched_lock); pri_to_rtp(td, &rtp); - mtx_unlock_spin(&sched_lock); if (RTP_PRIO_IS_REALTIME(rtp.type)) param->sched_priority = rtpprio_to_p4prio(rtp.prio); @@ -186,9 +182,7 @@ ksched_setscheduler(struct ksched *ksche rtp.type = (policy == SCHED_FIFO) ? 
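/*
 * A sketch of how _do_lock_pp()/do_unlock_pp() above recompute the priority
 * to "unlend" back to: start from PRI_MAX, take the best (numerically
 * lowest) priority among the first waiter of every priority-protected mutex
 * the thread still holds, and never drop below the inherited ceiling
 * (PRI_MIN_REALTIME + ceiling).  The constants and flat arrays are
 * illustrative, not the kernel's data structures.
 */
#include <stdio.h>

#define PRI_MAX			255
#define PRI_MIN_REALTIME	128	/* illustrative value */

static int
recompute_unlend(const int *first_waiter_pri, int nheld, int inherited_pri)
{
	int pri = PRI_MAX;
	int i;

	for (i = 0; i < nheld; i++)	/* TAILQ_FOREACH over uq_pi_contested */
		if (first_waiter_pri[i] < pri)
			pri = first_waiter_pri[i];
	if (pri > inherited_pri)	/* never below the ceiling we took */
		pri = inherited_pri;
	return (pri);
}

int
main(void)
{
	int waiters[2] = { 190, 170 };	/* best waiter on each held PP mutex */
	int ceiling = 40;
	int inherited = PRI_MIN_REALTIME + ceiling;	/* as in _do_lock_pp() */

	printf("unlend to %d\n", recompute_unlend(waiters, 2, inherited));
	return (0);
}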
RTP_PRIO_FIFO : RTP_PRIO_REALTIME; - mtx_lock_spin(&sched_lock); rtp_to_pri(&rtp, td); - mtx_unlock_spin(&sched_lock); } else e = EPERM; @@ -200,9 +194,7 @@ ksched_setscheduler(struct ksched *ksche { rtp.type = RTP_PRIO_NORMAL; rtp.prio = p4prio_to_rtpprio(param->sched_priority); - mtx_lock_spin(&sched_lock); rtp_to_pri(&rtp, td); - mtx_unlock_spin(&sched_lock); } break; Index: kern/sched_4bsd.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sched_4bsd.c,v retrieving revision 1.97 diff -u -p -r1.97 sched_4bsd.c --- kern/sched_4bsd.c 27 Feb 2007 17:23:27 -0000 1.97 +++ kern/sched_4bsd.c 31 May 2007 22:17:55 -0000 @@ -248,7 +248,7 @@ static void maybe_resched(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } @@ -377,10 +377,7 @@ schedcpu(void) realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { - /* - * Prevent state changes and protect run queue. - */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * Increment time in/out of memory. We ignore overflow; with * 16-bit int's (remember them?) overflow takes 45 days. @@ -388,6 +385,7 @@ schedcpu(void) p->p_swtime++; FOREACH_THREAD_IN_PROC(p, td) { awake = 0; + thread_lock(td); ts = td->td_sched; /* * Increment sleep time (if sleeping). We @@ -456,13 +454,16 @@ XXX this is broken td->td_slptime = 0; } else td->td_slptime++; - if (td->td_slptime > 1) + if (td->td_slptime > 1) { + thread_unlock(td); continue; + } td->td_estcpu = decay_cpu(loadfac, td->td_estcpu); resetpriority(td); resetpriority_thread(td); + thread_unlock(td); } /* end of thread loop */ - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } /* end of process loop */ sx_sunlock(&allproc_lock); } @@ -575,6 +576,7 @@ schedinit(void) */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; + thread0.td_lock = &sched_lock; td_sched0.ts_thread = &thread0; } @@ -615,7 +617,7 @@ sched_clock(struct thread *td) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; ts->ts_cpticks++; @@ -635,22 +637,23 @@ sched_exit(struct proc *p, struct thread CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", td, td->td_proc->p_comm, td->td_priority); - + PROC_SLOCK_ASSERT(p, MA_OWNED); sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); } void sched_exit_thread(struct thread *td, struct thread *child) { - struct proc *childproc = child->td_proc; CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", - child, childproc->p_comm, child->td_priority); + child, child->td_proc->p_comm, child->td_priority); + thread_lock(td); td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu); - childproc->p_estcpu = ESTCPULIM(childproc->p_estcpu + - child->td_estcpu); + thread_unlock(td); + mtx_lock_spin(&sched_lock); if ((child->td_proc->p_flag & P_NOLOAD) == 0) sched_load_rem(); + mtx_unlock_spin(&sched_lock); } void @@ -663,6 +666,7 @@ void sched_fork_thread(struct thread *td, struct thread *childtd) { childtd->td_estcpu = td->td_estcpu; + childtd->td_lock = &sched_lock; sched_newthread(childtd); } @@ -672,18 +676,20 @@ sched_nice(struct proc *p, int nice) struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); resetpriority(td); resetpriority_thread(td); + thread_unlock(td); } } void sched_class(struct 
thread *td, int class) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_pri_class = class; } @@ -697,7 +703,7 @@ sched_priority(struct thread *td, u_char td, td->td_proc->p_comm, td->td_priority, prio, curthread, curthread->td_proc->p_comm); - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; td->td_priority = prio; @@ -818,7 +824,7 @@ void sched_sleep(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptime = 0; } @@ -831,26 +837,18 @@ sched_switch(struct thread *td, struct t ts = td->td_sched; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); + /* + * Switch to the sched lock to fix things up and pick + * a new thread. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_unlock(td); + } if ((p->p_flag & P_NOLOAD) == 0) sched_load_rem(); -#if 0 - /* - * We are volunteering to switch out so we get to nominate - * a successor for the rest of our quantum - * First try another thread in our process - * - * this is too expensive to do without per process run queues - * so skip it for now. - * XXX keep this comment as a marker. - */ - if (sched_followon && - (p->p_flag & P_HADTHREADS) && - (flags & SW_VOL) && - newtd == NULL) - newtd = mumble(); -#endif if (newtd) newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED); @@ -896,6 +894,7 @@ sched_switch(struct thread *td, struct t } else { newtd = choosethread(); } + MPASS(newtd->td_lock == &sched_lock); if (td != newtd) { #ifdef HWPMC_HOOKS @@ -904,7 +903,7 @@ sched_switch(struct thread *td, struct t #endif /* I feel sleepy */ - cpu_switch(td, newtd); + cpu_switch(td, newtd, td->td_lock); /* * Where am I? What year is it? * We are in the same thread that went to sleep above, @@ -932,12 +931,13 @@ sched_switch(struct thread *td, struct t #endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); + MPASS(td->td_lock == &sched_lock); } void sched_wakeup(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_slptime > 1) { updatepri(td); resetpriority(td); @@ -1079,7 +1079,7 @@ sched_add(struct thread *td, int flags) int single_cpu = 0; ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), @@ -1089,6 +1089,14 @@ sched_add(struct thread *td, int flags) CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. 
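/*
 * A userland sketch of the td_lock hand-off done in sched_add() above: the
 * caller already holds the thread's current lock, acquires the scheduler
 * lock, points td_lock at it, and releases the old lock.  pthread mutexes
 * stand in for spin mutexes, and lock_set() mimics what thread_lock_set()
 * is assumed to do here; it is not the kernel implementation.
 */
#include <pthread.h>
#include <stdio.h>

struct xthread {
	pthread_mutex_t	*lock;	/* td_lock: may point at different mutexes */
};

static pthread_mutex_t sleepq_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t sched_lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller holds both *td->lock and *nlock; afterwards only *nlock is held. */
static void
lock_set(struct xthread *td, pthread_mutex_t *nlock)
{
	pthread_mutex_t *old = td->lock;

	td->lock = nlock;
	pthread_mutex_unlock(old);
}

int
main(void)
{
	struct xthread td = { &sleepq_lock };

	pthread_mutex_lock(td.lock);	/* thread_lock(td): some other lock */
	if (td.lock != &sched_lock) {
		pthread_mutex_lock(&sched_lock);	/* mtx_lock_spin(&sched_lock) */
		lock_set(&td, &sched_lock);		/* thread_lock_set(td, &sched_lock) */
	}
	/* ...put the thread on the run queue... */
	pthread_mutex_unlock(td.lock);	/* thread_unlock(td): drops sched_lock */
	printf("td.lock now %s\n", td.lock == &sched_lock ? "sched_lock" : "other");
	return (0);
}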
+ */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } TD_SET_RUNQ(td); if (td->td_pinned != 0) { @@ -1140,7 +1148,7 @@ sched_add(struct thread *td, int flags) { struct td_sched *ts; ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), @@ -1150,6 +1158,14 @@ sched_add(struct thread *td, int flags) CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } TD_SET_RUNQ(td); CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td); ts->ts_runq = &runq; @@ -1207,6 +1223,7 @@ sched_choose(void) struct td_sched *ts; struct runq *rq; + mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP struct td_sched *kecpu; @@ -1256,10 +1273,10 @@ sched_userret(struct thread *td) KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } @@ -1268,7 +1285,7 @@ sched_bind(struct thread *td, int cpu) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("sched_bind: cannot bind non-running thread")); @@ -1287,25 +1304,26 @@ sched_bind(struct thread *td, int cpu) void sched_unbind(struct thread* td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_sched->ts_flags &= ~TSF_BOUND; } int sched_is_bound(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_sched->ts_flags & TSF_BOUND); } void sched_relinquish(struct thread *td) { - mtx_lock_spin(&sched_lock); + thread_lock(td); if (td->td_pri_class == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); + SCHED_STAT_INC(switch_relinquish); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } int @@ -1363,5 +1381,57 @@ sched_idletd(void *dummy) } } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. 
+ */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. + */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + #define KERN_SWITCH_INCLUDE 1 #include "kern/kern_switch.c" Index: kern/sched_core.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sched_core.c,v retrieving revision 1.13 diff -u -p -r1.13 sched_core.c --- kern/sched_core.c 8 Mar 2007 06:44:33 -0000 1.13 +++ kern/sched_core.c 31 May 2007 23:38:26 -0000 @@ -784,6 +784,7 @@ schedinit(void) */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &kse0; + thread0.td_lock = &sched_lock; kse0.ts_thread = &thread0; kse0.ts_slice = 100; } @@ -1018,7 +1019,7 @@ sched_switch(struct thread *td, struct t if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif - cpu_switch(td, newtd); + cpu_switch(td, newtd, td->td_lock); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); @@ -1110,6 +1111,7 @@ sched_fork_thread(struct thread *td, str ts = td->td_sched; ts2 = child->td_sched; + child->td_lock = td->td_lock; ts2->ts_slptime = ts2->ts_slptime * CHILD_WEIGHT / 100; if (child->td_pri_class == PRI_TIMESHARE) sched_user_prio(child, sched_calc_pri(ts2)); @@ -1142,7 +1144,8 @@ sched_class(struct thread *td, int class void sched_exit(struct proc *p, struct thread *childtd) { - mtx_assert(&sched_lock, MA_OWNED); + + PROC_SLOCK_ASSERT(p, MA_OWNED); sched_exit_thread(FIRST_THREAD_IN_PROC(p), childtd); } @@ -1747,5 +1750,57 @@ sched_idletd(void *dummy) } } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. 
+ */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. + */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + #define KERN_SWITCH_INCLUDE 1 #include "kern/kern_switch.c" Index: kern/sched_smp.c =================================================================== RCS file: kern/sched_smp.c diff -N kern/sched_smp.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ kern/sched_smp.c 31 May 2007 22:18:36 -0000 @@ -0,0 +1,2320 @@ +/*- + * Copyright (c) 2002-2007, Jeffrey Roberson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.192 2007/04/20 05:45:46 kmacy Exp $"); + +#include "opt_hwpmc_hooks.h" +#include "opt_sched.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#include +#endif + +#ifdef HWPMC_HOOKS +#include +#endif + +#include +#include + +#ifndef PREEMPTION +#error "SCHED_ULE requires options PREEMPTION" +#endif + +/* + * TODO: + * Pick idle from affinity group or self group first. + * Implement pick_score. + */ + +#define KTR_ULE KTR_SCHED /* Enable for pickpri debugging. */ + +/* + * Thread scheduler specific section. + */ +struct td_sched { + TAILQ_ENTRY(td_sched) ts_procq; /* (j/z) Run queue. */ + int ts_flags; /* (j) TSF_* flags. 
*/ + struct thread *ts_thread; /* (*) Active associated thread. */ + u_char ts_rqindex; /* (j) Run queue index. */ + int ts_slptime; + int ts_slice; + struct runq *ts_runq; + u_char ts_cpu; /* CPU that we have affinity for. */ + /* The following variables are only used for pctcpu calculation */ + int ts_ltick; /* Last tick that we were running on */ + int ts_ftick; /* First tick that we were running on */ + int ts_ticks; /* Tick count */ +#ifdef SMP + int ts_rltick; /* Real last tick, for affinity. */ +#endif + + /* originally from kg_sched */ + u_int skg_slptime; /* Number of ticks we vol. slept */ + u_int skg_runtime; /* Number of ticks we were running */ +}; +/* flags kept in ts_flags */ +#define TSF_BOUND 0x0001 /* Thread can not migrate. */ +#define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ + +static struct td_sched td_sched0; + +/* + * Cpu percentage computation macros and defines. + * + * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. + * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. + * SCHED_TICK_MAX: Maximum number of ticks before scaling back. + * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. + * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. + * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. + */ +#define SCHED_TICK_SECS 10 +#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) +#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) +#define SCHED_TICK_SHIFT 10 +#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) +#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) + +/* + * These macros determine priorities for non-interactive threads. They are + * assigned a priority based on their recent cpu utilization as expressed + * by the ratio of ticks to the tick total. NHALF priorities at the start + * and end of the MIN to MAX timeshare range are only reachable with negative + * or positive nice respectively. + * + * PRI_RANGE: Priority range for utilization dependent priorities. + * PRI_NRESV: Number of nice values. + * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. + * PRI_NICE: Determines the part of the priority inherited from nice. + */ +#define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) +#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) +#define SCHED_PRI_MIN (PRI_MIN_TIMESHARE + SCHED_PRI_NHALF) +#define SCHED_PRI_MAX (PRI_MAX_TIMESHARE - SCHED_PRI_NHALF) +#define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) +#define SCHED_PRI_TICKS(ts) \ + (SCHED_TICK_HZ((ts)) / \ + (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) +#define SCHED_PRI_NICE(nice) (nice) + +/* + * These determine the interactivity of a process. Interactivity differs from + * cpu utilization in that it expresses the voluntary time slept vs time ran + * while cpu utilization includes all time not running. This more accurately + * models the intent of the thread. + * + * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate + * before throttling back. + * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. + * INTERACT_MAX: Maximum interactivity value. Smaller is better. + * INTERACT_THRESH: Threshhold for placement on the current runq. 
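/*
 * A standalone walk-through of the SCHED_PRI_TICKS arithmetic defined above:
 * recent cpu use (ticks run vs. total wall ticks) is scaled into the
 * timeshare priority range and added to SCHED_PRI_MIN plus the nice value.
 * The PRI_*_TIMESHARE and PRIO_* numbers below are illustrative stand-ins
 * for the <sys/priority.h> and <sys/resource.h> values.
 */
#include <stdio.h>

#define roundup(x, y)		((((x) + ((y) - 1)) / (y)) * (y))
#define max(a, b)		((a) > (b) ? (a) : (b))

#define PRIO_MIN		-20
#define PRIO_MAX		20
#define PRI_MIN_TIMESHARE	160	/* illustrative */
#define PRI_MAX_TIMESHARE	223	/* illustrative */

#define SCHED_TICK_SHIFT	10
#define SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
#define SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
#define SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
#define SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)

int
main(void)
{
	int hz = 1000;
	int ticks = 250 << SCHED_TICK_SHIFT;	/* ts_ticks: ran 250 of the last... */
	int total = max(1000, hz);		/* ...1000 hz ticks (ts_ltick - ts_ftick) */
	int p_nice = 0;
	int tick_hz = ticks >> SCHED_TICK_SHIFT;	/* SCHED_TICK_HZ() */
	int pri_ticks = tick_hz /			/* SCHED_PRI_TICKS() */
	    (roundup(total, SCHED_PRI_RANGE) / SCHED_PRI_RANGE);
	int pri = SCHED_PRI_MIN + pri_ticks + p_nice;	/* as in sched_priority() */

	printf("25%% cpu -> priority %d (range %d..%d)\n",
	    pri, SCHED_PRI_MIN, SCHED_PRI_MAX);
	return (0);
}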
+ */ +#define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) +#define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) +#define SCHED_INTERACT_MAX (100) +#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) +#define SCHED_INTERACT_THRESH (30) + +/* + * tickincr: Converts a stathz tick into a hz domain scaled by + * the shift factor. Without the shift the error rate + * due to rounding would be unacceptably high. + * realstathz: stathz is sometimes 0 and run off of hz. + * sched_slice: Runtime of each thread before rescheduling. + */ +static int sched_interact = SCHED_INTERACT_THRESH; +static int realstathz; +static int tickincr; +static int sched_slice; + +/* + * tdq - per processor runqs and statistics. + */ +struct tdq { + struct mtx tdq_lock; + struct runq tdq_idle; /* Queue of IDLE threads. */ + struct runq tdq_timeshare; /* timeshare run queue. */ + struct runq tdq_realtime; /* real-time run queue. */ + u_char tdq_idx; /* Current insert index. */ + u_char tdq_ridx; /* Current removal index. */ + short tdq_flags; /* Thread queue flags */ + int tdq_load; /* Aggregate load. */ +#ifdef SMP + int tdq_transferable; + LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */ + struct tdq_group *tdq_group; /* Our processor group. */ +#else + int tdq_sysload; /* For loadavg, !ITHD load. */ +#endif + char tdq_name[16]; /* lock name */ +}; + +#define TDQF_BUSY 0x0001 /* Queue is marked as busy */ + +#ifdef SMP +/* + * tdq groups are groups of processors which can cheaply share threads. When + * one processor in the group goes idle it will check the runqs of the other + * processors in its group prior to halting and waiting for an interrupt. + * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. + * In a numa environment we'd want an idle bitmap per group and a two tiered + * load balancer. + */ +struct tdq_group { + int tdg_cpus; /* Count of CPUs in this tdq group. */ + cpumask_t tdg_cpumask; /* Mask of cpus in this group. */ + cpumask_t tdg_idlemask; /* Idle cpus in this group. */ + cpumask_t tdg_mask; /* Bit mask for first cpu. */ + int tdg_load; /* Total load of this group. */ + int tdg_transferable; /* Transferable load of this group. */ + LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */ +}; + +#define SCHED_AFFINITY_DEFAULT (hz / 100) +#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity) + +/* + * Run-time tunables. + */ +static int rebalance = 0; +static int pick_pri = 0; +static int affinity; +static int tryself = 1; +static int tryselfidle = 1; +static int ipi_ast = 0; +static int ipi_preempt = 1; +static int ipi_thresh = PRI_MIN_KERN; +static int steal_htt = 0; +static int steal_busy = 0; +static int busy_thresh = 4; +static int topology = 0; + +/* + * One thread queue per processor. 
+ */ +static volatile cpumask_t tdq_idle; +static volatile cpumask_t tdq_busy; +static int tdg_maxid; +static struct tdq tdq_cpu[MAXCPU]; +static struct tdq_group tdq_groups[MAXCPU]; +static int bal_tick; +static int gbal_tick; +static int balance_groups; + +#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) +#define TDQ_CPU(x) (&tdq_cpu[(x)]) +#define TDQ_ID(x) ((x) - tdq_cpu) +#define TDQ_GROUP(x) (&tdq_groups[(x)]) +#else /* !SMP */ +static struct tdq tdq_cpu; + +#define TDQ_SELF() (&tdq_cpu) +#define TDQ_CPU(x) (&tdq_cpu) +#endif + +#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) +#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) +#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) +#if 0 +#define TDQ_LOCKPTR(t) (&sched_lock) +#else +#define TDQ_LOCKPTR(t) (&(t)->tdq_lock) +#endif + +static void sched_priority(struct thread *); +static void sched_thread_priority(struct thread *, u_char); +static int sched_interact_score(struct thread *); +static void sched_interact_update(struct thread *); +static void sched_interact_fork(struct thread *); +static void sched_pctcpu_update(struct td_sched *); +static inline void sched_pin_td(struct thread *td); +static inline void sched_unpin_td(struct thread *td); + +/* Operations on per processor queues */ +static struct td_sched * tdq_choose(struct tdq *); +static void tdq_setup(struct tdq *); +static void tdq_load_add(struct tdq *, struct td_sched *); +static void tdq_load_rem(struct tdq *, struct td_sched *); +static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int); +static __inline void tdq_runq_rem(struct tdq *, struct td_sched *); +void tdq_print(int cpu); +static void runq_print(struct runq *rq); +#ifdef SMP +static struct tdq *tdq_pickidle(struct td_sched *); +static struct tdq *tdq_pickpri(struct td_sched *, int); +static struct td_sched *runq_steal(struct runq *); +static void sched_balance(void); +static void sched_balance_groups(void); +static void sched_balance_group(struct tdq_group *); +static void sched_balance_pair(struct tdq *, struct tdq *); +static void sched_smp_tick(struct thread *); +static void tdq_move(struct tdq *, int); +static int tdq_idled(struct tdq *); +static void tdq_notify(struct td_sched *); +static struct td_sched *tdq_steal(struct tdq *, int); + +#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) +#endif + +static void sched_setup(void *dummy); +SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) + +static void sched_initticks(void *dummy); +SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL) + +static inline void +sched_pin_td(struct thread *td) +{ + td->td_pinned++; +} + +static inline void +sched_unpin_td(struct thread *td) +{ + td->td_pinned--; +} + +static void +runq_print(struct runq *rq) +{ + struct rqhead *rqh; + struct td_sched *ts; + int pri; + int j; + int i; + + for (i = 0; i < RQB_LEN; i++) { + printf("\t\trunq bits %d 0x%zx\n", + i, rq->rq_status.rqb_bits[i]); + for (j = 0; j < RQB_BPW; j++) + if (rq->rq_status.rqb_bits[i] & (1ul << j)) { + pri = j + (i << RQB_L2BPW); + rqh = &rq->rq_queues[pri]; + TAILQ_FOREACH(ts, rqh, ts_procq) { + printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", + ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri); + } + } + } +} + +void +tdq_print(int cpu) +{ + struct tdq *tdq; + + tdq = TDQ_CPU(cpu); + + printf("tdq:\n"); + printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq)); + printf("\tlock name %s\n", tdq->tdq_name); + printf("\tload: %d\n", 
tdq->tdq_load); + printf("\ttimeshare idx: %d\n", tdq->tdq_idx); + printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); + printf("\trealtime runq:\n"); + runq_print(&tdq->tdq_realtime); + printf("\ttimeshare runq:\n"); + runq_print(&tdq->tdq_timeshare); + printf("\tidle runq:\n"); + runq_print(&tdq->tdq_idle); +#ifdef SMP + printf("\tload transferable: %d\n", tdq->tdq_transferable); +#endif +} + +static __inline void +tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) +{ + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); +#ifdef SMP + if (THREAD_CAN_MIGRATE(ts->ts_thread)) { + tdq->tdq_transferable++; + tdq->tdq_group->tdg_transferable++; + ts->ts_flags |= TSF_XFERABLE; + if (tdq->tdq_transferable >= busy_thresh && + (tdq->tdq_flags & TDQF_BUSY) == 0) { + tdq->tdq_flags |= TDQF_BUSY; + atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq)); + } + } +#endif + if (ts->ts_runq == &tdq->tdq_timeshare) { + u_char pri; + + pri = ts->ts_thread->td_priority; + KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE, + ("Invalid priority %d on timeshare runq", pri)); + /* + * This queue contains only priorities between MIN and MAX + * realtime. Use the whole queue to represent these values. + */ +#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) + if ((flags & SRQ_BORROWING) == 0) { + pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ; + pri = (pri + tdq->tdq_idx) % RQ_NQS; + /* + * This effectively shortens the queue by one so we + * can have a one slot difference between idx and + * ridx while we wait for threads to drain. + */ + if (tdq->tdq_ridx != tdq->tdq_idx && + pri == tdq->tdq_ridx) + pri = (unsigned char)(pri - 1) % RQ_NQS; + } else + pri = tdq->tdq_ridx; + runq_add_pri(ts->ts_runq, ts, pri, flags); + } else + runq_add(ts->ts_runq, ts, flags); +} + +static __inline void +tdq_runq_rem(struct tdq *tdq, struct td_sched *ts) +{ + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + KASSERT(ts->ts_runq != NULL, + ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread)); +#ifdef SMP + if (ts->ts_flags & TSF_XFERABLE) { + tdq->tdq_transferable--; + tdq->tdq_group->tdg_transferable--; + ts->ts_flags &= ~TSF_XFERABLE; + if (tdq->tdq_transferable < busy_thresh && + (tdq->tdq_flags & TDQF_BUSY)) { + atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq)); + tdq->tdq_flags &= ~TDQF_BUSY; + } + } +#endif + if (ts->ts_runq == &tdq->tdq_timeshare) { + if (tdq->tdq_idx != tdq->tdq_ridx) + runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx); + else + runq_remove_idx(ts->ts_runq, ts, NULL); + /* + * For timeshare threads we update the priority here so + * the priority reflects the time we've been sleeping. 
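/*
 * A small demonstration of the calendar-style index computed in
 * tdq_runq_add() above for timeshare threads: the priority is turned into an
 * offset from the current insertion index (tdq_idx), taken modulo the number
 * of run queues, and nudged off the current removal index (tdq_ridx) so a
 * draining queue is not refilled.  RQ_NQS and the priority bounds are
 * illustrative stand-ins for the <sys/runq.h> and <sys/priority.h> values.
 */
#include <stdio.h>

#define RQ_NQS			64	/* illustrative */
#define PRI_MIN_TIMESHARE	160	/* illustrative */
#define PRI_MAX_TIMESHARE	223	/* illustrative */
#define TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)

static unsigned char
ts_slot(unsigned char pri, unsigned char idx, unsigned char ridx)
{
	pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
	pri = (pri + idx) % RQ_NQS;
	/* Keep one slot of separation between the insert and remove indices. */
	if (ridx != idx && pri == ridx)
		pri = (unsigned char)(pri - 1) % RQ_NQS;
	return (pri);
}

int
main(void)
{
	/* Priority 185 inserted while idx=10, ridx=5: lands in slot 35. */
	printf("slot %u\n", ts_slot(185, 10, 5));
	/* The same priority when slot 35 would collide with ridx: backs off to 34. */
	printf("slot %u\n", ts_slot(185, 10, 35));
	return (0);
}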
+ */ + ts->ts_ltick = ticks; + sched_pctcpu_update(ts); + sched_priority(ts->ts_thread); + } else + runq_remove(ts->ts_runq, ts); +} + +static void +tdq_load_add(struct tdq *tdq, struct td_sched *ts) +{ + int class; + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); + class = PRI_BASE(ts->ts_thread->td_pri_class); + tdq->tdq_load++; + CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load); + if (class != PRI_ITHD && + (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) +#ifdef SMP + tdq->tdq_group->tdg_load++; +#else + tdq->tdq_sysload++; +#endif +} + +static void +tdq_load_rem(struct tdq *tdq, struct td_sched *ts) +{ + int class; + + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + class = PRI_BASE(ts->ts_thread->td_pri_class); + if (class != PRI_ITHD && + (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) +#ifdef SMP + tdq->tdq_group->tdg_load--; +#else + tdq->tdq_sysload--; +#endif + tdq->tdq_load--; + CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); + ts->ts_runq = NULL; +} + +#ifdef SMP +static void +sched_smp_tick(struct thread *td) +{ + struct tdq *tdq; + + tdq = TDQ_SELF(); + if (rebalance) { + if (ticks >= bal_tick) + sched_balance(); + if (ticks >= gbal_tick && balance_groups) + sched_balance_groups(); + } +} + +/* + * sched_balance is a simple CPU load balancing algorithm. It operates by + * finding the least loaded and most loaded cpu and equalizing their load + * by migrating some processes. + * + * Dealing only with two CPUs at a time has two advantages. Firstly, most + * installations will only have 2 cpus. Secondly, load balancing too much at + * once can have an unpleasant effect on the system. The scheduler rarely has + * enough information to make perfect decisions. So this algorithm chooses + * algorithm simplicity and more gradual effects on load in larger systems. + * + * It could be improved by considering the priorities and slices assigned to + * each task prior to balancing them. There are many pathological cases with + * any approach and so the semi random algorithm below may work as well as any. + * + */ +static void +sched_balance(void) +{ + struct tdq_group *high; + struct tdq_group *low; + struct tdq_group *tdg; + int cnt; + int i; + + bal_tick = ticks + (random() % (hz * 2)); + if (smp_started == 0) + return; + low = high = NULL; + i = random() % (tdg_maxid + 1); + for (cnt = 0; cnt <= tdg_maxid; cnt++) { + tdg = TDQ_GROUP(i); + /* + * Find the CPU with the highest load that has some + * threads to transfer. 
+ */ + if ((high == NULL || tdg->tdg_load > high->tdg_load) + && tdg->tdg_transferable) + high = tdg; + if (low == NULL || tdg->tdg_load < low->tdg_load) + low = tdg; + if (++i > tdg_maxid) + i = 0; + } + if (low != NULL && high != NULL && high != low) + sched_balance_pair(LIST_FIRST(&high->tdg_members), + LIST_FIRST(&low->tdg_members)); +} + +static void +sched_balance_groups(void) +{ + int i; + + gbal_tick = ticks + (random() % (hz * 2)); + if (smp_started) + for (i = 0; i <= tdg_maxid; i++) + sched_balance_group(TDQ_GROUP(i)); +} + +static void +sched_balance_group(struct tdq_group *tdg) +{ + struct tdq *tdq; + struct tdq *high; + struct tdq *low; + int load; + + if (tdg->tdg_transferable == 0) + return; + low = NULL; + high = NULL; + LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { + load = tdq->tdq_load; + if (high == NULL || load > high->tdq_load) + high = tdq; + if (low == NULL || load < low->tdq_load) + low = tdq; + } + if (high != NULL && low != NULL && high != low) + sched_balance_pair(high, low); +} + +static void +sched_balance_pair(struct tdq *high, struct tdq *low) +{ + int transferable; + int high_load; + int low_load; + int move; + int diff; + int i; + + /* + * If we're transfering within a group we have to use this specific + * tdq's transferable count, otherwise we can steal from other members + * of the group. + */ + if (high->tdq_group == low->tdq_group) { + transferable = high->tdq_transferable; + high_load = high->tdq_load; + low_load = low->tdq_load; + } else { + transferable = high->tdq_group->tdg_transferable; + high_load = high->tdq_group->tdg_load; + low_load = low->tdq_group->tdg_load; + } + if (transferable == 0) + return; + /* + * Determine what the imbalance is and then adjust that to how many + * threads we actually have to give up (transferable). + */ + diff = high_load - low_load; + move = diff / 2; + if (diff & 0x1) + move++; + move = min(move, transferable); + for (i = 0; i < move; i++) + tdq_move(high, TDQ_ID(low)); + return; +} + +static void +tdq_move(struct tdq *from, int cpu) +{ + struct tdq *tdq; + struct tdq *to; + struct td_sched *ts; + + tdq = from; + to = TDQ_CPU(cpu); + ts = tdq_steal(tdq, 1); + if (ts == NULL) { + struct tdq_group *tdg; + + tdg = tdq->tdq_group; + LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { + if (tdq == from || tdq->tdq_transferable == 0) + continue; + ts = tdq_steal(tdq, 1); + break; + } + if (ts == NULL) + panic("tdq_move: No threads available with a " + "transferable count of %d\n", + tdg->tdg_transferable); + } + if (tdq == to) + return; + sched_rem(ts->ts_thread); + ts->ts_cpu = cpu; + sched_pin_td(ts->ts_thread); + sched_add(ts->ts_thread, SRQ_YIELDING); + sched_unpin_td(ts->ts_thread); +} + +static int +tdq_idled(struct tdq *tdq) +{ + struct tdq_group *tdg; + struct tdq *steal; + struct td_sched *ts; + + spinlock_enter(); + tdg = tdq->tdq_group; + /* + * If we're in a cpu group, try and steal threads from another cpu in + * the group before idling. 
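/*
 * The arithmetic sched_balance_pair() above uses to decide how many threads
 * to migrate: half the load imbalance, rounded up, capped by how many
 * threads are actually transferable.  Standalone and illustrative only.
 */
#include <stdio.h>

#define min(a, b)	((a) < (b) ? (a) : (b))

static int
balance_move(int high_load, int low_load, int transferable)
{
	int diff = high_load - low_load;
	int move = diff / 2;

	if (diff & 0x1)		/* odd imbalance: round up */
		move++;
	return (min(move, transferable));
}

int
main(void)
{
	printf("%d\n", balance_move(7, 2, 4));	/* imbalance 5 -> move 3 */
	printf("%d\n", balance_move(7, 2, 2));	/* but only 2 are transferable */
	return (0);
}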
+ */ + if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) { + TDQ_UNLOCK(tdq); + LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) { + if (steal == tdq || steal->tdq_transferable == 0) + continue; + TDQ_LOCK(steal); + ts = tdq_steal(steal, 0); + if (ts) + goto steal; + TDQ_UNLOCK(steal); + } + TDQ_LOCK(tdq); + } + if (steal_busy && tdq_busy) { + TDQ_UNLOCK(tdq); + while (tdq_busy) { + int cpu; + + cpu = ffs(tdq_busy); + if (cpu == 0) + break; + cpu--; + steal = TDQ_CPU(cpu); + TDQ_LOCK(steal); + if (steal->tdq_transferable == 0) { + TDQ_UNLOCK(steal); + continue; + } + ts = tdq_steal(steal, 1); + if (ts == NULL) { + TDQ_UNLOCK(steal); + continue; + } + CTR5(KTR_ULE, + "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X", + ts->ts_thread, ts->ts_thread->td_proc->p_comm, + ts->ts_thread->td_priority, cpu, tdq_busy); + goto steal; + } + TDQ_LOCK(tdq); + } + spinlock_exit(); + /* + * We only set the idled bit when all of the cpus in the group are + * idle. Otherwise we could get into a situation where a thread bounces + * back and forth between two idle cores on seperate physical CPUs. + */ + tdg->tdg_idlemask |= PCPU_GET(cpumask); + if (tdg->tdg_idlemask == tdg->tdg_cpumask) + atomic_set_int(&tdq_idle, tdg->tdg_mask); + return (1); +steal: + sched_rem(ts->ts_thread); + thread_lock_block(ts->ts_thread); + TDQ_LOCK(tdq); + thread_lock_unblock(ts->ts_thread, TDQ_LOCKPTR(tdq)); + spinlock_exit(); + ts->ts_cpu = PCPU_GET(cpuid); + sched_pin_td(ts->ts_thread); + sched_add(ts->ts_thread, SRQ_YIELDING); + sched_unpin_td(ts->ts_thread); + + return (0); +} + +static void +tdq_notify(struct td_sched *ts) +{ + struct thread *ctd; + struct pcpu *pcpu; + int cpri; + int pri; + int cpu; + + cpu = ts->ts_cpu; + pri = ts->ts_thread->td_priority; + pcpu = pcpu_find(cpu); + ctd = pcpu->pc_curthread; + cpri = ctd->td_priority; + + /* + * If our priority is not better than the current priority there is + * nothing to do. + */ + if (pri > cpri) + return; + /* + * Always IPI idle. + */ + if (cpri > PRI_MIN_IDLE) + goto sendipi; + /* + * If we're realtime or better and there is timeshare or worse running + * send an IPI. + */ + if (pri < PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME) + goto sendipi; + /* + * Otherwise only IPI if we exceed the threshold. + */ + if (pri > ipi_thresh) + return; +sendipi: + ctd->td_flags |= TDF_NEEDRESCHED; + if (cpri < PRI_MIN_IDLE) { + if (ipi_ast) + ipi_selected(1 << cpu, IPI_AST); + else if (ipi_preempt) + ipi_selected(1 << cpu, IPI_PREEMPT); + } else + ipi_selected(1 << cpu, IPI_PREEMPT); +} + +static struct td_sched * +runq_steal(struct runq *rq) +{ + struct rqhead *rqh; + struct rqbits *rqb; + struct td_sched *ts; + int word; + int bit; + + rqb = &rq->rq_status; + for (word = 0; word < RQB_LEN; word++) { + if (rqb->rqb_bits[word] == 0) + continue; + for (bit = 0; bit < RQB_BPW; bit++) { + if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) + continue; + rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; + TAILQ_FOREACH(ts, rqh, ts_procq) { + if (THREAD_CAN_MIGRATE(ts->ts_thread)) + return (ts); + } + } + } + return (NULL); +} + +static struct td_sched * +tdq_steal(struct tdq *tdq, int stealidle) +{ + struct td_sched *ts; + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + /* + * Steal from next first to try to get a non-interactive task that + * may not have run for a while. + * XXX Need to effect steal order for timeshare threads. 
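/*
 * A sketch of the status-bitmap scan that runq_steal() above performs: each
 * set bit marks a non-empty run queue, and walking words then bits visits
 * candidates from the lowest (best) priority index upward.  The RQB_* sizes
 * here are illustrative stand-ins for <sys/runq.h>.
 */
#include <stdio.h>

#define RQB_LEN		2	/* illustrative: 2 * 64 = 128 queues */
#define RQB_BPW		64
#define RQB_L2BPW	6

static int
first_nonempty(const unsigned long long *bits)
{
	int word, bit;

	for (word = 0; word < RQB_LEN; word++) {
		if (bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++)
			if (bits[word] & (1ULL << bit))
				return (bit + (word << RQB_L2BPW));
	}
	return (-1);		/* nothing queued */
}

int
main(void)
{
	unsigned long long bits[RQB_LEN] = { 0, 0 };

	bits[0] |= 1ULL << 35;	/* a thread queued at priority index 35 */
	bits[1] |= 1ULL << 3;	/* and another at index 67 */
	printf("best queue index: %d\n", first_nonempty(bits));
	return (0);
}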
+ */ + if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL) + return (ts); + if ((ts = runq_steal(&tdq->tdq_timeshare)) != NULL) + return (ts); + if (stealidle) + return (runq_steal(&tdq->tdq_idle)); + return (NULL); +} + +static struct tdq * +tdq_pickidle(struct td_sched *ts) +{ + struct tdq_group *tdg; + struct tdq *tdq; + int self; + int cpu; + + self = PCPU_GET(cpuid); + tdq = TDQ_CPU(self); + if (smp_started == 0) + goto self; + /* + * If we're bound to a particular cpu, schedule here. + */ + if (!THREAD_CAN_MIGRATE(ts->ts_thread)) { + CTR1(KTR_ULE, "bound to %d", ts->ts_cpu); + tdq = TDQ_CPU(ts->ts_cpu); + TDQ_LOCK(tdq); + return (tdq); + } + /* + * If the current CPU has idled, just run it here. + */ + if ((tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) { + CTR1(KTR_ULE, "self idle %X", tdq->tdq_group->tdg_idlemask); + goto self; + } + /* + * Try the last group we ran on. + */ + tdg = TDQ_CPU(ts->ts_cpu)->tdq_group; + CTR1(KTR_ULE, "tdg_idlemask %X", tdg->tdg_idlemask); + cpu = ffs(tdg->tdg_idlemask); + if (cpu) + goto pick; + /* + * Search for an idle group. + */ + CTR1(KTR_ULE, "tdq_idle %X", tdq_idle); + cpu = ffs(tdq_idle); + if (cpu) + goto pick; + /* + * XXX If there are no idle groups, check for an idle core. + */ + /* + * No idle CPUs? + */ + CTR1(KTR_ULE, "none idle %X", tdq_idle); +self: + TDQ_LOCK(tdq); + ts->ts_cpu = self; + return (tdq); + +pick: + cpu--; + tdq = TDQ_CPU(cpu); + TDQ_LOCK(tdq); + ts->ts_cpu = cpu; + + return (tdq); +} + +static struct tdq * +tdq_pickpri(struct td_sched *ts, int flags) +{ + struct pcpu *pcpu; + struct tdq *tdq; + int lowpri; + int lowcpu; + int lowload; + int load; + int self; + int pri; + int cpu; + + self = PCPU_GET(cpuid); + if (smp_started == 0) + goto self; + /* + * If we're bound to a particular cpu, schedule here. + */ + if (!THREAD_CAN_MIGRATE(ts->ts_thread)) { + tdq = TDQ_CPU(ts->ts_cpu); + TDQ_LOCK(tdq); + return (tdq); + } + pri = ts->ts_thread->td_priority; + /* + * Regardless of affinity, if the last cpu is idle send it there. + */ + tdq = TDQ_CPU(ts->ts_cpu); + TDQ_LOCK(tdq); + pcpu = pcpu_find(ts->ts_cpu); + if (pcpu->pc_curthread->td_priority > PRI_MIN_IDLE) { + CTR5(KTR_ULE, + "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d", + ts->ts_cpu, ts->ts_rltick, ticks, pri, + pcpu->pc_curthread->td_priority); + return (tdq); + } + /* + * If we have affinity, try to place it on the cpu we last ran on. + */ + if (SCHED_AFFINITY(ts) && pcpu->pc_curthread->td_priority > pri) { + CTR5(KTR_ULE, + "affinity for %d, ltick %d ticks %d pri %d curthread %d", + ts->ts_cpu, ts->ts_rltick, ticks, pri, + pcpu->pc_curthread->td_priority); + return (tdq); + } + TDQ_UNLOCK(tdq); + /* + * Try ourself first; If we're running something lower priority this + * may have some locality with the waking thread and execute faster + * here. + */ + if (tryself) { + /* + * If we're being awoken by an interrupt thread or the waker + * is going right to sleep run here as well. + */ + if ((TDQ_SELF()->tdq_load == 1) && (flags & SRQ_YIELDING || + curthread->td_pri_class == PRI_ITHD)) { + CTR2(KTR_ULE, "tryself load %d flags %d", + TDQ_SELF()->tdq_load, flags); + goto self; + } + } + /* + * Look for an idle group. 
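/*
 * The affinity test used by tdq_pickpri() above, rendered standalone: a
 * thread is still considered "warm" on the CPU it last ran on if its last
 * run tick falls within the affinity window (SCHED_AFFINITY_DEFAULT is
 * hz / 100, about 10ms of ticks at hz = 1000).  The tick values are sample
 * numbers, not the kernel's counters.
 */
#include <stdio.h>

int
main(void)
{
	int hz = 1000;
	int affinity = hz / 100;	/* SCHED_AFFINITY_DEFAULT */
	int ticks = 50000;		/* pretend current tick count */
	int ts_rltick = 49995;		/* last ran 5 ticks ago */

	printf("affinity holds: %s\n",
	    ts_rltick > ticks - affinity ? "yes" : "no");	/* SCHED_AFFINITY() */
	ts_rltick = 49950;		/* last ran 50 ticks ago */
	printf("affinity holds: %s\n",
	    ts_rltick > ticks - affinity ? "yes" : "no");
	return (0);
}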
+ */ + CTR1(KTR_ULE, "tdq_idle %X", tdq_idle); + cpu = ffs(tdq_idle); + if (cpu) { + cpu--; + tdq = TDQ_CPU(cpu); + TDQ_LOCK(tdq); + ts->ts_cpu = cpu; + return (tdq); + } + if (tryselfidle && pri < curthread->td_priority) { + CTR1(KTR_ULE, "tryself %d", + curthread->td_priority); + goto self; + } + /* + * Now search for the cpu running the lowest priority thread with + * the least load. + */ + lowload = 0; + lowpri = lowcpu = 0; + for (cpu = 0; cpu <= mp_maxid; cpu++) { + if (CPU_ABSENT(cpu)) + continue; + tdq = TDQ_CPU(cpu); + pcpu = pcpu_find(cpu); + pri = pcpu->pc_curthread->td_priority; + CTR4(KTR_ULE, + "cpu %d pri %d lowcpu %d lowpri %d", + cpu, pri, lowcpu, lowpri); + if (pri < lowpri) + continue; + load = TDQ_CPU(cpu)->tdq_load; + if (lowpri && lowpri == pri && load > lowload) + continue; + lowpri = pri; + lowcpu = cpu; + lowload = load; + } + tdq = TDQ_CPU(lowcpu); + TDQ_LOCK(tdq); + ts->ts_cpu = lowcpu; + return (tdq); +self: + tdq = TDQ_CPU(self); + TDQ_LOCK(tdq); + ts->ts_cpu = self; + return (tdq); + +} + +#endif /* SMP */ + +/* + * Pick the highest priority task we have and return it. + */ + +static struct td_sched * +tdq_choose(struct tdq *tdq) +{ + struct td_sched *ts; + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + ts = runq_choose(&tdq->tdq_realtime); + if (ts != NULL) { + KASSERT(ts->ts_thread->td_priority <= PRI_MAX_REALTIME, + ("tdq_choose: Invalid priority on realtime queue %d", + ts->ts_thread->td_priority)); + return (ts); + } + ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); + if (ts != NULL) { + KASSERT(ts->ts_thread->td_priority <= PRI_MAX_TIMESHARE && + ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE, + ("tdq_choose: Invalid priority on timeshare queue %d", + ts->ts_thread->td_priority)); + return (ts); + } + + ts = runq_choose(&tdq->tdq_idle); + if (ts != NULL) { + KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE, + ("tdq_choose: Invalid priority on idle queue %d", + ts->ts_thread->td_priority)); + return (ts); + } + + return (NULL); +} + +static void +tdq_setup(struct tdq *tdq) +{ + + snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), + "sched lock %d", (int)TDQ_ID(tdq)); + mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", + MTX_SPIN | MTX_RECURSE); + runq_init(&tdq->tdq_realtime); + runq_init(&tdq->tdq_timeshare); + runq_init(&tdq->tdq_idle); + tdq->tdq_load = 0; +} + +static void +sched_setup(void *dummy) +{ + struct tdq *tdq; +#ifdef SMP + int i; +#endif + + /* + * To avoid divide-by-zero, we set realstathz a dummy value + * in case which sched_clock() called before sched_initticks(). + */ + realstathz = hz; + sched_slice = (realstathz/10); /* ~100ms */ + tickincr = 1 << SCHED_TICK_SHIFT; + +#ifdef SMP + balance_groups = 0; + /* + * Initialize the tdqs. + */ + for (i = 0; i < MAXCPU; i++) { + tdq = &tdq_cpu[i]; + tdq_setup(&tdq_cpu[i]); + } + if (smp_topology == NULL) { + struct tdq_group *tdg; + int cpus; + + for (cpus = 0, i = 0; i < MAXCPU; i++) { + if (CPU_ABSENT(i)) + continue; + tdq = &tdq_cpu[i]; + tdg = &tdq_groups[cpus]; + /* + * Setup a tdq group with one member. 
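/*
 * A standalone rendering of the fallback search at the end of tdq_pickpri()
 * above: pick the CPU whose current thread has the weakest priority
 * (numerically largest), breaking ties by the lighter run-queue load.  The
 * flat arrays stand in for walking pcpu_find()/tdq_cpu[]; CPU_ABSENT
 * handling is omitted.
 */
#include <stdio.h>

static int
pick_cpu(const int *curpri, const int *load, int ncpu)
{
	int lowpri = 0, lowload = 0, lowcpu = 0, cpu;

	for (cpu = 0; cpu < ncpu; cpu++) {
		if (curpri[cpu] < lowpri)
			continue;	/* a more important thread runs here */
		if (lowpri && lowpri == curpri[cpu] && load[cpu] > lowload)
			continue;	/* same priority but busier: skip */
		lowpri = curpri[cpu];
		lowcpu = cpu;
		lowload = load[cpu];
	}
	return (lowcpu);
}

int
main(void)
{
	int curpri[4] = { 120, 200, 200, 180 };	/* priority of each CPU's curthread */
	int load[4]   = { 3, 2, 1, 0 };		/* each CPU's tdq_load */

	/* CPUs 1 and 2 run equally unimportant threads; CPU 2 is less loaded. */
	printf("picked cpu %d\n", pick_cpu(curpri, load, 4));
	return (0);
}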
+ */ + tdq->tdq_transferable = 0; + tdq->tdq_group = tdg; + tdg->tdg_cpus = 1; + tdg->tdg_idlemask = 0; + tdg->tdg_cpumask = tdg->tdg_mask = 1 << i; + tdg->tdg_load = 0; + tdg->tdg_transferable = 0; + LIST_INIT(&tdg->tdg_members); + LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings); + cpus++; + } + tdg_maxid = cpus - 1; + } else { + struct tdq_group *tdg; + struct cpu_group *cg; + int j; + + topology = 1; + for (i = 0; i < smp_topology->ct_count; i++) { + cg = &smp_topology->ct_group[i]; + tdg = &tdq_groups[i]; + /* + * Initialize the group. + */ + tdg->tdg_idlemask = 0; + tdg->tdg_load = 0; + tdg->tdg_transferable = 0; + tdg->tdg_cpus = cg->cg_count; + tdg->tdg_cpumask = cg->cg_mask; + LIST_INIT(&tdg->tdg_members); + /* + * Find all of the group members and add them. + */ + for (j = 0; j < MAXCPU; j++) { + if ((cg->cg_mask & (1 << j)) != 0) { + if (tdg->tdg_mask == 0) + tdg->tdg_mask = 1 << j; + tdq_cpu[j].tdq_transferable = 0; + tdq_cpu[j].tdq_group = tdg; + LIST_INSERT_HEAD(&tdg->tdg_members, + &tdq_cpu[j], tdq_siblings); + } + } + if (tdg->tdg_cpus > 1) + balance_groups = 1; + } + tdg_maxid = smp_topology->ct_count - 1; + } + /* + * Stagger the group and global load balancer so they do not + * interfere with each other. + */ + bal_tick = ticks + hz; + if (balance_groups) + gbal_tick = ticks + (hz / 2); +#else + tdq_setup(TDQ_SELF()); +#endif + tdq = TDQ_SELF(); + TDQ_LOCK(tdq); + tdq_load_add(tdq, &td_sched0); + TDQ_UNLOCK(tdq); +} + +/* ARGSUSED */ +static void +sched_initticks(void *dummy) +{ + int incr; + + realstathz = stathz ? stathz : hz; + sched_slice = (realstathz/10); /* ~100ms */ + + /* + * tickincr is shifted out by 10 to avoid rounding errors due to + * hz not being evenly divisible by stathz on all platforms. + */ + incr = (hz << SCHED_TICK_SHIFT) / realstathz; + /* + * This does not work for values of stathz that are more than + * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. + */ + if (incr == 0) + incr = 1; + tickincr = incr; +#ifdef SMP + affinity = SCHED_AFFINITY_DEFAULT; +#endif +} + + +/* + * Scale the scheduling priority according to the "interactivity" of this + * process. + */ +static void +sched_priority(struct thread *td) +{ + int score; + int pri; + + if (td->td_pri_class != PRI_TIMESHARE) + return; + /* + * If the score is interactive we place the thread in the realtime + * queue with a priority that is less than kernel and interrupt + * priorities. These threads are not subject to nice restrictions. + * + * Scores greater than this are placed on the normal realtime queue + * where the priority is partially decided by the most recent cpu + * utilization and the rest is decided by nice value. 
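The fixed-point setup in sched_initticks() above is easy to check stand-alone. The numbers below (hz = 1000, stathz = 128, a shift of 10) are assumptions for the example rather than values fixed by the patch:

	#include <stdio.h>

	#define	EX_TICK_SHIFT	10	/* assumed stand-in for SCHED_TICK_SHIFT */

	int
	main(void)
	{
		int hz = 1000, realstathz = 128, incr;

		incr = (hz << EX_TICK_SHIFT) / realstathz;	/* 8000 */
		if (incr == 0)					/* guard, as in the patch */
			incr = 1;
		printf("tickincr = %d (= %d.%03d hz ticks per stat tick)\n", incr,
		    incr >> EX_TICK_SHIFT,
		    ((incr & ((1 << EX_TICK_SHIFT) - 1)) * 1000) >> EX_TICK_SHIFT);
		return (0);
	}

So each stathz tick charges 8000/1024, roughly 7.8 hz ticks of run time, matching 1000/128.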
+ */ + score = sched_interact_score(td); + if (score < sched_interact) { + pri = PRI_MIN_REALTIME; + pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact) + * score; + KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME, + ("sched_priority: invalid interactive priority %d score %d", + pri, score)); + } else { + pri = SCHED_PRI_MIN; + if (td->td_sched->ts_ticks) + pri += SCHED_PRI_TICKS(td->td_sched); + pri += SCHED_PRI_NICE(td->td_proc->p_nice); + if (!(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE)) { + static int once = 1; + if (once) { + printf("sched_priority: invalid priority %d", + pri); + printf("nice %d, ticks %d ftick %d ltick %d tick pri %d\n", + td->td_proc->p_nice, + td->td_sched->ts_ticks, + td->td_sched->ts_ftick, + td->td_sched->ts_ltick, + SCHED_PRI_TICKS(td->td_sched)); + once = 0; + } + pri = min(max(pri, PRI_MIN_TIMESHARE), + PRI_MAX_TIMESHARE); + } + } + sched_user_prio(td, pri); + + return; +} + +/* + * This routine enforces a maximum limit on the amount of scheduling history + * kept. It is called after either the slptime or runtime is adjusted. + */ +static void +sched_interact_update(struct thread *td) +{ + struct td_sched *ts; + u_int sum; + + ts = td->td_sched; + sum = ts->skg_runtime + ts->skg_slptime; + if (sum < SCHED_SLP_RUN_MAX) + return; + /* + * This only happens from two places: + * 1) We have added an unusual amount of run time from fork_exit. + * 2) We have added an unusual amount of sleep time from sched_sleep(). + */ + if (sum > SCHED_SLP_RUN_MAX * 2) { + if (ts->skg_runtime > ts->skg_slptime) { + ts->skg_runtime = SCHED_SLP_RUN_MAX; + ts->skg_slptime = 1; + } else { + ts->skg_slptime = SCHED_SLP_RUN_MAX; + ts->skg_runtime = 1; + } + return; + } + /* + * If we have exceeded by more than 1/5th then the algorithm below + * will not bring us back into range. Dividing by two here forces + * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] + */ + if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { + ts->skg_runtime /= 2; + ts->skg_slptime /= 2; + return; + } + ts->skg_runtime = (ts->skg_runtime / 5) * 4; + ts->skg_slptime = (ts->skg_slptime / 5) * 4; +} + +static void +sched_interact_fork(struct thread *td) +{ + int ratio; + int sum; + + sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime; + if (sum > SCHED_SLP_RUN_FORK) { + ratio = sum / SCHED_SLP_RUN_FORK; + td->td_sched->skg_runtime /= ratio; + td->td_sched->skg_slptime /= ratio; + } +} + +static int +sched_interact_score(struct thread *td) +{ + int div; + + if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) { + div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF); + return (SCHED_INTERACT_HALF + + (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div))); + } + if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) { + div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF); + return (td->td_sched->skg_runtime / div); + } + /* runtime == slptime */ + if (td->td_sched->skg_runtime) + return (SCHED_INTERACT_HALF); + + /* + * This can happen if slptime and runtime are 0. + */ + return (0); + +} + +/* + * Called from proc0_init() to bootstrap the scheduler. + */ +void +schedinit(void) +{ + + /* + * Set up the scheduler specific parts of proc0. 
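sched_interact_score() above is small enough to lift into user space for experimentation. SCHED_INTERACT_HALF is assumed to be 50 here (half of a 0..100 scale); the real constant comes from the scheduler headers:

	#define	EX_INTERACT_HALF	50	/* assumed stand-in */

	static int
	interact_score(unsigned int runtime, unsigned int slptime)
	{
		unsigned int div;

		if (runtime > slptime) {
			div = runtime / EX_INTERACT_HALF;
			if (div < 1)
				div = 1;
			return (EX_INTERACT_HALF +
			    (EX_INTERACT_HALF - slptime / div));
		}
		if (slptime > runtime) {
			div = slptime / EX_INTERACT_HALF;
			if (div < 1)
				div = 1;
			return (runtime / div);
		}
		return (runtime ? EX_INTERACT_HALF : 0);
	}

With these numbers a thread that slept 10000 units and ran 100 scores 0 (fully interactive), while the inverse scores 100 (pure CPU hog).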
+ */ + proc0.p_sched = NULL; /* XXX */ + thread0.td_sched = &td_sched0; + thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); + td_sched0.ts_ltick = ticks; + td_sched0.ts_ftick = ticks; + td_sched0.ts_thread = &thread0; +} + +/* + * This is only somewhat accurate since given many processes of the same + * priority they will switch when their slices run out, which will be + * at most sched_slice stathz ticks. + */ +int +sched_rr_interval(void) +{ + + /* Convert sched_slice to hz */ + return (hz/(realstathz/sched_slice)); +} + +static void +sched_pctcpu_update(struct td_sched *ts) +{ + + if (ts->ts_ticks == 0) + return; + if (ticks - (hz / 10) < ts->ts_ltick && + SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX) + return; + /* + * Adjust counters and watermark for pctcpu calc. + */ + if (ts->ts_ltick > ticks - SCHED_TICK_TARG) + ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) * + SCHED_TICK_TARG; + else + ts->ts_ticks = 0; + ts->ts_ltick = ticks; + ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG; +} + +static void +sched_thread_priority(struct thread *td, u_char prio) +{ + struct td_sched *ts; + + CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)", + td, td->td_proc->p_comm, td->td_priority, prio, curthread, + curthread->td_proc->p_comm); + ts = td->td_sched; + THREAD_LOCK_ASSERT(td, MA_OWNED); + if (td->td_priority == prio) + return; + + if (TD_ON_RUNQ(td) && prio < td->td_priority) { + /* + * If the priority has been elevated due to priority + * propagation, we may have to move ourselves to a new + * queue. This could be optimized to not re-add in some + * cases. + */ + sched_rem(td); + td->td_priority = prio; + sched_add(td, SRQ_BORROWING|SRQ_OURSELF); + } else + td->td_priority = prio; +} + +/* + * Update a thread's priority when it is lent another thread's + * priority. + */ +void +sched_lend_prio(struct thread *td, u_char prio) +{ + + td->td_flags |= TDF_BORROWING; + sched_thread_priority(td, prio); +} + +/* + * Restore a thread's priority when priority propagation is + * over. The prio argument is the minimum priority the thread + * needs to have to satisfy other possible priority lending + * requests. If the thread's regular priority is less + * important than prio, the thread will keep a priority boost + * of prio. + */ +void +sched_unlend_prio(struct thread *td, u_char prio) +{ + u_char base_pri; + + if (td->td_base_pri >= PRI_MIN_TIMESHARE && + td->td_base_pri <= PRI_MAX_TIMESHARE) + base_pri = td->td_user_pri; + else + base_pri = td->td_base_pri; + if (prio >= base_pri) { + td->td_flags &= ~TDF_BORROWING; + sched_thread_priority(td, base_pri); + } else + sched_lend_prio(td, prio); +} + +void +sched_prio(struct thread *td, u_char prio) +{ + u_char oldprio; + + /* First, update the base priority. */ + td->td_base_pri = prio; + + /* + * If the thread is borrowing another thread's priority, don't + * ever lower the priority. + */ + if (td->td_flags & TDF_BORROWING && td->td_priority < prio) + return; + + /* Change the real priority. */ + oldprio = td->td_priority; + sched_thread_priority(td, prio); + + /* + * If the thread is on a turnstile, then let the turnstile update + * its state. 
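The round-robin interval above is plain integer arithmetic; with the assumed values hz = 1000 and stathz = 128 (so sched_slice = 128 / 10 = 12) it evaluates to 1000 / (128 / 12) = 100 ticks, i.e. roughly the ~100ms slice noted in sched_setup(). As a one-liner for reference, values illustrative:

	static int
	example_rr_interval(int hz, int realstathz, int sched_slice)
	{
		/* e.g. 1000 / (128 / 12) == 100 */
		return (hz / (realstathz / sched_slice));
	}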
+ */ + if (TD_ON_LOCK(td) && oldprio != prio) + turnstile_adjust(td, oldprio); +} + +void +sched_user_prio(struct thread *td, u_char prio) +{ + u_char oldprio; + + td->td_base_user_pri = prio; + if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio) + return; + oldprio = td->td_user_pri; + td->td_user_pri = prio; + + if (TD_ON_UPILOCK(td) && oldprio != prio) + umtx_pi_adjust(td, oldprio); +} + +void +sched_lend_user_prio(struct thread *td, u_char prio) +{ + u_char oldprio; + + td->td_flags |= TDF_UBORROWING; + + oldprio = td->td_user_pri; + td->td_user_pri = prio; + + if (TD_ON_UPILOCK(td) && oldprio != prio) + umtx_pi_adjust(td, oldprio); +} + +void +sched_unlend_user_prio(struct thread *td, u_char prio) +{ + u_char base_pri; + + base_pri = td->td_base_user_pri; + if (prio >= base_pri) { + td->td_flags &= ~TDF_UBORROWING; + sched_user_prio(td, base_pri); + } else + sched_lend_user_prio(td, prio); +} + +static inline struct mtx *thread_block_switch(struct thread *td); + +static inline struct mtx * +thread_block_switch(struct thread *td) +{ + struct mtx *lock; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + lock = __DEVOLATILE(struct mtx *, td->td_lock); + td->td_lock = &blocked_lock; + mtx_unlock_spin(lock); + + return (lock); +} + +void +sched_switch(struct thread *td, struct thread *newtd, int flags) +{ + struct tdq *tdq; + struct td_sched *ts; + struct mtx *mtx; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + + tdq = TDQ_SELF(); + mtx = TDQ_LOCKPTR(tdq); + ts = td->td_sched; +#ifdef SMP + ts->ts_rltick = ticks; +#endif + td->td_lastcpu = td->td_oncpu; + td->td_oncpu = NOCPU; + td->td_flags &= ~TDF_NEEDRESCHED; + td->td_owepreempt = 0; + if (TD_IS_IDLETHREAD(td)) { + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + TD_SET_CAN_RUN(td); + goto choose; + } + if (TD_IS_RUNNING(td)) { + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + tdq_load_rem(tdq, ts); + sched_add(td, (flags & SW_PREEMPT) ? + SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : + SRQ_OURSELF|SRQ_YIELDING); + if (td->td_lock != TDQ_LOCKPTR(tdq)) { + spinlock_enter(); + mtx = thread_block_switch(td); + TDQ_LOCK(tdq); + spinlock_exit(); + } + } else { + if (td->td_lock != TDQ_LOCKPTR(tdq)) { + TDQ_LOCK(tdq); + mtx = thread_block_switch(td); + } + /* Remove the load for inhibited tds */ + tdq_load_rem(tdq, ts); + } +choose: + TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); + if (newtd != NULL) { + /* + * If we bring in a thread account for it as if it had been + * added to the run queue and then chosen. 
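thread_block_switch() above is the heart of the per-thread lock scheme: a thread in transit points td_lock at a sentinel, and whoever picks the thread up on another CPU spins until the pointer moves off that sentinel before touching it. The sketch below restates the idea with placeholders (lock_t, spin_unlock(), cpu_spin_wait() are stand-ins, not kernel interfaces):

	typedef struct lock lock_t;

	extern lock_t blocked_lock;		/* sentinel; never actually acquired */
	extern void spin_unlock(lock_t *);
	extern void cpu_spin_wait(void);

	struct xthread {
		lock_t * volatile td_lock;	/* whichever lock covers this thread */
	};

	/* Blocking side: park the thread on the sentinel, release the old lock. */
	static lock_t *
	example_block_switch(struct xthread *td)
	{
		lock_t *old;

		old = td->td_lock;
		td->td_lock = &blocked_lock;
		spin_unlock(old);
		return (old);
	}

	/* Pickup side: wait until the blocking CPU is finished with the thread. */
	static void
	example_wait_unblocked(struct xthread *td)
	{

		while (td->td_lock == &blocked_lock)
			cpu_spin_wait();
	}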
+ */ + MPASS(newtd->td_lock == TDQ_LOCKPTR(tdq)); + TD_SET_RUNNING(newtd); + tdq_load_add(TDQ_SELF(), newtd->td_sched); + } else + newtd = choosethread(); + if (td != newtd) { +#ifdef HWPMC_HOOKS + if (PMC_PROC_IS_USING_PMCS(td->td_proc)) + PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); +#endif + cpu_switch(td, newtd, mtx); + tdq = TDQ_SELF(); /* We may return on a different cpu */ + TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)td; +#ifdef HWPMC_HOOKS + if (PMC_PROC_IS_USING_PMCS(td->td_proc)) + PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); +#endif + } + TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + td->td_oncpu = PCPU_GET(cpuid); +} + +void +sched_nice(struct proc *p, int nice) +{ + struct thread *td; + + PROC_LOCK_ASSERT(p, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); + + p->p_nice = nice; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + sched_priority(td); + sched_prio(td, td->td_base_user_pri); + thread_unlock(td); + } +} + +void +sched_sleep(struct thread *td) +{ + + THREAD_LOCK_ASSERT(td, MA_OWNED); + + td->td_sched->ts_slptime = ticks; +} + +void +sched_wakeup(struct thread *td) +{ + struct td_sched *ts; + int slptime; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + /* + * If we slept for more than a tick update our interactivity and + * priority. + */ + slptime = ts->ts_slptime; + ts->ts_slptime = 0; + if (slptime && slptime != ticks) { + u_int hzticks; + + hzticks = (ticks - slptime) << SCHED_TICK_SHIFT; + ts->skg_slptime += hzticks; + sched_interact_update(td); + sched_pctcpu_update(ts); + sched_priority(td); + } + /* Reset the slice value after we sleep. */ + ts->ts_slice = sched_slice; + sched_add(td, SRQ_BORING); +} + +/* + * Penalize the parent for creating a new child and initialize the child's + * priority. + */ +void +sched_fork(struct thread *td, struct thread *child) +{ + THREAD_LOCK_ASSERT(td, MA_OWNED); + sched_fork_thread(td, child); + /* + * Penalize the parent and child for forking. + */ + sched_interact_fork(child); + sched_priority(child); + td->td_sched->skg_runtime += tickincr; + sched_interact_update(td); + sched_priority(td); +} + +void +sched_fork_thread(struct thread *td, struct thread *child) +{ + struct td_sched *ts; + struct td_sched *ts2; + + /* + * Initialize child. + */ + THREAD_LOCK_ASSERT(td, MA_OWNED); + sched_newthread(child); + child->td_lock = TDQ_LOCKPTR(TDQ_SELF()); + ts = td->td_sched; + ts2 = child->td_sched; + ts2->ts_cpu = ts->ts_cpu; + ts2->ts_runq = NULL; + /* + * Grab our parents cpu estimation information and priority. + */ + ts2->ts_ticks = ts->ts_ticks; + ts2->ts_ltick = ts->ts_ltick; + ts2->ts_ftick = ts->ts_ftick; + child->td_user_pri = td->td_user_pri; + child->td_base_user_pri = td->td_base_user_pri; + /* + * And update interactivity score. + */ + ts2->skg_slptime = ts->skg_slptime; + ts2->skg_runtime = ts->skg_runtime; + ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */ +} + +void +sched_class(struct thread *td, int class) +{ + + THREAD_LOCK_ASSERT(td, MA_OWNED); + if (td->td_pri_class == class) + return; + +#ifdef SMP + /* + * On SMP if we're on the RUNQ we must adjust the transferable + * count because could be changing to or from an interrupt + * class. 
+ */ + if (TD_ON_RUNQ(td)) { + struct tdq *tdq; + + tdq = TDQ_CPU(td->td_sched->ts_cpu); + if (THREAD_CAN_MIGRATE(td)) { + tdq->tdq_transferable--; + tdq->tdq_group->tdg_transferable--; + } + td->td_pri_class = class; + if (THREAD_CAN_MIGRATE(td)) { + tdq->tdq_transferable++; + tdq->tdq_group->tdg_transferable++; + } + } +#endif + td->td_pri_class = class; +} + +/* + * Return some of the child's priority and interactivity to the parent. + */ +void +sched_exit(struct proc *p, struct thread *child) +{ + struct thread *td; + + CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", + child, child->td_proc->p_comm, child->td_priority); + + PROC_SLOCK_ASSERT(p, MA_OWNED); + td = FIRST_THREAD_IN_PROC(p); + sched_exit_thread(td, child); +} + +void +sched_exit_thread(struct thread *td, struct thread *child) +{ + struct tdq *tdq; + + CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", + child, child->td_proc->p_comm, child->td_priority); + + tdq = TDQ_CPU(child->td_sched->ts_cpu); + TDQ_LOCK(tdq); + tdq_load_rem(tdq, child->td_sched); + TDQ_UNLOCK(tdq); +#ifdef KSE + /* + * KSE forks and exits so often that this penalty causes short-lived + * threads to always be non-interactive. This causes mozilla to + * crawl under load. + */ + if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc) + return; +#endif + /* + * Give the child's runtime to the parent without returning the + * sleep time as a penalty to the parent. This causes shells that + * launch expensive things to mark their children as expensive. + */ + thread_lock(td); + td->td_sched->skg_runtime += child->td_sched->skg_runtime; + sched_interact_update(td); + sched_priority(td); + thread_unlock(td); +} + +void +sched_userret(struct thread *td) +{ + /* + * XXX we cheat slightly on the locking here to avoid locking in + * the usual case. Setting td_priority here is essentially an + * incomplete workaround for not setting it properly elsewhere. + * Now that some interrupt handlers are threads, not setting it + * properly elsewhere can clobber it in the window between setting + * it here and returning to user mode, so don't waste time setting + * it perfectly here. + */ + KASSERT((td->td_flags & TDF_BORROWING) == 0, + ("thread with borrowed priority returning to userland")); + if (td->td_priority != td->td_user_pri) { + thread_lock(td); + td->td_priority = td->td_user_pri; + td->td_base_pri = td->td_user_pri; + thread_unlock(td); + } +} + +void +sched_clock(struct thread *td) +{ + struct tdq *tdq; + struct td_sched *ts; + + THREAD_LOCK_ASSERT(td, MA_OWNED); +#ifdef SMP + sched_smp_tick(td); +#endif + tdq = TDQ_SELF(); + /* + * Advance the insert index once for each tick to ensure that all + * threads get a chance to run. + */ + if (tdq->tdq_idx == tdq->tdq_ridx) { + tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; + if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) + tdq->tdq_ridx = tdq->tdq_idx; + } + ts = td->td_sched; + /* + * We only do slicing code for TIMESHARE threads. + */ + if (td->td_pri_class != PRI_TIMESHARE) + return; + /* + * We used a tick; charge it to the thread so that we can compute our + * interactivity. + */ + td->td_sched->skg_runtime += tickincr; + sched_interact_update(td); + /* + * We used up one time slice. + */ + if (--ts->ts_slice > 0) + return; + /* + * We're out of time, recompute priorities and requeue. 
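The index juggling in sched_clock() above implements a small calendar queue: the insert bucket rotates once per tick, and the remove bucket may only catch up once its own bucket has drained, which is what guarantees every queued timeshare thread eventually reaches the head. A toy model (EX_NQS and the count array stand in for RQ_NQS and the real queues):

	#define	EX_NQS	64	/* stand-in for RQ_NQS */

	struct calendar {
		int	idx;			/* insert bucket */
		int	ridx;			/* remove bucket */
		int	count[EX_NQS];		/* threads queued per bucket */
	};

	static void
	calendar_tick(struct calendar *c)
	{

		if (c->idx == c->ridx) {
			c->idx = (c->idx + 1) % EX_NQS;
			if (c->count[c->ridx] == 0)
				c->ridx = c->idx;
		}
	}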
+ */ + sched_priority(td); + td->td_flags |= TDF_NEEDRESCHED; +} + +int +sched_runnable(void) +{ + struct tdq *tdq; + int load; + + load = 1; + + tdq = TDQ_SELF(); +#ifdef SMP + if (tdq_busy) + goto out; +#endif + if ((curthread->td_flags & TDF_IDLETD) != 0) { + if (tdq->tdq_load > 0) + goto out; + } else + if (tdq->tdq_load - 1 > 0) + goto out; + load = 0; +out: + return (load); +} + +struct thread * +sched_choose(void) +{ + struct tdq *tdq; + struct td_sched *ts; + + tdq = TDQ_SELF(); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); +#ifdef SMP +restart: +#endif + ts = tdq_choose(tdq); + if (ts) { +#ifdef SMP + if (ts->ts_thread->td_priority > PRI_MIN_IDLE) + if (tdq_idled(tdq) == 0) + goto restart; +#endif + tdq_runq_rem(tdq, ts); + return (ts->ts_thread); + } +#ifdef SMP + if (tdq_idled(tdq) == 0) + goto restart; +#endif + return (PCPU_GET(idlethread)); +} + +static int +sched_preempt(struct thread *td) +{ + struct thread *ctd; + int cpri; + int pri; + + ctd = curthread; + pri = td->td_priority; + cpri = ctd->td_priority; + if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) + return (0); + /* + * Always preempt IDLE threads. Otherwise only if the preempting + * thread is an ithread. + */ + if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE) + return (0); + if (ctd->td_critnest > 1) { + CTR1(KTR_PROC, "sched_preempt: in critical section %d", + ctd->td_critnest); + ctd->td_owepreempt = 1; + return (0); + } + /* + * Thread is runnable but not yet put on system run queue. + */ + MPASS(TD_ON_RUNQ(td)); + TD_SET_RUNNING(td); + MPASS(ctd->td_lock == td->td_lock); + MPASS(td->td_lock == TDQ_LOCKPTR(TDQ_SELF())); + CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, + td->td_proc->p_pid, td->td_proc->p_comm); + /* + * We enter the switch with two runnable threads that both have + * the same lock. When we return td may be sleeping so we need + * to switch locks to make sure it's locked correctly. + */ + SCHED_STAT_INC(switch_preempt); + mi_switch(SW_INVOL|SW_PREEMPT, td); + spinlock_enter(); + thread_unlock(ctd); + thread_lock(td); + spinlock_exit(); + + return (1); +} + +void +sched_add(struct thread *td, int flags) +{ + struct td_sched *ts; + struct tdq *tdq; + int preemptive; + int class; +#ifdef SMP + int cpuid; + int cpumask; +#endif + ts = td->td_sched; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", + td, td->td_proc->p_comm, td->td_priority, curthread, + curthread->td_proc->p_comm); + KASSERT((td->td_inhibitors == 0), + ("sched_add: trying to run inhibited thread")); + KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), + ("sched_add: bad thread state")); + KASSERT(td->td_proc->p_sflag & PS_INMEM, + ("sched_add: process swapped out")); + + TD_SET_RUNQ(td); + class = PRI_BASE(td->td_pri_class); + preemptive = !(flags & SRQ_YIELDING); + /* + * Recalculate the priority before we select the target cpu or + * run-queue. + */ + if (class == PRI_TIMESHARE) + sched_priority(td); + if (ts->ts_slice == 0) + ts->ts_slice = sched_slice; +#ifdef SMP + cpuid = PCPU_GET(cpuid); + /* + * Pick the destination cpu and if it isn't ours transfer to the + * target cpu. 
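Leaving aside the panic/cold/inhibited guards, the test in sched_preempt() above boils down to: never preempt unless the incoming thread is strictly more important, and even then only when it is interrupt-class or the CPU is currently running idle-class work. Restated with the priority bounds passed in as parameters (illustration only; lower numbers are more important):

	static int
	example_should_preempt(int newpri, int curpri, int pri_max_ithd,
	    int pri_min_idle)
	{

		if (newpri >= curpri)
			return (0);	/* not strictly more important */
		if (newpri > pri_max_ithd && curpri < pri_min_idle)
			return (0);	/* neither an ithread nor an idle victim */
		return (1);
	}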
+ */ + thread_lock_block(td); + if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_MIGRATE(td)) { + ts->ts_cpu = cpuid; + tdq = TDQ_CPU(cpuid); + TDQ_LOCK(tdq); + } else if (pick_pri) + tdq = tdq_pickpri(ts, flags); + else + tdq = tdq_pickidle(ts); + thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); + if (ts->ts_cpu != cpuid) + preemptive = 0; + cpumask = 1 << ts->ts_cpu; + /* + * If we had been idle, clear our bit in the group and potentially + * the global bitmap. + */ + if ((class != PRI_IDLE && class != PRI_ITHD) && + (tdq->tdq_group->tdg_idlemask & cpumask) != 0) { + /* + * Check to see if our group is unidling, and if so, remove it + * from the global idle mask. + */ + if (tdq->tdq_group->tdg_idlemask == + tdq->tdq_group->tdg_cpumask) + atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask); + /* + * Now remove ourselves from the group specific idle mask. + */ + tdq->tdq_group->tdg_idlemask &= ~cpumask; + } +#else + tdq = TDQ_SELF(); + TDQ_LOCK(tdq); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. + */ + thread_lock_set(td, TDQ_LOCKPTR(tdq)); +#endif + /* + * Pick the run queue based on priority. + */ + if (td->td_priority <= PRI_MAX_REALTIME) + ts->ts_runq = &tdq->tdq_realtime; + else if (td->td_priority <= PRI_MAX_TIMESHARE) + ts->ts_runq = &tdq->tdq_timeshare; + else + ts->ts_runq = &tdq->tdq_idle; + if (preemptive && sched_preempt(td)) + return; + tdq_runq_add(tdq, ts, flags); + tdq_load_add(tdq, ts); +#ifdef SMP + if (ts->ts_cpu != cpuid) { + tdq_notify(ts); + return; + } +#endif + if (td->td_priority < curthread->td_priority) + curthread->td_flags |= TDF_NEEDRESCHED; +} + +void +sched_rem(struct thread *td) +{ + struct tdq *tdq; + struct td_sched *ts; + + CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", + td, td->td_proc->p_comm, td->td_priority, curthread, + curthread->td_proc->p_comm); + ts = td->td_sched; + tdq = TDQ_CPU(ts->ts_cpu); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + KASSERT(TD_ON_RUNQ(td), + ("sched_rem: thread not on run queue")); + tdq_runq_rem(tdq, ts); + tdq_load_rem(tdq, ts); + TD_SET_CAN_RUN(td); +} + +fixpt_t +sched_pctcpu(struct thread *td) +{ + fixpt_t pctcpu; + struct td_sched *ts; + + pctcpu = 0; + ts = td->td_sched; + if (ts == NULL) + return (0); + + thread_lock(td); + if (ts->ts_ticks) { + int rtick; + + sched_pctcpu_update(ts); + /* How many rtick per second ? */ + rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); + pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; + } + td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick; + thread_unlock(td); + + return (pctcpu); +} + +void +sched_bind(struct thread *td, int cpu) +{ + struct td_sched *ts; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + if (ts->ts_flags & TSF_BOUND) + sched_unbind(td); + ts->ts_flags |= TSF_BOUND; +#ifdef SMP + sched_pin(); + if (PCPU_GET(cpuid) == cpu) + return; + ts->ts_cpu = cpu; + /* When we return from mi_switch we'll be on the correct cpu. 
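The %CPU figure computed above is fixed point scaled by FSCALE. Assuming the stock FSHIFT of 11 (FSCALE = 2048) and hz = 1000, a thread with 500 run ticks of history per second lands at exactly half scale:

	#include <stdio.h>

	#define	EX_FSHIFT	11		/* assumed stand-in for FSHIFT */
	#define	EX_FSCALE	(1 << EX_FSHIFT)

	int
	main(void)
	{
		int hz = 1000, rtick = 500;
		unsigned int pctcpu;

		pctcpu = (EX_FSCALE * ((EX_FSCALE * rtick) / hz)) >> EX_FSHIFT;
		printf("pctcpu = %u/%d (%u%%)\n", pctcpu, EX_FSCALE,
		    pctcpu * 100 / EX_FSCALE);
		return (0);
	}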
*/ + mi_switch(SW_VOL, NULL); +#endif +} + +void +sched_unbind(struct thread *td) +{ + struct td_sched *ts; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + if ((ts->ts_flags & TSF_BOUND) == 0) + return; + ts->ts_flags &= ~TSF_BOUND; +#ifdef SMP + sched_unpin(); +#endif +} + +int +sched_is_bound(struct thread *td) +{ + THREAD_LOCK_ASSERT(td, MA_OWNED); + return (td->td_sched->ts_flags & TSF_BOUND); +} + +void +sched_relinquish(struct thread *td) +{ + thread_lock(td); + if (td->td_pri_class == PRI_TIMESHARE) + sched_prio(td, PRI_MAX_TIMESHARE); + SCHED_STAT_INC(switch_relinquish); + mi_switch(SW_VOL, NULL); + thread_unlock(td); +} + +int +sched_load(void) +{ +#ifdef SMP + int total; + int i; + + total = 0; + for (i = 0; i <= tdg_maxid; i++) + total += TDQ_GROUP(i)->tdg_load; + return (total); +#else + return (TDQ_SELF()->tdq_sysload); +#endif +} + +int +sched_sizeof_proc(void) +{ + return (sizeof(struct proc)); +} + +int +sched_sizeof_thread(void) +{ + return (sizeof(struct thread) + sizeof(struct td_sched)); +} + +void +sched_tick(void) +{ + struct td_sched *ts; + + ts = curthread->td_sched; + /* Adjust ticks for pctcpu */ + ts->ts_ticks += 1 << SCHED_TICK_SHIFT; + ts->ts_ltick = ticks; + /* + * Update if we've exceeded our desired tick threshhold by over one + * second. + */ + if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick) + sched_pctcpu_update(ts); +} + +/* + * The actual idle process. + */ +void +sched_idletd(void *dummy) +{ + struct proc *p; + struct thread *td; + + td = curthread; + p = td->td_proc; + mtx_assert(&Giant, MA_NOTOWNED); + /* ULE Relies on preemption for idle interruption. */ + for (;;) + cpu_idle(); +} + +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + struct tdq *tdq; + + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + tdq = TDQ_SELF(); + if (td == NULL) { + TDQ_LOCK(tdq); + spinlock_exit(); + } else { + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + } + TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct td_sched *ts; + struct thread *td; + struct tdq *tdq; + int cpuid; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + cpuid = PCPU_GET(cpuid); + tdq = TDQ_CPU(cpuid); + if (TD_IS_IDLETHREAD(ctd)) + ctd->td_lock = TDQ_LOCKPTR(tdq); + ts = ctd->td_sched; + MPASS(ctd->td_lock == TDQ_LOCKPTR(tdq)); + ctd->td_oncpu = cpuid; + TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. 
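Since sched_bind() and sched_unbind() now assert the caller's thread lock rather than sched_lock, callers are expected to bracket them with thread_lock()/thread_unlock() on curthread. A hedged usage sketch, assuming the usual kernel headers are in scope; example_measure_on_cpu() is a hypothetical caller, not something added by this patch:

	static void
	example_measure_on_cpu(int cpu)
	{

		thread_lock(curthread);
		sched_bind(curthread, cpu);	/* may context switch onto 'cpu' */
		thread_unlock(curthread);

		/* ... the per-CPU work runs here, pinned ... */

		thread_lock(curthread);
		sched_unbind(curthread);
		thread_unlock(curthread);
	}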
+ */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + +static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); +SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, + "Scheduler name"); +SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, ""); +#ifdef SMP +SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_affinity, CTLFLAG_RW, + &affinity, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryself, CTLFLAG_RW, + &tryself, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryselfidle, CTLFLAG_RW, + &tryselfidle, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, ipi_preempt, CTLFLAG_RW, &ipi_preempt, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, ipi_ast, CTLFLAG_RW, &ipi_ast, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0, ""); +#endif + +/* ps compat */ +static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ +SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); + + +#define KERN_SWITCH_INCLUDE 1 +#include "kern/kern_switch.c" Index: kern/sched_ule.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sched_ule.c,v retrieving revision 1.192 diff -u -p -r1.192 sched_ule.c --- kern/sched_ule.c 20 Apr 2007 05:45:46 -0000 1.192 +++ kern/sched_ule.c 31 May 2007 22:18:07 -0000 @@ -229,6 +229,7 @@ static int ipi_thresh = PRI_MIN_KERN; static int steal_htt = 1; static int steal_busy = 1; static int busy_thresh = 4; +static int topology = 0; /* * One thread queue per processor. 
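The kern.sched.* knobs exported just before the sched_ule.c hunks can be inspected from user space in the usual way; for example, the new read-only topology flag (error handling kept minimal):

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int topo;
		size_t len = sizeof(topo);

		if (sysctlbyname("kern.sched.topology", &topo, &len, NULL, 0) == -1) {
			perror("sysctlbyname");
			return (1);
		}
		printf("kern.sched.topology = %d\n", topo);
		return (0);
	}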
@@ -434,7 +435,7 @@ tdq_load_add(struct tdq *tdq, struct td_ mtx_assert(&sched_lock, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); tdq->tdq_load++; - CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); + CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load); if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) #ifdef SMP @@ -997,7 +998,7 @@ sched_setup(void *dummy) tdq = &tdq_cpu[i]; tdq_setup(&tdq_cpu[i]); } - if (1) { + if (smp_topology == NULL) { struct tdq_group *tdg; struct tdq *tdq; int cpus; @@ -1027,6 +1028,7 @@ sched_setup(void *dummy) struct cpu_group *cg; int j; + topology = 1; for (i = 0; i < smp_topology->ct_count; i++) { cg = &smp_topology->ct_group[i]; tdg = &tdq_groups[i]; @@ -1248,6 +1250,7 @@ schedinit(void) */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; + thread0.td_lock = &sched_lock; td_sched0.ts_ltick = ticks; td_sched0.ts_ftick = ticks; td_sched0.ts_thread = &thread0; @@ -1296,7 +1299,7 @@ sched_thread_priority(struct thread *td, td, td->td_proc->p_comm, td->td_priority, prio, curthread, curthread->td_proc->p_comm); ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; @@ -1307,9 +1310,10 @@ sched_thread_priority(struct thread *td, * queue. This could be optimized to not re-add in some * cases. */ + MPASS(td->td_lock == &sched_lock); sched_rem(td); td->td_priority = prio; - sched_add(td, SRQ_BORROWING); + sched_add(td, SRQ_BORROWING|SRQ_OURSELF); } else td->td_priority = prio; } @@ -1427,7 +1431,7 @@ sched_switch(struct thread *td, struct t struct td_sched *ts; int preempt; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); preempt = flags & SW_PREEMPT; tdq = TDQ_SELF(); @@ -1440,24 +1444,33 @@ sched_switch(struct thread *td, struct t * If the thread has been assigned it may be in the process of switching * to the new cpu. This is the case in sched_bind(). */ + /* + * Switch to the sched lock to fix things up and pick + * a new thread. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_unlock(td); + } if (TD_IS_IDLETHREAD(td)) { + MPASS(td->td_lock == &sched_lock); TD_SET_CAN_RUN(td); - } else { + } else if (TD_IS_RUNNING(td)) { + /* + * Don't allow the thread to migrate + * from a preemption. + */ tdq_load_rem(tdq, ts); - if (TD_IS_RUNNING(td)) { - /* - * Don't allow the thread to migrate - * from a preemption. - */ - if (preempt) - sched_pin_td(td); - sched_add(td, preempt ? - SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : - SRQ_OURSELF|SRQ_YIELDING); - if (preempt) - sched_unpin_td(td); - } - } + if (preempt) + sched_pin_td(td); + sched_add(td, preempt ? 
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : + SRQ_OURSELF|SRQ_YIELDING); + if (preempt) + sched_unpin_td(td); + } else + tdq_load_rem(tdq, ts); + mtx_assert(&sched_lock, MA_OWNED); if (newtd != NULL) { /* * If we bring in a thread account for it as if it had been @@ -1473,7 +1486,7 @@ sched_switch(struct thread *td, struct t PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif - cpu_switch(td, newtd); + cpu_switch(td, newtd, td->td_lock); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); @@ -1481,6 +1494,7 @@ sched_switch(struct thread *td, struct t } sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); + MPASS(td->td_lock == &sched_lock); } void @@ -1489,12 +1503,14 @@ sched_nice(struct proc *p, int nice) struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); + thread_unlock(td); } } @@ -1502,7 +1518,7 @@ void sched_sleep(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_sched->ts_slptime = ticks; } @@ -1513,7 +1529,7 @@ sched_wakeup(struct thread *td) struct td_sched *ts; int slptime; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; /* * If we slept for more than a tick update our interactivity and @@ -1542,7 +1558,7 @@ sched_wakeup(struct thread *td) void sched_fork(struct thread *td, struct thread *child) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. @@ -1563,7 +1579,9 @@ sched_fork_thread(struct thread *td, str /* * Initialize child. */ + THREAD_LOCK_ASSERT(td, MA_OWNED); sched_newthread(child); + child->td_lock = &sched_lock; ts = td->td_sched; ts2 = child->td_sched; ts2->ts_cpu = ts->ts_cpu; @@ -1588,7 +1606,7 @@ void sched_class(struct thread *td, int class) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; @@ -1627,6 +1645,7 @@ sched_exit(struct proc *p, struct thread CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", child, child->td_proc->p_comm, child->td_priority); + PROC_SLOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } @@ -1638,7 +1657,9 @@ sched_exit_thread(struct thread *td, str CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", child, child->td_proc->p_comm, child->td_priority); + thread_lock(child); tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched); + thread_unlock(child); #ifdef KSE /* * KSE forks and exits so often that this penalty causes short-lived @@ -1653,9 +1674,11 @@ sched_exit_thread(struct thread *td, str * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. 
*/ + thread_lock(td); td->td_sched->skg_runtime += child->td_sched->skg_runtime; sched_interact_update(td); sched_priority(td); + thread_unlock(td); } void @@ -1673,10 +1696,10 @@ sched_userret(struct thread *td) KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } @@ -1805,9 +1828,22 @@ sched_preempt(struct thread *td) */ MPASS(TD_ON_RUNQ(td)); TD_SET_RUNNING(td); + MPASS(ctd->td_lock == &sched_lock); + MPASS(td->td_lock == &sched_lock); CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, td->td_proc->p_pid, td->td_proc->p_comm); + /* + * We enter the switch with two runnable threads that both have + * the same lock. When we return td may be sleeping so we need + * to switch locks to make sure he's locked correctly. + */ + SCHED_STAT_INC(switch_preempt); mi_switch(SW_INVOL|SW_PREEMPT, td); + spinlock_enter(); + thread_unlock(ctd); + thread_lock(td); + spinlock_exit(); + return (1); } @@ -1824,7 +1860,7 @@ sched_add(struct thread *td, int flags) #endif ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); @@ -1834,8 +1870,15 @@ sched_add(struct thread *td, int flags) ("sched_add: bad thread state")); KASSERT(td->td_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); - KASSERT(ts->ts_runq == NULL, - ("sched_add: thread %p is still assigned to a run queue", td)); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. 
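The spinlock_enter()/spinlock_exit() pair wrapped around the lock swap after the preemptive mi_switch() above exists because the function returns holding its own thread lock but must hand back the preempting thread's lock; the critical section covers the instant where neither lock is held. The shape of that swap, with placeholder primitives (none of these are the kernel's):

	struct xthread;

	extern void example_critical_enter(void);
	extern void example_critical_exit(void);
	extern void example_thread_lock(struct xthread *);
	extern void example_thread_unlock(struct xthread *);

	static void
	example_swap_thread_locks(struct xthread *from, struct xthread *to)
	{

		example_critical_enter();	/* no preemption while unlocked */
		example_thread_unlock(from);
		example_thread_lock(to);
		example_critical_exit();
	}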
+ */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); TD_SET_RUNQ(td); tdq = TDQ_SELF(); class = PRI_BASE(td->td_pri_class); @@ -1920,7 +1963,7 @@ sched_rem(struct thread *td) CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); @@ -1942,7 +1985,7 @@ sched_pctcpu(struct thread *td) if (ts == NULL) return (0); - mtx_lock_spin(&sched_lock); + thread_lock(td); if (ts->ts_ticks) { int rtick; @@ -1952,7 +1995,7 @@ sched_pctcpu(struct thread *td) pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); return (pctcpu); } @@ -1962,7 +2005,7 @@ sched_bind(struct thread *td, int cpu) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; if (ts->ts_flags & TSF_BOUND) sched_unbind(td); @@ -1982,7 +2025,7 @@ sched_unbind(struct thread *td) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; if ((ts->ts_flags & TSF_BOUND) == 0) return; @@ -1995,18 +2038,19 @@ sched_unbind(struct thread *td) int sched_is_bound(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_sched->ts_flags & TSF_BOUND); } void sched_relinquish(struct thread *td) { - mtx_lock_spin(&sched_lock); + thread_lock(td); if (td->td_pri_class == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); + SCHED_STAT_INC(switch_relinquish); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } int @@ -2071,6 +2115,58 @@ sched_idletd(void *dummy) cpu_idle(); } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. 
+ */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, "Scheduler name"); @@ -2093,6 +2189,7 @@ SYSCTL_INT(_kern_sched, OID_AUTO, ipi_th SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0, ""); #endif /* ps compat */ Index: kern/subr_prof.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_prof.c,v retrieving revision 1.78 diff -u -p -r1.78 subr_prof.c --- kern/subr_prof.c 20 May 2007 22:11:49 -0000 1.78 +++ kern/subr_prof.c 31 May 2007 21:39:57 -0000 @@ -423,12 +423,12 @@ profil(td, uap) } PROC_LOCK(p); upp = &td->td_proc->p_stats->p_prof; - mtx_lock_spin(&time_lock); + PROC_SLOCK(p); upp->pr_off = uap->offset; upp->pr_scale = uap->scale; upp->pr_base = uap->samples; upp->pr_size = uap->size; - mtx_unlock_spin(&time_lock); + PROC_SUNLOCK(p); startprofclock(p); PROC_UNLOCK(p); @@ -468,22 +468,22 @@ addupc_intr(struct thread *td, uintfptr_ if (ticks == 0) return; prof = &td->td_proc->p_stats->p_prof; - mtx_lock_spin(&time_lock); + PROC_SLOCK(td->td_proc); if (pc < prof->pr_off || (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) { - mtx_unlock_spin(&time_lock); + PROC_SUNLOCK(td->td_proc); return; /* out of range; ignore */ } addr = prof->pr_base + i; - mtx_unlock_spin(&time_lock); + PROC_SUNLOCK(td->td_proc); if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) { td->td_profil_addr = pc; td->td_profil_ticks = ticks; td->td_pflags |= TDP_OWEUPC; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_ASTPENDING; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } @@ -511,12 +511,15 @@ addupc_task(struct thread *td, uintfptr_ } p->p_profthreads++; prof = &p->p_stats->p_prof; + PROC_SLOCK(p); if (pc < prof->pr_off || (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) { + PROC_SUNLOCK(p); goto out; } addr = prof->pr_base + i; + PROC_SUNLOCK(p); PROC_UNLOCK(p); if (copyin(addr, &v, sizeof(v)) == 0) { v += ticks; Index: kern/subr_sleepqueue.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_sleepqueue.c,v retrieving revision 1.36 diff -u -p -r1.36 subr_sleepqueue.c --- kern/subr_sleepqueue.c 18 May 2007 06:32:24 -0000 1.36 +++ kern/subr_sleepqueue.c 20 May 2007 11:40:27 -0000 @@ -329,7 +329,6 @@ sleepq_add(void *wchan, struct lock_obje } TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq); td->td_sleepqueue = NULL; - mtx_lock_spin(&sched_lock); td->td_sqqueue = queue; td->td_wchan = wchan; td->td_wmesg = wmesg; @@ -337,7 +336,6 @@ sleepq_add(void *wchan, struct lock_obje td->td_flags |= TDF_SINTR; td->td_flags &= ~TDF_SLEEPABORT; } - mtx_unlock_spin(&sched_lock); } /* @@ -362,7 +360,8 @@ sleepq_set_timeout(void *wchan, int timo /* * Marks the pending sleep of the current thread as interruptible and * makes an initial check for pending signals before putting a thread - * to sleep. Return with sleep queue and scheduler lock held. + * to sleep. Enters and exits with the thread lock held. Thread lock + * may have transitioned from the sleepq lock to a run lock. 
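The profiling buffer description moved above from time_lock to the per-process spin lock, so any reader of p_stats->p_prof is now expected to take PROC_SLOCK(). A hedged sketch of that pattern; example_snapshot_prof() is hypothetical and struct uprof is assumed to be the type behind p_prof:

	static void
	example_snapshot_prof(struct proc *p, struct uprof *out)
	{

		PROC_SLOCK(p);
		*out = p->p_stats->p_prof;
		PROC_SUNLOCK(p);
	}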
*/ static int sleepq_catch_signals(void *wchan) @@ -382,7 +381,6 @@ sleepq_catch_signals(void *wchan) CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)", (void *)td, (long)p->p_pid, p->p_comm); - MPASS(td->td_flags & TDF_SINTR); mtx_unlock_spin(&sc->sc_lock); /* See if there are any pending signals for this thread. */ @@ -401,39 +399,38 @@ sleepq_catch_signals(void *wchan) ret = ERESTART; mtx_unlock(&ps->ps_mtx); } - + /* + * Lock sleepq chain before unlocking proc + * without this, we could lose a race. + */ + mtx_lock_spin(&sc->sc_lock); + PROC_UNLOCK(p); + thread_lock(td); if (ret == 0) { - mtx_lock_spin(&sc->sc_lock); - /* - * Lock sched_lock before unlocking proc lock, - * without this, we could lose a race. - */ - mtx_lock_spin(&sched_lock); - PROC_UNLOCK(p); - if (!(td->td_flags & TDF_INTERRUPT)) + if (!(td->td_flags & TDF_INTERRUPT)) { + sleepq_switch(wchan); return (0); + } /* KSE threads tried unblocking us. */ ret = td->td_intrval; - mtx_unlock_spin(&sched_lock); - MPASS(ret == EINTR || ret == ERESTART); - } else { - PROC_UNLOCK(p); - mtx_lock_spin(&sc->sc_lock); + MPASS(ret == EINTR || ret == ERESTART || ret == EWOULDBLOCK); } /* * There were pending signals and this thread is still * on the sleep queue, remove it from the sleep queue. */ - sq = sleepq_lookup(wchan); - mtx_lock_spin(&sched_lock); - if (TD_ON_SLEEPQ(td)) + if (TD_ON_SLEEPQ(td)) { + sq = sleepq_lookup(wchan); sleepq_resume_thread(sq, td, -1); + } + mtx_unlock_spin(&sc->sc_lock); + MPASS(td->td_lock != &sc->sc_lock); return (ret); } /* - * Switches to another thread if we are still asleep on a sleep queue and - * drop the lock on the sleep queue chain. Returns with sched_lock held. + * Switches to another thread if we are still asleep on a sleep queue. + * Returns with thread lock. */ static void sleepq_switch(void *wchan) @@ -444,24 +441,18 @@ sleepq_switch(void *wchan) td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); - - /* - * If we have a sleep queue, then we've already been woken up, so - * just return. - */ + THREAD_LOCK_ASSERT(td, MA_OWNED); + /* We were removed */ if (td->td_sleepqueue != NULL) { - MPASS(!TD_ON_SLEEPQ(td)); mtx_unlock_spin(&sc->sc_lock); return; } + thread_lock_set(td, &sc->sc_lock); - /* - * Otherwise, actually go to sleep. - */ - mtx_unlock_spin(&sc->sc_lock); + MPASS(td->td_sleepqueue == NULL); sched_sleep(td); TD_SET_SLEEPING(td); + SCHED_STAT_INC(switch_sleepq); mi_switch(SW_VOL, NULL); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)", @@ -476,8 +467,8 @@ sleepq_check_timeout(void) { struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); td = curthread; + THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If TDF_TIMEOUT is set, we timed out. @@ -502,6 +493,7 @@ sleepq_check_timeout(void) else if (callout_stop(&td->td_slpcallout) == 0) { td->td_flags |= TDF_TIMEOUT; TD_SET_SLEEPING(td); + SCHED_STAT_INC(switch_sleepqtimo); mi_switch(SW_INVOL, NULL); } return (0); @@ -515,8 +507,8 @@ sleepq_check_signals(void) { struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); td = curthread; + THREAD_LOCK_ASSERT(td, MA_OWNED); /* We are no longer in an interruptible sleep. 
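After this change the sleep path never touches sched_lock: sleepq_switch() either finds the thread already resumed (it has a sleepqueue of its own again) and just drops the chain lock, or it adopts the chain lock as the thread lock for the duration of the switch. A stripped-down model of that decision with placeholder types; not the kernel code:

	typedef struct lock lock_t;
	extern void spin_unlock(lock_t *);

	struct xthread {
		lock_t * volatile td_lock;
		void	*td_sleepqueue;	/* non-NULL again once we were resumed */
	};

	/* Entered with both 'chainlock' and the thread's current lock held. */
	static void
	example_sleep_switch(struct xthread *td, lock_t *chainlock)
	{
		lock_t *old;

		if (td->td_sleepqueue != NULL) {
			/* A waker already pulled us off the queue; do not sleep. */
			spin_unlock(chainlock);
			return;
		}
		/* The chain lock becomes the thread lock while we are asleep. */
		old = td->td_lock;
		td->td_lock = chainlock;
		spin_unlock(old);
		/* ... mark the thread sleeping and mi_switch() here ... */
	}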
*/ if (td->td_flags & TDF_SINTR) @@ -539,11 +531,13 @@ sleepq_check_signals(void) void sleepq_wait(void *wchan) { + struct thread *td; - MPASS(!(curthread->td_flags & TDF_SINTR)); - mtx_lock_spin(&sched_lock); + td = curthread; + MPASS(!(td->td_flags & TDF_SINTR)); + thread_lock(td); sleepq_switch(wchan); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -557,12 +551,8 @@ sleepq_wait_sig(void *wchan) int rval; rcatch = sleepq_catch_signals(wchan); - if (rcatch == 0) - sleepq_switch(wchan); - else - sleepq_release(wchan); rval = sleepq_check_signals(); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); if (rcatch) return (rcatch); return (rval); @@ -575,13 +565,16 @@ sleepq_wait_sig(void *wchan) int sleepq_timedwait(void *wchan) { + struct thread *td; int rval; - MPASS(!(curthread->td_flags & TDF_SINTR)); - mtx_lock_spin(&sched_lock); + td = curthread; + MPASS(!(td->td_flags & TDF_SINTR)); + thread_lock(td); sleepq_switch(wchan); rval = sleepq_check_timeout(); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + return (rval); } @@ -595,13 +588,9 @@ sleepq_timedwait_sig(void *wchan) int rcatch, rvalt, rvals; rcatch = sleepq_catch_signals(wchan); - if (rcatch == 0) - sleepq_switch(wchan); - else - sleepq_release(wchan); rvalt = sleepq_check_timeout(); rvals = sleepq_check_signals(); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); if (rcatch) return (rcatch); if (rvals) @@ -622,9 +611,9 @@ sleepq_resume_thread(struct sleepqueue * MPASS(sq->sq_wchan != NULL); MPASS(td->td_wchan == sq->sq_wchan); MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0); + THREAD_LOCK_ASSERT(td, MA_OWNED); sc = SC_LOOKUP(sq->sq_wchan); mtx_assert(&sc->sc_lock, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); /* Remove the thread from the queue. */ TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq); @@ -714,10 +703,8 @@ sleepq_signal(void *wchan, int flags, in KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); - if (sq == NULL) { - sleepq_release(wchan); + if (sq == NULL) return; - } KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); @@ -733,10 +720,9 @@ sleepq_signal(void *wchan, int flags, in besttd = td; } MPASS(besttd != NULL); - mtx_lock_spin(&sched_lock); + thread_lock(besttd); sleepq_resume_thread(sq, besttd, pri); - mtx_unlock_spin(&sched_lock); - sleepq_release(wchan); + thread_unlock(besttd); } /* @@ -746,6 +732,7 @@ void sleepq_broadcast(void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; + struct thread *td; CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); @@ -759,11 +746,12 @@ sleepq_broadcast(void *wchan, int flags, ("%s: mismatch between sleep/wakeup and cv_*", __func__)); /* Resume all blocked threads on the sleep queue. 
*/ - mtx_lock_spin(&sched_lock); - while (!TAILQ_EMPTY(&sq->sq_blocked[queue])) - sleepq_resume_thread(sq, TAILQ_FIRST(&sq->sq_blocked[queue]), - pri); - mtx_unlock_spin(&sched_lock); + while (!TAILQ_EMPTY(&sq->sq_blocked[queue])) { + td = TAILQ_FIRST(&sq->sq_blocked[queue]); + thread_lock(td); + sleepq_resume_thread(sq, td, pri); + thread_unlock(td); + } sleepq_release(wchan); } @@ -774,6 +762,7 @@ sleepq_broadcast(void *wchan, int flags, static void sleepq_timeout(void *arg) { + struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; void *wchan; @@ -786,38 +775,29 @@ sleepq_timeout(void *arg) * First, see if the thread is asleep and get the wait channel if * it is. */ - mtx_lock_spin(&sched_lock); - if (TD_ON_SLEEPQ(td)) { + thread_lock(td); + if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { wchan = td->td_wchan; - mtx_unlock_spin(&sched_lock); - sleepq_lock(wchan); + sc = SC_LOOKUP(wchan); + MPASS(td->td_lock == &sc->sc_lock); sq = sleepq_lookup(wchan); - mtx_lock_spin(&sched_lock); - } else { - wchan = NULL; - sq = NULL; + MPASS(sq != NULL); + td->td_flags |= TDF_TIMEOUT; + sleepq_resume_thread(sq, td, -1); + thread_unlock(td); + return; } - /* - * At this point, if the thread is still on the sleep queue, - * we have that sleep queue locked as it cannot migrate sleep - * queues while we dropped sched_lock. If it had resumed and - * was on another CPU while the lock was dropped, it would have - * seen that TDF_TIMEOUT and TDF_TIMOFAIL are clear and the - * call to callout_stop() to stop this routine would have failed - * meaning that it would have already set TDF_TIMEOUT to - * synchronize with this function. + * If the thread is on the SLEEPQ but not sleeping and we have it + * locked it must be in sleepq_catch_signals(). Let it know we've + * timedout here so it can remove itself. */ if (TD_ON_SLEEPQ(td)) { - MPASS(td->td_wchan == wchan); - MPASS(sq != NULL); - td->td_flags |= TDF_TIMEOUT; - sleepq_resume_thread(sq, td, -1); - mtx_unlock_spin(&sched_lock); - sleepq_release(wchan); + td->td_flags |= TDF_TIMEOUT | TDF_INTERRUPT; + td->td_intrval = EWOULDBLOCK; + thread_unlock(td); return; - } else if (wchan != NULL) - sleepq_release(wchan); + } /* * Now check for the edge cases. First, if TDF_TIMEOUT is set, @@ -835,7 +815,7 @@ sleepq_timeout(void *arg) setrunnable(td); } else td->td_flags |= TDF_TIMOFAIL; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -855,33 +835,36 @@ sleepq_remove(struct thread *td, void *w MPASS(wchan != NULL); sleepq_lock(wchan); sq = sleepq_lookup(wchan); - mtx_lock_spin(&sched_lock); + /* + * We can not lock the thread here as it may be sleeping on a + * different sleepq. However, holding the sleepq lock for this + * wchan can guarantee that we do not miss a wakeup for this + * channel. The asserts below will catch any false positives. + */ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) { - mtx_unlock_spin(&sched_lock); sleepq_release(wchan); return; } - MPASS(sq != NULL); - /* Thread is asleep on sleep queue sq, so wake it up. */ + thread_lock(td); + MPASS(sq != NULL); + MPASS(td->td_wchan == wchan); sleepq_resume_thread(sq, td, -1); + thread_unlock(td); sleepq_release(wchan); - mtx_unlock_spin(&sched_lock); } /* * Abort a thread as if an interrupt had occurred. Only abort * interruptible waits (unfortunately it isn't safe to abort others). - * - * XXX: What in the world does the comment below mean? - * Also, whatever the signal code does... 
*/ void sleepq_abort(struct thread *td, int intrval) { + struct sleepqueue *sq; void *wchan; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_flags & TDF_SINTR); MPASS(intrval == EINTR || intrval == ERESTART); @@ -895,14 +878,22 @@ sleepq_abort(struct thread *td, int intr CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_proc->p_comm); + td->td_intrval = intrval; + td->td_flags |= TDF_SLEEPABORT; + /* + * If the thread has not slept yet it will find the signal in + * sleepq_catch_signals() and call sleepq_resume_thread. Otherwise + * we have to do it here. + */ + if (!TD_IS_SLEEPING(td)) + return; wchan = td->td_wchan; - if (wchan != NULL) { - td->td_intrval = intrval; - td->td_flags |= TDF_SLEEPABORT; - } - mtx_unlock_spin(&sched_lock); - sleepq_remove(td, wchan); - mtx_lock_spin(&sched_lock); + MPASS(wchan != NULL); + sq = sleepq_lookup(wchan); + MPASS(sq != NULL); + + /* Thread is asleep on sleep queue sq, so wake it up. */ + sleepq_resume_thread(sq, td, -1); } #ifdef DDB Index: kern/subr_smp.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_smp.c,v retrieving revision 1.198 diff -u -p -r1.198 subr_smp.c --- kern/subr_smp.c 8 Mar 2007 06:44:34 -0000 1.198 +++ kern/subr_smp.c 31 May 2007 21:06:57 -0000 @@ -159,7 +159,7 @@ forward_signal(struct thread *td) * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); @@ -187,8 +187,6 @@ forward_roundrobin(void) struct thread *td; cpumask_t id, map, me; - mtx_assert(&sched_lock, MA_OWNED); - CTR0(KTR_SMP, "forward_roundrobin()"); if (!smp_started || cold || panicstr) Index: kern/subr_taskqueue.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_taskqueue.c,v retrieving revision 1.38 diff -u -p -r1.38 subr_taskqueue.c --- kern/subr_taskqueue.c 23 Jan 2007 08:46:50 -0000 1.38 +++ kern/subr_taskqueue.c 18 May 2007 10:37:02 -0000 @@ -349,15 +349,15 @@ taskqueue_start_threads(struct taskqueue } else tq->tq_pcount++; } - mtx_lock_spin(&sched_lock); for (i = 0; i < count; i++) { if (tq->tq_pproc[i] == NULL) continue; td = FIRST_THREAD_IN_PROC(tq->tq_pproc[i]); + thread_lock(td); sched_prio(td, pri); sched_add(td, SRQ_BORING); + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); return (0); } Index: kern/subr_trap.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_trap.c,v retrieving revision 1.295 diff -u -p -r1.295 subr_trap.c --- kern/subr_trap.c 1 Jun 2007 01:20:11 -0000 1.295 +++ kern/subr_trap.c 31 May 2007 21:06:42 -0000 @@ -82,11 +82,11 @@ userret(struct thread *td, struct trapfr #ifdef DIAGNOSTIC /* Check that we called signotify() enough. 
*/ PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + thread_lock(td); if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 || (td->td_flags & TDF_ASTPENDING) == 0)) printf("failed to set signal flags properly for ast()\n"); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); PROC_UNLOCK(p); #endif @@ -163,7 +163,7 @@ ast(struct trapframe *framep) KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); - mtx_assert(&sched_lock, MA_NOTOWNED); + THREAD_LOCK_ASSERT(td, MA_NOTOWNED); td->td_frame = framep; td->td_pticks = 0; @@ -179,7 +179,7 @@ ast(struct trapframe *framep) * AST's saved in sflag, the astpending flag will be set and * ast() will be called again. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); flags = td->td_flags; sflag = p->p_sflag; if (p->p_sflag & (PS_ALRMPEND | PS_PROFPEND)) @@ -188,10 +188,12 @@ ast(struct trapframe *framep) if (p->p_sflag & PS_MACPEND) p->p_sflag &= ~PS_MACPEND; #endif + thread_lock(td); + PROC_SUNLOCK(p); td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDRESCHED | TDF_INTERRUPT); + thread_unlock(td); cnt.v_trap++; - mtx_unlock_spin(&sched_lock); /* * XXXKSE While the fact that we owe a user profiling @@ -239,10 +241,11 @@ ast(struct trapframe *framep) if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 1); #endif - mtx_lock_spin(&sched_lock); + thread_lock(td); sched_prio(td, td->td_user_pri); + SCHED_STAT_INC(switch_needresched); mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 1); Index: kern/subr_turnstile.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_turnstile.c,v retrieving revision 1.167 diff -u -p -r1.167 subr_turnstile.c --- kern/subr_turnstile.c 18 May 2007 06:32:24 -0000 1.167 +++ kern/subr_turnstile.c 31 May 2007 22:17:16 -0000 @@ -116,6 +116,7 @@ __FBSDID("$FreeBSD: src/sys/kern/subr_tu * q - td_contested lock */ struct turnstile { + struct mtx ts_lock; /* Spin lock for self. */ struct threadqueue ts_blocked[2]; /* (c + q) Blocked threads. */ struct threadqueue ts_pending; /* (c) Pending threads. */ LIST_ENTRY(turnstile) ts_hash; /* (c) Chain and free list. */ @@ -162,6 +163,7 @@ static void turnstile_setowner(struct tu static void turnstile_dtor(void *mem, int size, void *arg); #endif static int turnstile_init(void *mem, int size, int flags); +static void turnstile_fini(void *mem, int size); /* * Walks the chain of turnstiles and their owners to propagate the priority @@ -171,13 +173,20 @@ static int turnstile_init(void *mem, int static void propagate_priority(struct thread *td) { - struct turnstile_chain *tc; struct turnstile *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td->td_blocked; + MPASS(td->td_lock == &ts->ts_lock); + /* + * Grab a recursive lock on this turnstile chain so it stays locked + * for the whole operation. The caller expects us to return with + * the original lock held. We only ever lock down the chain so + * the lock order is constant. + */ + mtx_lock_spin(&ts->ts_lock); for (;;) { td = ts->ts_owner; @@ -186,9 +195,12 @@ propagate_priority(struct thread *td) * This might be a read lock with no owner. There's * not much we can do, so just bail. 
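The ast() hunk is where the split of the old sched_lock coverage is most visible: p_sflag stays with the process and moves under the new proc spin lock, td_flags moves under the thread lock, and the two are held briefly in that order so neither flag word is examined unlocked. A condensed sketch of that sequence for curthread; the profiling and MAC details of the real hunk are omitted:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static void
harvest_ast_flags_sketch(struct thread *td)     /* td == curthread */
{
        struct proc *p = td->td_proc;
        int sflag;

        PROC_SLOCK(p);                          /* p_sflag is proc-slock state now */
        sflag = p->p_sflag;
        p->p_sflag &= ~(PS_ALRMPEND | PS_PROFPEND);
        thread_lock(td);                        /* td_flags is thread-lock state */
        PROC_SUNLOCK(p);                        /* drop the slock once td is held */
        td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK |
            TDF_NEEDRESCHED | TDF_INTERRUPT);
        thread_unlock(td);
        (void)sflag;                            /* the real ast() acts on this */
}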
*/ + mtx_unlock_spin(&ts->ts_lock); return; } + thread_lock_flags(td, MTX_DUPOK); + mtx_unlock_spin(&ts->ts_lock); MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); @@ -213,8 +225,10 @@ propagate_priority(struct thread *td) * If this thread already has higher priority than the * thread that is being blocked, we are finished. */ - if (td->td_priority <= pri) + if (td->td_priority <= pri) { + thread_unlock(td); return; + } /* * Bump this thread's priority. @@ -227,6 +241,7 @@ propagate_priority(struct thread *td) */ if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) { MPASS(td->td_blocked == NULL); + thread_unlock(td); return; } @@ -251,15 +266,13 @@ propagate_priority(struct thread *td) */ ts = td->td_blocked; MPASS(ts != NULL); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_lock_spin(&tc->tc_lock); - + MPASS(td->td_lock == &ts->ts_lock); /* Resort td on the list if needed. */ if (!turnstile_adjust_thread(ts, td)) { - mtx_unlock_spin(&tc->tc_lock); + mtx_unlock_spin(&ts->ts_lock); return; } - mtx_unlock_spin(&tc->tc_lock); + /* The thread lock is released as ts lock above. */ } } @@ -270,17 +283,16 @@ propagate_priority(struct thread *td) static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td) { - struct turnstile_chain *tc; struct thread *td1, *td2; int queue; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_LOCK(td)); /* * This thread may not be blocked on this turnstile anymore * but instead might already be woken up on another CPU - * that is waiting on sched_lock in turnstile_unpend() to + * that is waiting on the thread lock in turnstile_unpend() to * finish waking this thread up. We can detect this case * by checking to see if this thread has been given a * turnstile by either turnstile_signal() or @@ -295,8 +307,7 @@ turnstile_adjust_thread(struct turnstile * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); + MPASS(td->td_lock == &ts->ts_lock); td1 = TAILQ_PREV(td, threadqueue, td_lockq); td2 = TAILQ_NEXT(td, td_lockq); if ((td1 != NULL && td->td_priority < td1->td_priority) || @@ -385,9 +396,10 @@ init_turnstile0(void *dummy) turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile), #ifdef INVARIANTS - NULL, turnstile_dtor, turnstile_init, NULL, UMA_ALIGN_CACHE, 0); + NULL, turnstile_dtor, turnstile_init, turnstile_fini, + UMA_ALIGN_CACHE, 0); #else - NULL, NULL, turnstile_init, NULL, UMA_ALIGN_CACHE, 0); + NULL, NULL, turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, 0); #endif thread0.td_turnstile = turnstile_alloc(); } @@ -400,10 +412,8 @@ SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDE void turnstile_adjust(struct thread *td, u_char oldpri) { - struct turnstile_chain *tc; struct turnstile *ts; - mtx_assert(&sched_lock, MA_OWNED); MPASS(TD_ON_LOCK(td)); /* @@ -411,15 +421,12 @@ turnstile_adjust(struct thread *td, u_ch */ ts = td->td_blocked; MPASS(ts != NULL); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_lock_spin(&tc->tc_lock); + MPASS(td->td_lock == &ts->ts_lock); + mtx_assert(&ts->ts_lock, MA_OWNED); /* Resort the turnstile on the list. */ - if (!turnstile_adjust_thread(ts, td)) { - mtx_unlock_spin(&tc->tc_lock); + if (!turnstile_adjust_thread(ts, td)) return; - } - /* * If our priority was lowered and we are at the head of the * turnstile, then propagate our new priority up the chain. 
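Two details above deserve a second look during review: ts_lock is created MTX_RECURSE because a blocked waiter's own td_lock is that same ts_lock, so the propagation code can legitimately take it again, and the owner is locked with MTX_DUPOK presumably because the owner may itself be blocked, making its td_lock another turnstile's ts_lock, a second lock of the same witness class. One propagation step, heavily simplified (struct turnstile internals are private to subr_turnstile.c, and the real loop keeps walking rather than unlocking):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/turnstile.h>

static void
propagate_step_sketch(struct turnstile *ts, u_char pri)
{
        struct thread *owner;

        mtx_assert(&ts->ts_lock, MA_OWNED);
        owner = ts->ts_owner;                   /* NULL for, e.g., a read lock */
        if (owner == NULL) {
                mtx_unlock_spin(&ts->ts_lock);
                return;
        }
        thread_lock_flags(owner, MTX_DUPOK);    /* may be another ts_lock */
        mtx_unlock_spin(&ts->ts_lock);
        if (owner->td_priority > pri)           /* smaller value == higher prio */
                sched_lend_prio(owner, pri);
        thread_unlock(owner);
}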
@@ -430,12 +437,8 @@ turnstile_adjust(struct thread *td, u_ch td->td_tsqueue == TS_SHARED_QUEUE); if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) && td->td_priority < oldpri) { - mtx_unlock_spin(&tc->tc_lock); - critical_enter(); propagate_priority(td); - critical_exit(); - } else - mtx_unlock_spin(&tc->tc_lock); + } } /* @@ -487,9 +490,19 @@ turnstile_init(void *mem, int size, int TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]); TAILQ_INIT(&ts->ts_pending); LIST_INIT(&ts->ts_free); + mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN | MTX_RECURSE); return (0); } +static void +turnstile_fini(void *mem, int size) +{ + struct turnstile *ts; + + ts = mem; + mtx_destroy(&ts->ts_lock); +} + /* * Get a turnstile for a new thread. */ @@ -514,12 +527,51 @@ turnstile_free(struct turnstile *ts) * Lock the turnstile chain associated with the specified lock. */ void -turnstile_lock(struct lock_object *lock) +turnstile_chain_lock(struct lock_object *lock) +{ + struct turnstile_chain *tc; + + tc = TC_LOOKUP(lock); + mtx_lock_spin(&tc->tc_lock); +} + +struct turnstile * +turnstile_trywait(struct lock_object *lock) { struct turnstile_chain *tc; + struct turnstile *ts; tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); + LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) + if (ts->ts_lockobj == lock) { + mtx_lock_spin(&ts->ts_lock); + return (ts); + } + + ts = curthread->td_turnstile; + MPASS(ts != NULL); + mtx_lock_spin(&ts->ts_lock); + KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer")); + ts->ts_lockobj = lock; + + return (ts); +} + +void +turnstile_cancel(struct turnstile *ts) +{ + struct turnstile_chain *tc; + struct lock_object *lock; + + mtx_assert(&ts->ts_lock, MA_OWNED); + + mtx_unlock_spin(&ts->ts_lock); + lock = ts->ts_lockobj; + if (ts == curthread->td_turnstile) + ts->ts_lockobj = NULL; + tc = TC_LOOKUP(lock); + mtx_unlock_spin(&tc->tc_lock); } /* @@ -536,8 +588,10 @@ turnstile_lookup(struct lock_object *loc tc = TC_LOOKUP(lock); mtx_assert(&tc->tc_lock, MA_OWNED); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) - if (ts->ts_lockobj == lock) + if (ts->ts_lockobj == lock) { + mtx_lock_spin(&ts->ts_lock); return (ts); + } return (NULL); } @@ -545,7 +599,7 @@ turnstile_lookup(struct lock_object *loc * Unlock the turnstile chain associated with a given lock. */ void -turnstile_release(struct lock_object *lock) +turnstile_chain_unlock(struct lock_object *lock) { struct turnstile_chain *tc; @@ -574,16 +628,13 @@ turnstile_first_waiter(struct turnstile * owner appropriately. */ void -turnstile_claim(struct lock_object *lock) +turnstile_claim(struct turnstile *ts) { - struct turnstile_chain *tc; - struct turnstile *ts; struct thread *td, *owner; + struct turnstile_chain *tc; - tc = TC_LOOKUP(lock); - mtx_assert(&tc->tc_lock, MA_OWNED); - ts = turnstile_lookup(lock); - MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); + MPASS(ts != curthread->td_turnstile); owner = curthread; mtx_lock_spin(&td_contested_lock); @@ -593,15 +644,18 @@ turnstile_claim(struct lock_object *lock td = turnstile_first_waiter(ts); MPASS(td != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); - mtx_unlock_spin(&tc->tc_lock); + MPASS(td->td_lock == &ts->ts_lock); /* * Update the priority of the new owner if needed. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(owner); if (td->td_priority < owner->td_priority) sched_lend_prio(owner, td->td_priority); - mtx_unlock_spin(&sched_lock); + thread_unlock(owner); + tc = TC_LOOKUP(ts->ts_lockobj); + mtx_unlock_spin(&ts->ts_lock); + mtx_unlock_spin(&tc->tc_lock); } /* @@ -611,31 +665,28 @@ turnstile_claim(struct lock_object *lock * turnstile chain locked and will return with it unlocked. */ void -turnstile_wait(struct lock_object *lock, struct thread *owner, int queue) +turnstile_wait(struct turnstile *ts, struct thread *owner, int queue) { struct turnstile_chain *tc; - struct turnstile *ts; struct thread *td, *td1; + struct lock_object *lock; td = curthread; - tc = TC_LOOKUP(lock); - mtx_assert(&tc->tc_lock, MA_OWNED); - MPASS(td->td_turnstile != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); if (queue == TS_SHARED_QUEUE) MPASS(owner != NULL); if (owner) MPASS(owner->td_proc->p_magic == P_MAGIC); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); - /* Look up the turnstile associated with the lock 'lock'. */ - ts = turnstile_lookup(lock); - /* * If the lock does not already have a turnstile, use this thread's * turnstile. Otherwise insert the current thread into the * turnstile already in use by this lock. */ - if (ts == NULL) { + tc = TC_LOOKUP(ts->ts_lockobj); + if (ts == td->td_turnstile) { + mtx_assert(&tc->tc_lock, MA_OWNED); #ifdef TURNSTILE_PROFILING tc->tc_depth++; if (tc->tc_depth > tc->tc_max_depth) { @@ -644,7 +695,7 @@ turnstile_wait(struct lock_object *lock, turnstile_max_depth = tc->tc_max_depth; } #endif - ts = td->td_turnstile; + tc = TC_LOOKUP(ts->ts_lockobj); LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash); KASSERT(TAILQ_EMPTY(&ts->ts_pending), ("thread's turnstile has pending threads")); @@ -654,8 +705,7 @@ turnstile_wait(struct lock_object *lock, ("thread's turnstile has shared waiters")); KASSERT(LIST_EMPTY(&ts->ts_free), ("thread's turnstile has a non-empty free list")); - KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer")); - ts->ts_lockobj = lock; + MPASS(ts->ts_lockobj != NULL); mtx_lock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); turnstile_setowner(ts, owner); @@ -674,58 +724,31 @@ turnstile_wait(struct lock_object *lock, MPASS(td->td_turnstile != NULL); LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash); } + thread_lock(td); + thread_lock_set(td, &ts->ts_lock); td->td_turnstile = NULL; - mtx_unlock_spin(&tc->tc_lock); - - mtx_lock_spin(&sched_lock); - /* - * Handle race condition where a thread on another CPU that owns - * lock 'lock' could have woken us in between us dropping the - * turnstile chain lock and acquiring the sched_lock. - */ - if (td->td_flags & TDF_TSNOBLOCK) { - td->td_flags &= ~TDF_TSNOBLOCK; - mtx_unlock_spin(&sched_lock); - return; - } - -#ifdef notyet - /* - * If we're borrowing an interrupted thread's VM context, we - * must clean up before going to sleep. - */ - if (td->td_ithd != NULL) { - struct ithd *it = td->td_ithd; - - if (it->it_interrupted) { - if (LOCK_LOG_TEST(lock, 0)) - CTR3(KTR_LOCK, "%s: %p interrupted %p", - __func__, it, it->it_interrupted); - intr_thd_fixup(it); - } - } -#endif /* Save who we are blocked on and switch. 
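For lock implementations the old turnstile_lock()/turnstile_release() pair becomes a three-step dance: turnstile_trywait() pins the chain and a turnstile (or the caller's spare) before the final state check, turnstile_cancel() backs out if the lock was obtained after all, and turnstile_wait() blocks with the turnstile lock becoming the blocked thread's td_lock. A sketch of a consumer of the new interface, shaped after the mutex code rather than anything in this diff; try_acquire() and lock_owner() are hypothetical stand-ins for the lock's own fast path and owner lookup:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/turnstile.h>

static int try_acquire(struct lock_object *lo);                 /* hypothetical */
static struct thread *lock_owner(struct lock_object *lo);       /* hypothetical */

static void
contested_acquire_sketch(struct lock_object *lo)
{
        struct turnstile *ts;

        for (;;) {
                if (try_acquire(lo))
                        return;
                ts = turnstile_trywait(lo);     /* chain + turnstile locked */
                if (try_acquire(lo)) {
                        turnstile_cancel(ts);   /* won the race, back out */
                        return;
                }
                /* Sleeps; ts_lock is handed over as our thread lock. */
                turnstile_wait(ts, lock_owner(lo), TS_EXCLUSIVE_QUEUE);
        }
}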
*/ + lock = ts->ts_lockobj; td->td_tsqueue = queue; td->td_blocked = ts; td->td_lockname = lock->lo_name; TD_SET_LOCK(td); - critical_enter(); + mtx_unlock_spin(&tc->tc_lock); propagate_priority(td); - critical_exit(); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); + MPASS(td->td_lock == &ts->ts_lock); + SCHED_STAT_INC(switch_turnstile); mi_switch(SW_VOL, NULL); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); - - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -740,11 +763,10 @@ turnstile_signal(struct turnstile *ts, i int empty; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL)); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* @@ -766,6 +788,8 @@ turnstile_signal(struct turnstile *ts, i empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]); if (empty) { + tc = TC_LOOKUP(ts->ts_lockobj); + mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(LIST_EMPTY(&ts->ts_free)); #ifdef TURNSTILE_PROFILING tc->tc_depth--; @@ -791,9 +815,14 @@ turnstile_broadcast(struct turnstile *ts struct thread *td; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL)); + /* + * We must have the chain locked so that we can remove the empty + * turnstile from the hash queue. + */ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); @@ -833,15 +862,14 @@ void turnstile_unpend(struct turnstile *ts, int owner_type) { TAILQ_HEAD( ,thread) pending_threads; - struct turnstile_chain *tc; + struct turnstile *nts; struct thread *td; u_char cp, pri; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread || (owner_type == TS_SHARED_LOCK && ts->ts_owner == NULL)); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(!TAILQ_EMPTY(&ts->ts_pending)); /* @@ -855,7 +883,15 @@ turnstile_unpend(struct turnstile *ts, i TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])) ts->ts_lockobj = NULL; #endif - + /* + * Adjust the priority of curthread based on other contested + * locks it owns. Don't lower the priority below the base + * priority however. + */ + td = curthread; + pri = PRI_MAX; + thread_lock(td); + mtx_lock_spin(&td_contested_lock); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will @@ -864,31 +900,17 @@ turnstile_unpend(struct turnstile *ts, i * lock. */ if (ts->ts_owner != NULL) { - mtx_lock_spin(&td_contested_lock); ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); - mtx_unlock_spin(&td_contested_lock); } - critical_enter(); - mtx_unlock_spin(&tc->tc_lock); - - /* - * Adjust the priority of curthread based on other contested - * locks it owns. Don't lower the priority below the base - * priority however. 
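On the release side the chain lock stays in the picture because an emptied turnstile still has to come off the chain's hash list, which is what the comment added to turnstile_broadcast() above is pointing at. Roughly how an unlock slow path strings the new calls together, again shaped after the mutex code and not taken from this diff; release_lock_state() is a hypothetical placeholder for the lock's own release store:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/turnstile.h>

static void release_lock_state(struct lock_object *lo);         /* hypothetical */

static void
contested_release_sketch(struct lock_object *lo)
{
        struct turnstile *ts;

        turnstile_chain_lock(lo);
        ts = turnstile_lookup(lo);              /* now returns with ts_lock held */
        if (ts == NULL) {                       /* waiters already drained */
                turnstile_chain_unlock(lo);
                return;
        }
        turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
        release_lock_state(lo);                 /* hand the lock itself over */
        turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); /* drops ts_lock, wakes waiters */
        turnstile_chain_unlock(lo);
}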
- */ - td = curthread; - pri = PRI_MAX; - mtx_lock_spin(&sched_lock); - mtx_lock_spin(&td_contested_lock); - LIST_FOREACH(ts, &td->td_contested, ts_link) { - cp = turnstile_first_waiter(ts)->td_priority; + LIST_FOREACH(nts, &td->td_contested, ts_link) { + cp = turnstile_first_waiter(nts)->td_priority; if (cp < pri) pri = cp; } mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); - + thread_unlock(td); /* * Wake up all the pending threads. If a thread is not blocked * on a lock, then it is currently executing on another CPU in @@ -899,23 +921,21 @@ turnstile_unpend(struct turnstile *ts, i while (!TAILQ_EMPTY(&pending_threads)) { td = TAILQ_FIRST(&pending_threads); TAILQ_REMOVE(&pending_threads, td, td_lockq); + thread_lock(td); + MPASS(td->td_lock == &ts->ts_lock); MPASS(td->td_proc->p_magic == P_MAGIC); - if (TD_ON_LOCK(td)) { - td->td_blocked = NULL; - td->td_lockname = NULL; + MPASS(TD_ON_LOCK(td)); + TD_CLR_LOCK(td); + MPASS(TD_CAN_RUN(td)); + td->td_blocked = NULL; + td->td_lockname = NULL; #ifdef INVARIANTS - td->td_tsqueue = 0xff; + td->td_tsqueue = 0xff; #endif - TD_CLR_LOCK(td); - MPASS(TD_CAN_RUN(td)); - sched_add(td, SRQ_BORING); - } else { - td->td_flags |= TDF_TSNOBLOCK; - MPASS(TD_IS_RUNNING(td) || TD_ON_RUNQ(td)); - } + sched_add(td, SRQ_BORING); + thread_unlock(td); } - critical_exit(); - mtx_unlock_spin(&sched_lock); + mtx_unlock_spin(&ts->ts_lock); } /* @@ -925,14 +945,12 @@ turnstile_unpend(struct turnstile *ts, i void turnstile_disown(struct turnstile *ts) { - struct turnstile_chain *tc; struct thread *td; u_char cp, pri; MPASS(ts != NULL); + mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(TAILQ_EMPTY(&ts->ts_pending)); MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) || !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); @@ -947,7 +965,6 @@ turnstile_disown(struct turnstile *ts) ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); mtx_unlock_spin(&td_contested_lock); - mtx_unlock_spin(&tc->tc_lock); /* * Adjust the priority of curthread based on other contested @@ -956,7 +973,8 @@ turnstile_disown(struct turnstile *ts) */ td = curthread; pri = PRI_MAX; - mtx_lock_spin(&sched_lock); + thread_lock(td); + mtx_unlock_spin(&ts->ts_lock); mtx_lock_spin(&td_contested_lock); LIST_FOREACH(ts, &td->td_contested, ts_link) { cp = turnstile_first_waiter(ts)->td_priority; @@ -965,7 +983,7 @@ turnstile_disown(struct turnstile *ts) } mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } /* @@ -975,12 +993,10 @@ struct thread * turnstile_head(struct turnstile *ts, int queue) { #ifdef INVARIANTS - struct turnstile_chain *tc; MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); + mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_FIRST(&ts->ts_blocked[queue])); } @@ -992,12 +1008,10 @@ int turnstile_empty(struct turnstile *ts, int queue) { #ifdef INVARIANTS - struct turnstile_chain *tc; MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); - tc = TC_LOOKUP(ts->ts_lockobj); - mtx_assert(&tc->tc_lock, MA_OWNED); + mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_EMPTY(&ts->ts_blocked[queue])); } Index: kern/subr_witness.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/subr_witness.c,v retrieving revision 1.232 diff -u 
-p -r1.232 subr_witness.c --- kern/subr_witness.c 29 May 2007 18:55:41 -0000 1.232 +++ kern/subr_witness.c 31 May 2007 21:08:09 -0000 @@ -404,9 +404,12 @@ static struct witness_order_list_entry o #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif + { "process slock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, - { "sched lock", &lock_class_mtx_spin }, + { "umtx lock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, + { "turnstile lock", &lock_class_mtx_spin }, + { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, @@ -429,7 +432,8 @@ static struct witness_order_list_entry o #endif { "clk", &lock_class_mtx_spin }, { "mutex profiling lock", &lock_class_mtx_spin }, - { "kse zombie lock", &lock_class_mtx_spin }, + { "kse lock", &lock_class_mtx_spin }, + { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #ifdef __ia64__ { "MCA spin lock", &lock_class_mtx_spin }, @@ -446,6 +450,7 @@ static struct witness_order_list_entry o #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif + { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; @@ -1961,10 +1966,10 @@ witness_list(struct thread *td) * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * - * That still wouldn't really fix this unless we locked sched_lock - * or stopped the other CPU to make sure it wasn't changing the list - * out from under us. It is probably best to just not try to handle - * threads on other CPU's for now. + * That still wouldn't really fix this unless we locked the scheduler + * lock or stopped the other CPU to make sure it wasn't changing the + * list out from under us. It is probably best to just not try to + * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks)); Index: kern/sys_generic.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sys_generic.c,v retrieving revision 1.156 diff -u -p -r1.156 sys_generic.c --- kern/sys_generic.c 1 May 2007 06:35:54 -0000 1.156 +++ kern/sys_generic.c 18 May 2007 10:37:02 -0000 @@ -722,9 +722,9 @@ kern_select(struct thread *td, int nd, f mtx_lock(&sellock); retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); error = selscan(td, ibits, obits, nd); @@ -747,12 +747,12 @@ retry: * collisions and rescan the file descriptors if * necessary. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait_sig(&selwait, &sellock, timo); @@ -764,9 +764,9 @@ retry: done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_nosellock: @@ -896,9 +896,9 @@ poll(td, uap) mtx_lock(&sellock); retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); error = pollscan(td, bits, nfds); @@ -919,12 +919,12 @@ retry: * sellock, so check TDF_SELECT and the number of collisions * and rescan the file descriptors if necessary. */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait_sig(&selwait, &sellock, timo); @@ -936,9 +936,9 @@ retry: done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_nosellock: @@ -1109,9 +1109,9 @@ doselwakeup(sip, pri) } TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); sip->si_thread = NULL; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); sleepq_remove(td, &selwait); mtx_unlock(&sellock); } Index: kern/sys_process.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/sys_process.c,v retrieving revision 1.142 diff -u -p -r1.142 sys_process.c --- kern/sys_process.c 4 Mar 2007 22:36:46 -0000 1.142 +++ kern/sys_process.c 18 May 2007 10:37:02 -0000 @@ -527,12 +527,12 @@ kern_ptrace(struct thread *td, int req, sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_tid == pid) break; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (td2 != NULL) break; /* proc lock held */ PROC_UNLOCK(p); @@ -701,15 +701,15 @@ kern_ptrace(struct thread *td, int req, break; case PT_SUSPEND: - mtx_lock_spin(&sched_lock); + thread_lock(td2); td2->td_flags |= TDF_DBSUSPEND; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); break; case PT_RESUME: - mtx_lock_spin(&sched_lock); + thread_lock(td2); td2->td_flags &= ~TDF_DBSUSPEND; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); break; case PT_STEP: @@ -780,32 +780,35 @@ kern_ptrace(struct thread *td, int req, proctree_locked = 0; } /* deliver or queue signal */ - mtx_lock_spin(&sched_lock); + thread_lock(td2); td2->td_flags &= ~TDF_XSIG; - mtx_unlock_spin(&sched_lock); + thread_unlock(td2); td2->td_xsig = data; p->p_xstat = data; p->p_xthread = NULL; if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) { - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (req == PT_DETACH) { struct thread *td3; - FOREACH_THREAD_IN_PROC(p, td3) + FOREACH_THREAD_IN_PROC(p, td3) { + thread_lock(td3); td3->td_flags &= ~TDF_DBSUSPEND; + thread_unlock(td3); + } } /* * unsuspend all threads, to not let a thread run, * you should use PT_SUSPEND to suspend it before * continuing process. 
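Acting on another process's threads now has two levels: the process spin lock keeps the p_threads list stable (the (j) key is redefined to mean the proc slock in the sys/proc.h hunk further down), and each thread's own lock covers the td_flags update, exactly as the PT_DETACH loop above does. The pattern by itself:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static void
set_dbsuspend_sketch(struct proc *p, int on)
{
        struct thread *td;

        PROC_LOCK_ASSERT(p, MA_OWNED);
        PROC_SLOCK(p);                          /* freezes the thread list */
        FOREACH_THREAD_IN_PROC(p, td) {
                thread_lock(td);
                if (on)
                        td->td_flags |= TDF_DBSUSPEND;
                else
                        td->td_flags &= ~TDF_DBSUSPEND;
                thread_unlock(td);
        }
        PROC_SUNLOCK(p);
}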
*/ - mtx_unlock_spin(&sched_lock); #ifdef KSE + PROC_SUNLOCK(p); thread_continued(p); + PROC_SLOCK(p); #endif p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED); - mtx_lock_spin(&sched_lock); thread_unsuspend(p); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } if (data) @@ -968,13 +971,13 @@ kern_ptrace(struct thread *td, int req, buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK); tmp = 0; PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (tmp >= num) break; buf[tmp++] = td2->td_tid; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); error = copyout(buf, addr, tmp * sizeof(lwpid_t)); free(buf, M_TEMP); Index: kern/tty.c =================================================================== RCS file: /usr/home/ncvs/src/sys/kern/tty.c,v retrieving revision 1.268 diff -u -p -r1.268 tty.c --- kern/tty.c 20 Dec 2006 02:49:59 -0000 1.268 +++ kern/tty.c 18 May 2007 10:37:02 -0000 @@ -147,7 +147,9 @@ static struct cdevsw ttys_cdevsw = { .d_flags = D_TTY | D_NEEDGIANT, }; -static int proc_compare(struct proc *p1, struct proc *p2); +static int proc_sum(struct proc *, int *); +static int proc_compare(struct proc *, struct proc *); +static int thread_compare(struct thread *, struct thread *); static int ttnread(struct tty *tp); static void ttyecho(int c, struct tty *tp); static int ttyoutput(int c, struct tty *tp); @@ -2528,7 +2530,7 @@ ttyinfo(struct tty *tp) { struct timeval utime, stime; struct proc *p, *pick; - struct thread *td; + struct thread *td, *picktd; const char *stateprefix, *state; long rss; int load, pctcpu; @@ -2566,21 +2568,25 @@ ttyinfo(struct tty *tp) /* * Pick the most interesting process and copy some of its - * state for printing later. sched_lock must be held for - * most parts of this. Holding it throughout is simplest - * and prevents even unimportant inconsistencies in the - * copy of the state, but may increase interrupt latency - * too much. + * state for printing later. This operation could rely on stale + * data as we can't hold the proc slock or thread locks over the + * whole list. However, we're guaranteed not to reference an exited + * thread or proc since we hold the tty locked. */ pick = NULL; - mtx_lock_spin(&sched_lock); LIST_FOREACH(p, &tp->t_pgrp->pg_members, p_pglist) if (proc_compare(pick, p)) pick = p; - /*^T can only show state for 1 thread. just pick the first. */ + PROC_SLOCK(pick); + picktd = NULL; td = FIRST_THREAD_IN_PROC(pick); + FOREACH_THREAD_IN_PROC(pick, td) + if (thread_compare(picktd, td)) + picktd = td; + td = picktd; stateprefix = ""; + thread_lock(td); if (TD_IS_RUNNING(td)) state = "running"; else if (TD_ON_RUNQ(td) || TD_CAN_RUN(td)) @@ -2601,11 +2607,12 @@ ttyinfo(struct tty *tp) else state = "unknown"; pctcpu = (sched_pctcpu(td) * 10000 + FSCALE / 2) >> FSHIFT; + thread_unlock(td); if (pick->p_state == PRS_NEW || pick->p_state == PRS_ZOMBIE) rss = 0; else rss = pgtok(vmspace_resident_count(pick->p_vmspace)); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(pick); PROC_LOCK(pick); PGRP_UNLOCK(tp->t_pgrp); calcru(pick, &utime, &stime); @@ -2636,18 +2643,6 @@ ttyinfo(struct tty *tp) * we pick out just "short-term" sleepers (P_SINTR == 0). * 4) Further ties are broken by picking the highest pid. 
*/ -#define ISRUN(p, val) \ -do { \ - struct thread *td; \ - val = 0; \ - FOREACH_THREAD_IN_PROC(p, td) { \ - if (TD_ON_RUNQ(td) || \ - TD_IS_RUNNING(td)) { \ - val = 1; \ - break; \ - } \ - } \ -} while (0) #define TESTAB(a, b) ((a)<<1 | (b)) #define ONLYA 2 @@ -2655,69 +2650,134 @@ do { \ #define BOTH 3 static int -proc_compare(struct proc *p1, struct proc *p2) +proc_sum(struct proc *p, int *estcpup) { - - int esta, estb; struct thread *td; - mtx_assert(&sched_lock, MA_OWNED); - if (p1 == NULL) + int estcpu; + int val; + + val = 0; + estcpu = 0; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + if (TD_ON_RUNQ(td) || + TD_IS_RUNNING(td)) + val = 1; + estcpu += sched_pctcpu(td); + thread_unlock(td); + } + *estcpup = estcpu; + + return (val); +} + +static int +thread_compare(struct thread *td, struct thread *td2) +{ + int runa, runb; + int slpa, slpb; + fixpt_t esta, estb; + + if (td == NULL) return (1); - ISRUN(p1, esta); - ISRUN(p2, estb); - + /* + * Fetch running stats, pctcpu usage, and interruptable flag. + */ + thread_lock(td); + runa = TD_IS_RUNNING(td) | TD_ON_RUNQ(td); + slpa = td->td_flags & TDF_SINTR; + esta = sched_pctcpu(td); + thread_unlock(td); + thread_lock(td2); + runb = TD_IS_RUNNING(td2) | TD_ON_RUNQ(td2); + estb = sched_pctcpu(td2); + slpb = td2->td_flags & TDF_SINTR; + thread_unlock(td2); /* * see if at least one of them is runnable */ - switch (TESTAB(esta, estb)) { + switch (TESTAB(runa, runb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: - /* - * tie - favor one with highest recent cpu utilization - */ - esta = estb = 0; - FOREACH_THREAD_IN_PROC(p1, td) - esta += td->td_estcpu; - FOREACH_THREAD_IN_PROC(p2, td) - estb += td->td_estcpu; - if (estb > esta) - return (1); - if (esta > estb) - return (0); - return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + break; } /* - * weed out zombies + * favor one with highest recent cpu utilization */ - switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { - case ONLYA: + if (estb > esta) return (1); - case ONLYB: + if (esta > estb) + return (0); + /* + * favor one sleeping in a non-interruptible sleep + */ + switch (TESTAB(slpa, slpb)) { + case ONLYA: return (0); + case ONLYB: + return (1); case BOTH: - return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + break; } -#if 0 /* XXXKSE */ + return (td < td2); +} + +static int +proc_compare(struct proc *p1, struct proc *p2) +{ + + int runa, runb; + fixpt_t esta, estb; + + if (p1 == NULL) + return (1); + /* - * pick the one with the smallest sleep time + * Fetch various stats about these processes. After we drop the + * lock the information could be stale but the race is unimportant. 
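proc_sum() and thread_compare() only need a self-consistent snapshot per thread, not a globally consistent picture, so they sample under each thread's lock and tolerate staleness, as the comments in this hunk say. The sampling idiom reduced to its core:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>

static fixpt_t
sample_pctcpu_sketch(struct proc *p)
{
        struct thread *td;
        fixpt_t sum;

        sum = 0;
        PROC_SLOCK(p);                          /* thread list stays put */
        FOREACH_THREAD_IN_PROC(p, td) {
                thread_lock(td);
                sum += sched_pctcpu(td);        /* per-thread, possibly stale */
                thread_unlock(td);
        }
        PROC_SUNLOCK(p);
        return (sum);
}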
*/ - if (p2->p_slptime > p1->p_slptime) + PROC_SLOCK(p1); + runa = proc_sum(p1, &esta); + PROC_SUNLOCK(p1); + PROC_SLOCK(p2); + runb = proc_sum(p2, &estb); + PROC_SUNLOCK(p2); + + /* + * see if at least one of them is runnable + */ + switch (TESTAB(runa, runb)) { + case ONLYA: return (0); - if (p1->p_slptime > p2->p_slptime) + case ONLYB: return (1); + case BOTH: + break; + } /* - * favor one sleeping in a non-interruptible sleep + * favor one with highest recent cpu utilization */ - if (p1->p_sflag & PS_SINTR && (p2->p_sflag & PS_SINTR) == 0) + if (estb > esta) return (1); - if (p2->p_sflag & PS_SINTR && (p1->p_sflag & PS_SINTR) == 0) + if (esta > estb) return (0); -#endif + /* + * weed out zombies + */ + switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { + case ONLYA: + return (1); + case ONLYB: + return (0); + case BOTH: + break; + } + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } Index: netncp/ncp_sock.c =================================================================== RCS file: /usr/home/ncvs/src/sys/netncp/ncp_sock.c,v retrieving revision 1.18 diff -u -p -r1.18 ncp_sock.c --- netncp/ncp_sock.c 27 Feb 2007 17:23:29 -0000 1.18 +++ netncp/ncp_sock.c 18 May 2007 10:37:02 -0000 @@ -189,9 +189,9 @@ ncp_poll(struct socket *so, int events) /* Fake up enough state to look like we are in poll(2). */ mtx_lock(&sellock); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); TAILQ_INIT(&td->td_selq); @@ -200,9 +200,9 @@ ncp_poll(struct socket *so, int events) /* Tear down the fake poll(2) state. */ mtx_lock(&sellock); clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); return (revents); @@ -229,9 +229,9 @@ ncp_sock_rselect(struct socket *so, stru retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); TAILQ_INIT(&td->td_selq); @@ -257,12 +257,12 @@ retry: * the process, test TDF_SELECT and rescan file descriptors if * necessary. */ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait(&selwait, &sellock, timo); @@ -274,9 +274,9 @@ retry: done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_noproclock: Index: netsmb/smb_trantcp.c =================================================================== RCS file: /usr/home/ncvs/src/sys/netsmb/smb_trantcp.c,v retrieving revision 1.24 diff -u -p -r1.24 smb_trantcp.c --- netsmb/smb_trantcp.c 3 Aug 2006 15:31:52 -0000 1.24 +++ netsmb/smb_trantcp.c 18 May 2007 10:37:02 -0000 @@ -115,9 +115,9 @@ nbssn_rselect(struct nbpcb *nbp, struct retry: ncoll = nselcoll; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags |= TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); /* XXX: Should be done when the thread is initialized. */ @@ -144,12 +144,12 @@ retry: * the process, test P_SELECT and rescan file descriptors if * necessary. 
*/ - mtx_lock_spin(&sched_lock); + thread_lock(td); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - mtx_unlock_spin(&sched_lock); + thread_unlock(td); goto retry; } - mtx_unlock_spin(&sched_lock); + thread_unlock(td); if (timo > 0) error = cv_timedwait(&selwait, &sellock, timo); @@ -161,9 +161,9 @@ retry: done: clear_selinfo_list(td); - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_flags &= ~TDF_SELECT; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); mtx_unlock(&sellock); done_noproclock: Index: pc98/pc98/machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/pc98/pc98/machdep.c,v retrieving revision 1.393 diff -u -p -r1.393 machdep.c --- pc98/pc98/machdep.c 31 May 2007 22:52:13 -0000 1.393 +++ pc98/pc98/machdep.c 31 May 2007 20:40:24 -0000 @@ -1055,9 +1055,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * #ifdef SMP /* Schedule ourselves on the indicated cpu. */ - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_bind(curthread, cpu_id); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. */ @@ -1068,9 +1068,9 @@ cpu_est_clockrate(int cpu_id, uint64_t * intr_restore(reg); #ifdef SMP - mtx_lock_spin(&sched_lock); + thread_lock(curthread); sched_unbind(curthread); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); #endif /* Index: powerpc/powerpc/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/powerpc/powerpc/vm_machdep.c,v retrieving revision 1.113 diff -u -p -r1.113 vm_machdep.c --- powerpc/powerpc/vm_machdep.c 28 Dec 2006 23:56:50 -0000 1.113 +++ powerpc/powerpc/vm_machdep.c 31 May 2007 21:25:28 -0000 @@ -154,7 +154,7 @@ cpu_fork(struct thread *td1, struct proc pcb->pcb_lr = (register_t)fork_trampoline; pcb->pcb_usr = kernel_pmap->pm_sr[USER_SR]; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_msr = PSL_KERNSET; @@ -327,7 +327,7 @@ cpu_set_upcall(struct thread *td, struct pcb2->pcb_lr = (register_t)fork_trampoline; pcb2->pcb_usr = kernel_pmap->pm_sr[USER_SR]; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). 
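Temporarily binding curthread to a CPU, as cpu_est_clockrate() does in the machdep hunks, is bracketed by the current thread's own lock instead of sched_lock. The same bracketing on its own; cpu is whatever CPU id the caller wants:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>

static void
run_on_cpu_sketch(int cpu)
{
        thread_lock(curthread);
        sched_bind(curthread, cpu);     /* may migrate us to 'cpu' */
        thread_unlock(curthread);

        /* ... per-CPU work, e.g. the delay calibration above ... */

        thread_lock(curthread);
        sched_unbind(curthread);
        thread_unlock(curthread);
}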
*/ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_msr = PSL_KERNSET; } Index: security/mac_lomac/mac_lomac.c =================================================================== RCS file: /usr/home/ncvs/src/sys/security/mac_lomac/mac_lomac.c,v retrieving revision 1.50 diff -u -p -r1.50 mac_lomac.c --- security/mac_lomac/mac_lomac.c 23 Apr 2007 13:36:53 -0000 1.50 +++ security/mac_lomac/mac_lomac.c 18 May 2007 10:37:02 -0000 @@ -536,10 +536,10 @@ maybe_demote(struct mac_lomac *subjlabel subj->mac_lomac.ml_rangelow = objlabel->ml_single; subj->mac_lomac.ml_rangehigh = objlabel->ml_single; subj->mac_lomac.ml_flags |= MAC_LOMAC_FLAG_UPDATE; - mtx_lock_spin(&sched_lock); + thread_lock(curthread); curthread->td_flags |= TDF_ASTPENDING; curthread->td_proc->p_sflag |= PS_MACPEND; - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); /* * Avoid memory allocation while holding a mutex; cache the Index: sparc64/sparc64/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sparc64/sparc64/mp_machdep.c,v retrieving revision 1.34 diff -u -p -r1.34 mp_machdep.c --- sparc64/sparc64/mp_machdep.c 20 May 2007 14:49:01 -0000 1.34 +++ sparc64/sparc64/mp_machdep.c 23 May 2007 15:29:29 -0000 @@ -364,12 +364,8 @@ cpu_mp_bootstrap(struct pcpu *pc) while (csa->csa_count != 0) ; - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - spinlock_exit(); - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - cpu_throw(NULL, choosethread()); /* doesn't return */ + /* ok, now enter the scheduler */ + sched_throw(NULL); } void Index: sparc64/sparc64/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sparc64/sparc64/vm_machdep.c,v retrieving revision 1.74 diff -u -p -r1.74 vm_machdep.c --- sparc64/sparc64/vm_machdep.c 10 Jul 2005 23:31:11 -0000 1.74 +++ sparc64/sparc64/vm_machdep.c 31 May 2007 21:26:16 -0000 @@ -171,7 +171,7 @@ cpu_set_upcall(struct thread *td, struct pcb->pcb_pc = (u_long)fork_trampoline - 8; pcb->pcb_sp = (u_long)fr - SPOFF; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release the spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_pil = 0; } @@ -298,7 +298,7 @@ cpu_fork(struct thread *td1, struct proc pcb2->pcb_sp = (u_long)fp - SPOFF; pcb2->pcb_pc = (u_long)fork_trampoline - 8; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release the spin count in fork_exit(). 
*/ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_pil = 0; Index: sun4v/sun4v/mp_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sun4v/sun4v/mp_machdep.c,v retrieving revision 1.6 diff -u -p -r1.6 mp_machdep.c --- sun4v/sun4v/mp_machdep.c 2 Feb 2007 05:00:21 -0000 1.6 +++ sun4v/sun4v/mp_machdep.c 31 May 2007 21:26:53 -0000 @@ -403,13 +403,8 @@ cpu_mp_bootstrap(struct pcpu *pc) while (csa->csa_count != 0) ; - /* ok, now grab sched_lock and enter the scheduler */ - mtx_lock_spin(&sched_lock); - spinlock_exit(); - PCPU_SET(switchtime, cpu_ticks()); - PCPU_SET(switchticks, ticks); - - cpu_throw(NULL, choosethread()); /* doesn't return */ + /* ok, now enter the scheduler */ + sched_throw(NULL); } void @@ -460,13 +455,12 @@ cpu_ipi_preempt(struct trapframe *tf) { struct thread *running_thread = curthread; - mtx_lock_spin(&sched_lock); + thread_lock(running_thread); if (running_thread->td_critnest > 1) running_thread->td_owepreempt = 1; else mi_switch(SW_INVOL | SW_PREEMPT, NULL); - mtx_unlock_spin(&sched_lock); - + thread_unlock(running_thread); } void Index: sun4v/sun4v/trap.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sun4v/sun4v/trap.c,v retrieving revision 1.12 diff -u -p -r1.12 trap.c --- sun4v/sun4v/trap.c 25 May 2007 01:21:40 -0000 1.12 +++ sun4v/sun4v/trap.c 31 May 2007 21:27:07 -0000 @@ -712,6 +712,5 @@ syscall(struct trapframe *tf) WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); - mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } Index: sun4v/sun4v/vm_machdep.c =================================================================== RCS file: /usr/home/ncvs/src/sys/sun4v/sun4v/vm_machdep.c,v retrieving revision 1.5 diff -u -p -r1.5 vm_machdep.c --- sun4v/sun4v/vm_machdep.c 20 May 2007 13:06:45 -0000 1.5 +++ sun4v/sun4v/vm_machdep.c 31 May 2007 21:27:31 -0000 @@ -155,7 +155,7 @@ cpu_set_upcall(struct thread *td, struct pcb->pcb_pc = (u_long)fork_trampoline - 8; pcb->pcb_sp = (u_long)fr - SPOFF; - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_pil = 0; } @@ -288,7 +288,7 @@ cpu_fork(struct thread *td1, struct proc pcb2->pcb_pc = (u_long)fork_trampoline - 8; pcb2->pcb_kstack = (uint64_t)(((char *)pcb2orig) - (CCFSZ + SPOFF)); - /* Setup to release sched_lock in fork_exit(). */ + /* Setup to release spin count in fork_exit(). 
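Every MD AP bootstrap used to open-code the same hand-off into the scheduler; that boilerplate is now hidden behind sched_throw(NULL), newly declared in sys/sched.h below. Pieced together from the lines deleted in these bootstrap hunks, the helper presumably amounts to something close to the following; this is a reconstruction for review, not a quote of the new scheduler code:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>

static void
sched_throw_sketch(struct thread *td)           /* td == NULL on a starting AP */
{

        if (td == NULL) {
                mtx_lock_spin(&sched_lock);     /* borrow the idle context */
                spinlock_exit();                /* correct the nesting count */
        }
        PCPU_SET(switchtime, cpu_ticks());
        PCPU_SET(switchticks, ticks);
        cpu_throw(td, choosethread());          /* does not return */
}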
*/ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_pil = 0; Index: sys/mutex.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/mutex.h,v retrieving revision 1.95 diff -u -p -r1.95 mutex.h --- sys/mutex.h 11 Apr 2007 13:44:55 -0000 1.95 +++ sys/mutex.h 23 May 2007 19:46:08 -0000 @@ -125,6 +125,14 @@ void _mtx_unlock_spin_flags(struct mtx * #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void _mtx_assert(struct mtx *m, int what, const char *file, int line); #endif +void _thread_lock_flags(struct thread *, int, const char *, int); + +#define thread_lock(tdp) \ + _thread_lock_flags((tdp), 0, __FILE__, __LINE__) +#define thread_lock_flags(tdp, opt) \ + _thread_lock_flags((tdp), (opt), __FILE__, __LINE__) +#define thread_unlock(tdp) \ + mtx_unlock_spin(__DEVOLATILE(struct mtx *, (tdp)->td_lock)) /* * We define our machine-independent (unoptimized) mutex micro-operations @@ -349,6 +357,7 @@ extern struct mtx_pool *mtxpool_sleep; */ extern struct mtx sched_lock; extern struct mtx Giant; +extern struct mtx blocked_lock; /* * Giant lock manipulation and clean exit macros. Index: sys/proc.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/proc.h,v retrieving revision 1.478 diff -u -p -r1.478 proc.h --- sys/proc.h 1 Jun 2007 01:12:45 -0000 1.478 +++ sys/proc.h 1 Jun 2007 00:10:30 -0000 @@ -134,7 +134,7 @@ struct pargs { * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx - * j - locked by sched_lock mtx + * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * l - the attaching proc or attaching proc parent @@ -144,6 +144,7 @@ struct pargs { * p - select lock (sellock) * q - td_contested lock * r - p_peers lock + * t - thread lock * x - created at fork, only changes during single threading in exec * z - zombie threads lock * @@ -195,32 +196,19 @@ struct mqueue_notifier; * other than CPU cycles, which are parceled out to the threads. */ -/*************** - * Threads are the unit of execution - With a single run queue used by all processors: - - RUNQ: --->THREAD---THREAD--... SLEEPQ:[]---THREAD---THREAD---THREAD - []---THREAD - [] - []---THREAD---THREAD - -With PER-CPU run queues: -it gets more complicated. - * - *****************/ - /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { + volatile struct mtx *td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ /* The two queues below should someday be merged. */ - TAILQ_ENTRY(thread) td_slpq; /* (j) Sleep queue. */ - TAILQ_ENTRY(thread) td_lockq; /* (j) Lock queue. */ + TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ + TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ @@ -232,20 +220,20 @@ struct thread { /* Cleared during fork1() or thread_schedule_upcall(). */ #define td_startzero td_flags - int td_flags; /* (j) TDF_* flags. */ - int td_inhibitors; /* (j) Why can not run. */ + int td_flags; /* (t) TDF_* flags. */ + int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. 
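The sys/mutex.h hunk above is the heart of the change: thread_lock() acquires whatever spin lock td_lock currently points at, and thread_unlock() releases it through the same pointer, which is why td_lock is volatile in struct thread and why a blocked_lock placeholder is needed for the window while a thread is being handed from one lock to another (THREAD_LOCK_ASSERT below deliberately skips it). The mechanical conversion this enables, shown once:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static void
mark_resched_sketch(struct thread *td)
{
        thread_lock(td);                        /* was: mtx_lock_spin(&sched_lock) */
        td->td_flags |= TDF_NEEDRESCHED;        /* any (t)-classed field */
        thread_unlock(td);                      /* was: mtx_unlock_spin(&sched_lock) */
}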
XXX */ - int td_sqqueue; /* (j) Sleepqueue queue blocked on. */ - void *td_wchan; /* (j) Sleep address. */ - const char *td_wmesg; /* (j) Reason for sleep. */ - u_char td_lastcpu; /* (j) Last cpu we were on. */ - u_char td_oncpu; /* (j) Which cpu we are on. */ + int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ + void *td_wchan; /* (t) Sleep address. */ + const char *td_wmesg; /* (t) Reason for sleep. */ + u_char td_lastcpu; /* (t) Last cpu we were on. */ + u_char td_oncpu; /* (t) Which cpu we are on. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ short td_locks; /* (k) Count of non-spin locks. */ - u_char td_tsqueue; /* (j) Turnstile queue blocked on. */ - struct turnstile *td_blocked; /* (j) Lock thread is blocked on. */ - const char *td_lockname; /* (j) Name of lock blocked on. */ + u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ + struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ + const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ @@ -253,18 +241,18 @@ struct thread { struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct thread *td_standin; /* (k + a) Use this for an upcall. */ - struct kse_upcall *td_upcall; /* (k + j) Upcall structure. */ - u_int td_estcpu; /* (j) Sum of the same field in KSEs. */ - u_int td_slptime; /* (j) How long completely blocked. */ - struct rusage td_ru; /* (j) rusage information */ - uint64_t td_runtime; /* (j) How many cpu ticks we've run. */ - u_int td_pticks; /* (j) Statclock hits for profiling */ - u_int td_sticks; /* (j) Statclock hits in system mode. */ - u_int td_iticks; /* (j) Statclock hits in intr mode. */ - u_int td_uticks; /* (j) Statclock hits in user mode. */ + struct kse_upcall *td_upcall; /* (k + t) Upcall structure. */ + u_int td_estcpu; /* (t) estimated cpu utilization */ + u_int td_slptime; /* (t) How long completely blocked. */ + struct rusage td_ru; /* (t) rusage information */ + uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ + u_int td_pticks; /* (t) Statclock hits for profiling */ + u_int td_sticks; /* (t) Statclock hits in system mode. */ + u_int td_iticks; /* (t) Statclock hits in intr mode. */ + u_int td_uticks; /* (t) Statclock hits in user mode. */ u_int td_uuticks; /* (k) Statclock hits (usr), for UTS. */ u_int td_usticks; /* (k) Statclock hits (sys), for UTS. */ - int td_intrval; /* (j) Return value of TDF_INTERRUPT. */ + int td_intrval; /* (t) Return value of TDF_INTERRUPT. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ sigset_t td_sigmask; /* (c) Current signal mask. */ volatile u_int td_generation; /* (k) For detection of preemption */ @@ -278,11 +266,11 @@ struct thread { /* Copied during fork1() or thread_sched_upcall(). */ #define td_startcopy td_endzero - u_char td_base_pri; /* (j) Thread base kernel priority. */ - u_char td_priority; /* (j) Thread active priority. */ - u_char td_pri_class; /* (j) Scheduling class. */ - u_char td_user_pri; /* (j) User pri from estcpu and nice. */ - u_char td_base_user_pri; /* (j) Base user pri */ + u_char td_base_pri; /* (t) Thread base kernel priority. */ + u_char td_priority; /* (t) Thread active priority. */ + u_char td_pri_class; /* (t) Scheduling class. */ + u_char td_user_pri; /* (t) User pri from estcpu and nice. 
*/ + u_char td_base_user_pri; /* (t) Base user pri */ #define td_endcopy td_pcb /* @@ -296,7 +284,7 @@ struct thread { TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING - } td_state; + } td_state; /* (t) thread state */ register_t td_retval[2]; /* (k) Syscall aux returns. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ @@ -313,6 +301,16 @@ struct thread { int td_syscalls; /* per-thread syscall count (used by NFS :)) */ }; +struct mtx *thread_lock_block(struct thread *); +void thread_lock_unblock(struct thread *, struct mtx *); +void thread_lock_set(struct thread *, struct mtx *); +#define THREAD_LOCK_ASSERT(td, type) \ +do { \ + struct mtx *__m = __DEVOLATILE(struct mtx *, (td)->td_lock); \ + if (__m != &blocked_lock) \ + mtx_assert(__m, (type)); \ +} while (0) + /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. @@ -324,22 +322,22 @@ struct thread { #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_SELECT 0x00000040 /* Selecting; wakeup/waiting danger. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ -#define TDF_TSNOBLOCK 0x00000100 /* Don't block on a turnstile due to race. */ +#define TDF_UNUSEDx100 0x00000100 /* --available-- */ #define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ #define TDF_INTERRUPT 0x00002000 /* Thread is marked as interrupted. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ -#define TDF_UNUSED15 0x00008000 /* --available -- */ +#define TDF_UNUSED15 0x00008000 /* --available-- */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_XSIG 0x00040000 /* Thread is exchanging signal under trace */ #define TDF_UNUSED19 0x00080000 /* Thread is sleeping on a umtx. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_DBSUSPEND 0x00200000 /* Thread is suspended by debugger */ -#define TDF_UNUSED22 0x00400000 /* --available -- */ -#define TDF_UNUSED23 0x00800000 /* --available -- */ +#define TDF_UNUSED22 0x00400000 /* --available-- */ +#define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ @@ -482,7 +480,8 @@ struct rusage_ext { */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ - TAILQ_HEAD(, thread) p_threads; /* (j)(td_plist) Threads. (shortcut) */ + TAILQ_HEAD(, thread) p_threads; /* (j) all threads. */ + struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ @@ -491,7 +490,7 @@ struct proc { struct plimit *p_limit; /* (c) Process limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ - TAILQ_HEAD(, kse_upcall) p_upcalls; /* All upcalls in the proc. */ + TAILQ_HEAD(, kse_upcall) p_upcalls; /* (j) All upcalls in the proc. */ /* * The following don't make too much sense. 
@@ -504,7 +503,6 @@ struct proc { PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) S* process status. */ - pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ @@ -542,14 +540,12 @@ struct proc { struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (c) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ - int p_suspcount; /* (c) Num threads in suspended mode. */ + int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (c) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ /* from ksegrp */ - u_int p_estcpu; /* (j) Sum of the field in threads. */ - u_int p_slptime; /* (j) How long completely blocked. */ int p_numupcalls; /* (j) Num upcalls. */ int p_upsleeps; /* (c) Num threads in kse_release(). */ struct kse_thr_mailbox *p_completed; /* (c) Completed thread mboxes. */ @@ -592,6 +588,9 @@ struct proc { #define NOCPU 0xff /* For when we aren't on a CPU. */ +#define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) +#define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) +#define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ @@ -626,7 +625,7 @@ struct proc { #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) -/* These flags are kept in p_sflag and are protected with sched_lock. */ +/* These flags are kept in p_sflag and are protected with proc slock. */ #define PS_INMEM 0x00001 /* Loaded into memory. */ #define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */ #define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */ @@ -861,8 +860,8 @@ void stopevent(struct proc *, u_int, u_i void threadinit(void); void cpu_idle(void); extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */ -void cpu_switch(struct thread *old, struct thread *new); -void cpu_throw(struct thread *old, struct thread *new) __dead2; +void cpu_switch(struct thread *, struct thread *, struct mtx *); +void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); @@ -872,6 +871,7 @@ void cpu_fork(struct thread *, struct pr void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); /* New in KSE. 
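The PROC_SLOCK() macros above, together with the (j) key redefinition earlier in this header, give converted code a cheap way to document which callers must now hold the per-process spin lock, for instance around the suspension count that this patch moves from (c) to (j). A trivial example of the assertion style:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static int
suspended_count_sketch(struct proc *p)
{

        PROC_SLOCK_ASSERT(p, MA_OWNED);         /* p_suspcount is (j) now */
        return (p->p_suspcount);
}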
*/ +void kse_unlink(struct thread *); void kse_GC(void); void kseinit(void); void cpu_set_upcall(struct thread *td, struct thread *td0); @@ -900,6 +900,7 @@ void childproc_stopped(struct proc *chil void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); +void thread_suspend_switch(struct thread *); void thread_suspend_one(struct thread *td); struct thread *thread_switchout(struct thread *td, int flags, struct thread *newtd); Index: sys/resourcevar.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/resourcevar.h,v retrieving revision 1.50 diff -u -p -r1.50 resourcevar.h --- sys/resourcevar.h 1 Jun 2007 01:12:45 -0000 1.50 +++ sys/resourcevar.h 31 May 2007 21:32:58 -0000 @@ -47,7 +47,7 @@ * Locking key: * b - created at fork, never changes * c - locked by proc mtx - * j - locked by sched_lock mtx + * j - locked by proc slock * k - only accessed by curthread */ struct pstats { Index: sys/sched.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/sched.h,v retrieving revision 1.31 diff -u -p -r1.31 sched.h --- sys/sched.h 23 Jan 2007 08:46:50 -0000 1.31 +++ sys/sched.h 21 May 2007 23:41:12 -0000 @@ -81,6 +81,7 @@ int sched_runnable(void); */ void sched_exit(struct proc *p, struct thread *childtd); void sched_fork(struct thread *td, struct thread *childtd); +void sched_fork_exit(struct thread *td); /* * KSE Groups contain scheduling priority information. They record the @@ -101,6 +102,7 @@ fixpt_t sched_pctcpu(struct thread *td); void sched_prio(struct thread *td, u_char prio); void sched_sleep(struct thread *td); void sched_switch(struct thread *td, struct thread *newtd, int flags); +void sched_throw(struct thread *td); void sched_unlend_prio(struct thread *td, u_char prio); void sched_unlend_user_prio(struct thread *td, u_char pri); void sched_user_prio(struct thread *td, u_char prio); @@ -155,6 +157,20 @@ sched_unpin(void) #define SRQ_PREEMPTED 0x0008 /* has been preempted.. be kind */ #define SRQ_BORROWING 0x0010 /* Priority updated due to prio_lend */ +#define SCHED_STATS +/* Switch stats. 
*/ +#ifdef SCHED_STATS +extern long switch_preempt; +extern long switch_owepreempt; +extern long switch_turnstile; +extern long switch_sleepq; +extern long switch_sleepqtimo; +extern long switch_relinquish; +extern long switch_needresched; +#define SCHED_STAT_INC(var) atomic_add_long(&(var), 1) +#else +#define SCHED_STAT_INC(var) +#endif /* temporarily here */ void schedinit(void); @@ -162,7 +178,6 @@ void sched_init_concurrency(struct proc void sched_set_concurrency(struct proc *p, int cuncurrency); void sched_schedinit(void); void sched_newproc(struct proc *p, struct thread *td); -void sched_thread_exit(struct thread *td); void sched_newthread(struct thread *td); #endif /* _KERNEL */ Index: sys/turnstile.h =================================================================== RCS file: /usr/home/ncvs/src/sys/sys/turnstile.h,v retrieving revision 1.11 diff -u -p -r1.11 turnstile.h --- sys/turnstile.h 18 Apr 2006 18:21:38 -0000 1.11 +++ sys/turnstile.h 18 May 2007 10:37:02 -0000 @@ -91,17 +91,19 @@ void init_turnstiles(void); void turnstile_adjust(struct thread *, u_char); struct turnstile *turnstile_alloc(void); void turnstile_broadcast(struct turnstile *, int); -void turnstile_claim(struct lock_object *); +void turnstile_cancel(struct turnstile *); +void turnstile_chain_lock(struct lock_object *); +void turnstile_chain_unlock(struct lock_object *); +void turnstile_claim(struct turnstile *); void turnstile_disown(struct turnstile *); int turnstile_empty(struct turnstile *ts, int queue); void turnstile_free(struct turnstile *); struct thread *turnstile_head(struct turnstile *, int); -void turnstile_lock(struct lock_object *); struct turnstile *turnstile_lookup(struct lock_object *); -void turnstile_release(struct lock_object *); int turnstile_signal(struct turnstile *, int); +struct turnstile *turnstile_trywait(struct lock_object *); void turnstile_unpend(struct turnstile *, int); -void turnstile_wait(struct lock_object *, struct thread *, int); +void turnstile_wait(struct turnstile *, struct thread *, int); #endif /* _KERNEL */ #endif /* _SYS_TURNSTILE_H_ */ Index: ufs/ffs/ffs_snapshot.c =================================================================== RCS file: /usr/home/ncvs/src/sys/ufs/ffs/ffs_snapshot.c,v retrieving revision 1.135 diff -u -p -r1.135 ffs_snapshot.c --- ufs/ffs/ffs_snapshot.c 10 Apr 2007 09:31:42 -0000 1.135 +++ ufs/ffs/ffs_snapshot.c 18 May 2007 10:37:02 -0000 @@ -389,12 +389,15 @@ restart: * Recind nice scheduling while running with the filesystem suspended. */ if (td->td_proc->p_nice > 0) { - PROC_LOCK(td->td_proc); - mtx_lock_spin(&sched_lock); - saved_nice = td->td_proc->p_nice; - sched_nice(td->td_proc, 0); - mtx_unlock_spin(&sched_lock); - PROC_UNLOCK(td->td_proc); + struct proc *p; + + p = td->td_proc; + PROC_LOCK(p); + PROC_SLOCK(p); + saved_nice = p->p_nice; + sched_nice(p, 0); + PROC_SUNLOCK(p); + PROC_UNLOCK(p); } /* * Suspend operation on filesystem. 
@@ -809,10 +812,13 @@ done: out: NDFREE(&nd, NDF_ONLY_PNBUF); if (saved_nice > 0) { - PROC_LOCK(td->td_proc); - mtx_lock_spin(&sched_lock); + struct proc *p; + + p = td->td_proc; + PROC_LOCK(p); + PROC_SLOCK(p); sched_nice(td->td_proc, saved_nice); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); Index: vm/vm_glue.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_glue.c,v retrieving revision 1.222 diff -u -p -r1.222 vm_glue.c --- vm/vm_glue.c 1 Jun 2007 01:12:45 -0000 1.222 +++ vm/vm_glue.c 31 May 2007 20:40:26 -0000 @@ -619,24 +619,26 @@ faultin(p) * busy swapping it in. */ ++p->p_lock; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag |= PS_SWAPPINGIN; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag &= ~PS_SWAPPINGIN; p->p_sflag |= PS_INMEM; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) setrunnable(td); + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); wakeup(&p->p_sflag); @@ -672,9 +674,9 @@ scheduler(dummy) loop: if (vm_page_count_min()) { VM_WAIT; - mtx_lock_spin(&sched_lock); + thread_lock(&thread0); proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } @@ -685,13 +687,14 @@ loop: if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { continue; } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { /* * An otherwise runnable thread of a process * swapped out has only the TDI_SWAPPED bit set. * */ + thread_lock(td); if (td->td_inhibitors == TDI_SWAPPED) { pri = p->p_swtime + td->td_slptime; if ((p->p_sflag & PS_SWAPINREQ) == 0) { @@ -709,8 +712,9 @@ loop: ppri = pri; } } + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } sx_sunlock(&allproc_lock); @@ -718,13 +722,13 @@ loop: * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { - mtx_lock_spin(&sched_lock); + thread_lock(&thread0); if (!proc0_rescan) { TD_SET_IWAIT(&thread0); mi_switch(SW_VOL, NULL); } proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } PROC_LOCK(p); @@ -736,15 +740,15 @@ loop: */ if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { PROC_UNLOCK(p); - mtx_lock_spin(&sched_lock); + thread_lock(&thread0); proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag &= ~PS_SWAPINREQ; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); /* * We would like to bring someone in. (only if there is space). @@ -752,10 +756,12 @@ loop: */ faultin(p); PROC_UNLOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_swtime = 0; + PROC_SUNLOCK(p); + thread_lock(&thread0); proc0_rescan = 0; - mtx_unlock_spin(&sched_lock); + thread_unlock(&thread0); goto loop; } @@ -763,7 +769,8 @@ void kick_proc0(void) { struct thread *td = &thread0; - + /* XXX This will probably cause a LOR in some cases */ + thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR2(KTR_INTR, "%s: sched_add %d", __func__, 0); TD_CLR_IWAIT(td); @@ -773,6 +780,7 @@ void kick_proc0(void) CTR2(KTR_INTR, "%s: state %d", __func__, td->td_state); } + thread_unlock(td); } @@ -821,12 +829,12 @@ retry: * creation. It may have no * address space or lock yet. 
*/ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); if (p->p_state == PRS_NEW) { - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); continue; } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); /* * An aio daemon switches its @@ -876,7 +884,7 @@ retry: break; case PRS_NORMAL: - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * do not swapout a realtime process * Check all the thread groups.. @@ -929,7 +937,7 @@ retry: (minslptime > swap_idle_threshold2))) { swapout(p); didswap++; - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); vmspace_free(vm); @@ -937,7 +945,7 @@ retry: goto retry; } nextproc: - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } nextproc2: PROC_UNLOCK(p); @@ -962,7 +970,7 @@ swapout(p) struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); + mtx_assert(&p->p_slock, MA_OWNED | MA_NOTRECURSED); #if defined(SWAP_DEBUG) printf("swapping out %d\n", p->p_pid); #endif @@ -996,15 +1004,18 @@ swapout(p) p->p_sflag &= ~PS_INMEM; p->p_sflag |= PS_SWAPPINGOUT; PROC_UNLOCK(p); - FOREACH_THREAD_IN_PROC(p, td) + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); TD_SET_SWAPPED(td); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); + } + PROC_SUNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapout(td); PROC_LOCK(p); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); p->p_sflag &= ~PS_SWAPPINGOUT; p->p_swtime = 0; } Index: vm/vm_meter.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_meter.c,v retrieving revision 1.93 diff -u -p -r1.93 vm_meter.c --- vm/vm_meter.c 31 May 2007 22:52:15 -0000 1.93 +++ vm/vm_meter.c 31 May 2007 20:40:26 -0000 @@ -131,17 +131,21 @@ vmtotal(SYSCTL_HANDLER_ARGS) FOREACH_PROC_IN_SYSTEM(p) { if (p->p_flag & P_SYSTEM) continue; - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); switch (p->p_state) { case PRS_NEW: - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); continue; break; default: FOREACH_THREAD_IN_PROC(p, td) { /* Need new statistics XXX */ + thread_lock(td); switch (td->td_state) { case TDS_INHIBITED: + /* + * XXX stats no longer synchronized. + */ if (TD_ON_LOCK(td) || (td->td_inhibitors == TDI_SWAPPED)) { @@ -162,13 +166,15 @@ vmtotal(SYSCTL_HANDLER_ARGS) case TDS_RUNQ: case TDS_RUNNING: total.t_rq++; + thread_unlock(td); continue; default: break; } + thread_unlock(td); } } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); /* * Note active objects. */ Index: vm/vm_pageout.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_pageout.c,v retrieving revision 1.281 diff -u -p -r1.281 vm_pageout.c --- vm/vm_pageout.c 31 May 2007 22:52:15 -0000 1.281 +++ vm/vm_pageout.c 31 May 2007 20:40:26 -0000 @@ -1246,22 +1246,24 @@ unlock_and_continue: * If the process is in a non-running type state, * don't touch it. Check all the threads individually. 
*/ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { + thread_unlock(td); breakout = 1; break; } + thread_unlock(td); } + PROC_SUNLOCK(p); if (breakout) { - mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); continue; } - mtx_unlock_spin(&sched_lock); /* * get the process size */ @@ -1287,9 +1289,9 @@ unlock_and_continue: sx_sunlock(&allproc_lock); if (bigproc != NULL) { killproc(bigproc, "out of swap space"); - mtx_lock_spin(&sched_lock); + PROC_SLOCK(bigproc); sched_nice(bigproc, PRIO_MIN); - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(bigproc); PROC_UNLOCK(bigproc); wakeup(&cnt.v_free_count); } @@ -1594,17 +1596,20 @@ vm_daemon() * if the process is in a non-running type state, * don't touch it. */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { + thread_unlock(td); breakout = 1; break; } + thread_unlock(td); } - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); if (breakout) { PROC_UNLOCK(p); continue; Index: vm/vm_zeroidle.c =================================================================== RCS file: /usr/home/ncvs/src/sys/vm/vm_zeroidle.c,v retrieving revision 1.46 diff -u -p -r1.46 vm_zeroidle.c --- vm/vm_zeroidle.c 31 May 2007 22:52:15 -0000 1.46 +++ vm/vm_zeroidle.c 31 May 2007 20:40:26 -0000 @@ -145,9 +145,9 @@ vm_pagezero(void __unused *arg) vm_page_zero_idle(); #ifndef PREEMPTION if (sched_runnable()) { - mtx_lock_spin(&sched_lock); + thread_lock(curthread); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(curthread); } #endif } else { @@ -176,11 +176,11 @@ pagezero_start(void __unused *arg) PROC_LOCK(pagezero_proc); pagezero_proc->p_flag |= P_NOLOAD; PROC_UNLOCK(pagezero_proc); - mtx_lock_spin(&sched_lock); td = FIRST_THREAD_IN_PROC(pagezero_proc); + thread_lock(td); sched_class(td, PRI_IDLE); sched_prio(td, PRI_MAX_IDLE); sched_add(td, SRQ_BORING); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } SYSINIT(pagezero, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, pagezero_start, NULL)
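
Usage note, not part of the diff: the VM conversions above all follow the same pattern. Fields keyed "(j)", such as p_sflag and p_suspcount, are now covered by the per-process spin lock through PROC_SLOCK()/PROC_SUNLOCK(), and per-thread state (td_state, TD_SET_SWAPPED(), setrunnable()) is only touched while that thread's lock is held through thread_lock()/thread_unlock(). A minimal sketch of the pattern, modelled on the faultin()/swapout() hunks; the helper name proc_set_swapped() is hypothetical and exists only for illustration:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

/* Illustrative only; mirrors the faultin()/swapout() conversions above. */
static void
proc_set_swapped(struct proc *p)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	PROC_SLOCK(p);			/* p_sflag is keyed "(j)": proc slock */
	p->p_sflag |= PS_SWAPPINGOUT;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);	/* per-thread state needs td_lock */
		TD_SET_SWAPPED(td);
		thread_unlock(td);
	}
	PROC_SUNLOCK(p);
}

The proc mutex (a sleep mutex) is taken before the proc spin lock and the thread locks, which matches the order the hunks above use.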
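
Also worth spelling out: the turnstile.h changes alter the calling protocol for lock implementations. turnstile_lock()/turnstile_release() are gone; reading the new prototypes, turnstile_trywait() locks the chain and hands back the turnstile to use, turnstile_cancel() backs out of that, and turnstile_claim()/turnstile_wait() now take the turnstile rather than the lock_object. A schematic sketch of the sequence a blocking lock would use, assuming that reading; struct my_lock and its my_lock_*() helpers are hypothetical placeholders, and TS_EXCLUSIVE_QUEUE is the existing queue constant from turnstile.h:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/turnstile.h>

/* Hypothetical lock type, used only for this sketch. */
struct my_lock {
	struct lock_object	lock_object;
	/* ... lock word, owner, etc. ... */
};

static int my_lock_try(struct my_lock *);		/* placeholder */
static int my_lock_is_free(struct my_lock *);		/* placeholder */
static struct thread *my_lock_owner(struct my_lock *);	/* placeholder */

static void
my_lock_acquire_slow(struct my_lock *lk)
{
	struct turnstile *ts;
	struct thread *owner;

	for (;;) {
		if (my_lock_try(lk))
			return;				/* fast path */
		/* Lock the chain and get the turnstile for this lock. */
		ts = turnstile_trywait(&lk->lock_object);
		if (my_lock_is_free(lk)) {
			/* Dropped while we were setting up; back out. */
			turnstile_cancel(ts);
			continue;
		}
		owner = my_lock_owner(lk);
		/* Block on the turnstile, lending priority to the owner. */
		turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE);
	}
}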
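
Finally, a small note on the SCHED_STATS block in sched.h: the counters are plain longs and SCHED_STAT_INC() expands to atomic_add_long(), so a call site needs no lock for the statistic itself. Illustrative only, with a hypothetical function name; exactly where each counter is bumped is the scheduler's business:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>

/* Illustrative only; follows the vm_pagezero() switch pattern above. */
static void
example_voluntary_switch(void)
{
	SCHED_STAT_INC(switch_relinquish);	/* atomic, no lock needed */
	thread_lock(curthread);
	mi_switch(SW_VOL, NULL);
	thread_unlock(curthread);
}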