--- //depot/projects/smpng/sys/compat/linux/linux_misc.c 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/compat/linux/linux_misc.c 2009/05/13 17:57:30 @@ -251,8 +251,8 @@ locked = 0; vp = NULL; - NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, - UIO_SYSSPACE, library, td); + NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE | + AUDITVNODE1, UIO_SYSSPACE, library, td); error = namei(&ni); LFREEPATH(library); if (error) @@ -312,6 +312,13 @@ if (error) goto cleanup; + /* + * Drop the vnode lock (but not the reference) while we map + * the header. + */ + locked = 0; + VOP_UNLOCK(vp, 0); + /* Pull in executable header into kernel_map */ error = vm_mmap(kernel_map, (vm_offset_t *)&a_out, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0); @@ -374,14 +381,9 @@ * XXX: Note that if any of the VM operations fail below we don't * clear this flag. */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_TEXT; - - /* - * Lock no longer needed - */ - locked = 0; VOP_UNLOCK(vp, 0); - VFS_UNLOCK_GIANT(vfslocked); /* * Check if file_offset page aligned. Currently we cannot handle @@ -460,8 +462,12 @@ cleanup: /* Unlock vnode if needed */ - if (locked) { - VOP_UNLOCK(vp, 0); + if (locked || vp != NULL) { + if (locked) + vput(vp); + else + /* XXX: Should this do VOP_CLOSE(). 
*/ + vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } @@ -470,7 +476,7 @@ vm_map_remove(kernel_map, (vm_offset_t)a_out, (vm_offset_t)a_out + PAGE_SIZE); - return error; + return (error); } #endif /* __i386__ */ --- //depot/projects/smpng/sys/compat/svr4/svr4_misc.c 2009/02/13 18:22:54 +++ //depot/user/jhb/lock/compat/svr4/svr4_misc.c 2009/02/13 20:21:57 @@ -1611,14 +1611,14 @@ struct nameidata nd; int error, *retval = td->td_retval; unsigned int ncopy; - int vfslocked; NDINIT(&nd, LOOKUP, NOFOLLOW | SAVENAME | MPSAFE, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) - return error; - vfslocked = NDHASGIANT(&nd); + return (error); + NDFREE(&nd, NDF_NO_FREE_PNBUF); + VFS_UNLOCK_GIANT(NDHASGIANT(&nd)); ncopy = min(uap->bufsiz, strlen(nd.ni_cnd.cn_pnbuf) + 1); if ((error = copyout(nd.ni_cnd.cn_pnbuf, uap->buf, ncopy)) != 0) @@ -1627,7 +1627,5 @@ *retval = ncopy; bad: NDFREE(&nd, NDF_ONLY_PNBUF); - vput(nd.ni_vp); - VFS_UNLOCK_GIANT(vfslocked); return error; } --- //depot/projects/smpng/sys/contrib/altq/altq/altq_subr.c 2009/05/19 13:40:43 +++ //depot/user/jhb/lock/contrib/altq/altq/altq_subr.c 2009/05/20 17:51:14 @@ -1021,7 +1021,7 @@ microtime(&tv_start); start = read_machclk(); timo = hz; /* 1 sec */ - (void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo); + (void)tsleep(&wait, PWAIT, "init_machclk", timo); microtime(&tv_end); end = read_machclk(); diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000 --- //depot/projects/smpng/sys/fs/cd9660/cd9660_lookup.c 2009/01/28 22:38:17 +++ //depot/user/jhb/lock/fs/cd9660/cd9660_lookup.c 2009/05/11 13:58:44 @@ -374,15 +374,17 @@ */ mp = pdp->v_mount; ltype = VOP_ISLOCKED(pdp); - for (;;) { - error = vfs_busy(mp, MBF_NOWAIT); - if (error == 0) - break; + error = vfs_busy(mp, MBF_NOWAIT); + if (error != 0) { VOP_UNLOCK(pdp, 0); - pause("vn_vget", 1); + error = vfs_busy(mp, 0); vn_lock(pdp, ltype | LK_RETRY); - if (pdp->v_iflag & VI_DOOMED) + if (error) + return (ENOENT); + if (pdp->v_iflag & VI_DOOMED) { + 
vfs_unbusy(mp); return (ENOENT); + } } VOP_UNLOCK(pdp, 0); error = cd9660_vget_internal(vdp->v_mount, i_ino, --- //depot/projects/smpng/sys/fs/cd9660/cd9660_vfsops.c 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/fs/cd9660/cd9660_vfsops.c 2009/05/13 17:57:30 @@ -156,7 +156,8 @@ * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ - NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); + NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, + fspec, td); if ((error = namei(&ndp))) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); --- //depot/projects/smpng/sys/fs/hpfs/hpfs_vfsops.c 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/fs/hpfs/hpfs_vfsops.c 2009/05/13 17:57:30 @@ -156,7 +156,8 @@ * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ - NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td); + NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, from, + td); err = namei(&ndp); if (err) { /* can't get devvp!*/ --- //depot/projects/smpng/sys/fs/msdosfs/denode.h 2009/02/27 21:13:56 +++ //depot/user/jhb/lock/fs/msdosfs/denode.h 2009/03/05 20:57:52 @@ -135,20 +135,26 @@ /* * This is the in memory variant of a dos directory entry. It is usually * contained within a vnode. 
+ * + * Locking key: + * (c) - only changed during initial vnode creation or reclaim + * (i) - VI_LOCK() + * (v) - vn_lock() + * (*) - TBD, but needs a lock of some sort */ struct denode { - struct vnode *de_vnode; /* addr of vnode we are part of */ - u_long de_flag; /* flag bits */ - u_long de_dirclust; /* cluster of the directory file containing this entry */ - u_long de_diroffset; /* offset of this entry in the directory cluster */ - u_long de_fndoffset; /* offset of found dir entry */ - int de_fndcnt; /* number of slots before de_fndoffset */ - long de_refcnt; /* reference count */ - struct msdosfsmount *de_pmp; /* addr of our mount struct */ + struct vnode *de_vnode; /* (c) addr of vnode we are part of */ + u_long de_flag; /* (*) flag bits */ + u_long de_dirclust; /* (*) cluster of the directory file containing this entry */ + u_long de_diroffset; /* (*) offset of this entry in the directory cluster */ + u_long de_fndoffset; /* (*) offset of found dir entry */ + int de_fndcnt; /* (*) number of slots before de_fndoffset */ + long de_refcnt; /* (*) reference count */ + struct msdosfsmount *de_pmp; /* (c) addr of our mount struct */ u_char de_Name[12]; /* name, from DOS directory entry */ u_char de_Attributes; /* attributes, from directory entry */ u_char de_LowerCase; /* NT VFAT lower case flags */ - u_char de_CHun; /* Hundredth of second of CTime*/ + u_char de_CHun; /* Hundredth of second of CTime*/ u_short de_CTime; /* creation time */ u_short de_CDate; /* creation date */ u_short de_ADate; /* access date */ --- //depot/projects/smpng/sys/fs/msdosfs/msdosfs_denode.c 2009/02/27 21:13:56 +++ //depot/user/jhb/lock/fs/msdosfs/msdosfs_denode.c 2009/03/05 20:57:52 @@ -168,6 +168,7 @@ ldep->de_dirclust = dirclust; ldep->de_diroffset = diroffset; ldep->de_inode = inode; + ldep->de_Name[0] = SLOT_DELETED; fc_purge(ldep, 0); /* init the fat cache for this denode */ lockmgr(nvp->v_vnlock, LK_EXCLUSIVE, NULL); @@ -184,9 +185,8 @@ return (error); } if (xvp != NULL) { - /* 
XXX: Not sure this is right */ - nvp = xvp; - ldep->de_vnode = nvp; + *depp = VTODE(xvp); + return (0); } ldep->de_pmp = pmp; @@ -228,17 +228,12 @@ ldep->de_ADate = ldep->de_CDate; ldep->de_MTime = ldep->de_CTime; ldep->de_MDate = ldep->de_CDate; - /* leave the other fields as garbage */ + /* Clear de_Name[0] as this is a real node now. */ + ldep->de_Name[0] = 0; + /* leave the other fields as garbage */ } else { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) { - /* - * The denode does not contain anything useful, so - * it would be wrong to leave it on its hash chain. - * Arrange for vput() to just forget about it. - */ - ldep->de_Name[0] = SLOT_DELETED; - vput(nvp); *depp = NULL; return (error); --- //depot/projects/smpng/sys/fs/msdosfs/msdosfs_vfsops.c 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/fs/msdosfs/msdosfs_vfsops.c 2009/05/13 17:57:30 @@ -314,7 +314,7 @@ * that user has necessary permissions on the device. */ devvp = pmp->pm_devvp; - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(devvp, LK_SHARED | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) @@ -350,7 +350,8 @@ */ if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL)) return (EINVAL); - NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td); + NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, from, + td); error = namei(&ndp); if (error) return (error); --- //depot/projects/smpng/sys/fs/msdosfs/msdosfsmount.h 2009/02/27 21:13:56 +++ //depot/user/jhb/lock/fs/msdosfs/msdosfsmount.h 2009/03/05 20:57:52 @@ -63,50 +63,56 @@ /* * Layout of the mount control block for a msdos filesystem. 
+ * + * Locking key: + * (c) - only changed during initial mount/unmount + * (i) - MNT_ILOCK() + * (u) - like (c) but can be changed via MNT_UPDATE request + * (*) - TBD, but needs a lock of some sort */ struct msdosfsmount { - struct mount *pm_mountp;/* vfs mount struct for this fs */ - struct g_consumer *pm_cp; - struct bufobj *pm_bo; - uid_t pm_uid; /* uid to set as owner of the files */ - gid_t pm_gid; /* gid to set as owner of the files */ - mode_t pm_mask; /* mask to and with file protection bits + struct mount *pm_mountp;/* (c) vfs mount struct for this fs */ + struct g_consumer *pm_cp; /* (c) */ + struct bufobj *pm_bo; /* (c) */ + uid_t pm_uid; /* (u) uid to set as owner of the files */ + gid_t pm_gid; /* (u) gid to set as owner of the files */ + mode_t pm_mask; /* (u) mask to and with file protection bits for files */ - mode_t pm_dirmask; /* mask to and with file protection bits + mode_t pm_dirmask; /* (u) mask to and with file protection bits for directories */ - struct vnode *pm_devvp; /* vnode for character device mounted */ - struct cdev *pm_dev; /* character device mounted */ - struct bpb50 pm_bpb; /* BIOS parameter blk for this fs */ - u_long pm_BlkPerSec; /* How many DEV_BSIZE blocks fit inside a physical sector */ - u_long pm_FATsecs; /* actual number of fat sectors */ - u_long pm_fatblk; /* block # of first FAT */ - u_long pm_rootdirblk; /* block # (cluster # for FAT32) of root directory number */ - u_long pm_rootdirsize; /* size in blocks (not clusters) */ - u_long pm_firstcluster; /* block number of first cluster */ - u_long pm_maxcluster; /* maximum cluster number */ - u_long pm_freeclustercount; /* number of free clusters */ - u_long pm_cnshift; /* shift file offset right this amount to get a cluster number */ - u_long pm_crbomask; /* and a file offset with this mask to get cluster rel offset */ - u_long pm_bnshift; /* shift file offset right this amount to get a block number */ - u_long pm_bpcluster; /* bytes per cluster */ - u_long pm_fmod; 
/* ~0 if fs is modified, this can rollover to 0 */ - u_long pm_fatblocksize; /* size of fat blocks in bytes */ - u_long pm_fatblocksec; /* size of fat blocks in sectors */ - u_long pm_fatsize; /* size of fat in bytes */ - u_int32_t pm_fatmask; /* mask to use for fat numbers */ - u_long pm_fsinfo; /* fsinfo block number */ - u_long pm_nxtfree; /* next place to search for a free cluster */ - u_int pm_fatmult; /* these 2 values are used in fat */ - u_int pm_fatdiv; /* offset computation */ - u_int pm_curfat; /* current fat for FAT32 (0 otherwise) */ - u_int *pm_inusemap; /* ptr to bitmap of in-use clusters */ - u_int pm_flags; /* see below */ - void *pm_u2w; /* Local->Unicode iconv handle */ - void *pm_w2u; /* Unicode->Local iconv handle */ - void *pm_u2d; /* Unicode->DOS iconv handle */ - void *pm_d2u; /* DOS->Local iconv handle */ - u_int32_t pm_nfileno; /* next 32-bit fileno */ - RB_HEAD(msdosfs_filenotree, msdosfs_fileno) pm_filenos; /* 64<->32-bit fileno mapping */ + struct vnode *pm_devvp; /* (c) vnode for character device mounted */ + struct cdev *pm_dev; /* (c) character device mounted */ + struct bpb50 pm_bpb; /* (c) BIOS parameter blk for this fs */ + u_long pm_BlkPerSec; /* (c) How many DEV_BSIZE blocks fit inside a physical sector */ + u_long pm_FATsecs; /* (c) actual number of fat sectors */ + u_long pm_fatblk; /* (c) block # of first FAT */ + u_long pm_rootdirblk; /* (c) block # (cluster # for FAT32) of root directory number */ + u_long pm_rootdirsize; /* (c) size in blocks (not clusters) */ + u_long pm_firstcluster; /* (c) block number of first cluster */ + u_long pm_maxcluster; /* (c) maximum cluster number */ + u_long pm_freeclustercount; /* (*) number of free clusters */ + u_long pm_cnshift; /* (c) shift file offset right this amount to get a cluster number */ + u_long pm_crbomask; /* (c) and a file offset with this mask to get cluster rel offset */ + u_long pm_bnshift; /* (c) shift file offset right this amount to get a block number */ + u_long 
pm_bpcluster; /* (c) bytes per cluster */ + u_long pm_fmod; /* (*) ~0 if fs is modified, this can rollover to 0 */ + u_long pm_fatblocksize; /* (c) size of fat blocks in bytes */ + u_long pm_fatblocksec; /* (c) size of fat blocks in sectors */ + u_long pm_fatsize; /* (c) size of fat in bytes */ + u_int32_t pm_fatmask; /* (c) mask to use for fat numbers */ + u_long pm_fsinfo; /* (*) fsinfo block number */ + u_long pm_nxtfree; /* (*) next place to search for a free cluster */ + u_int pm_fatmult; /* (c) these 2 values are used in fat */ + u_int pm_fatdiv; /* (c) offset computation */ + u_int pm_curfat; /* (c) current fat for FAT32 (0 otherwise) */ + u_int *pm_inusemap; /* (*) ptr to bitmap of in-use clusters */ + u_int pm_flags; /* (u) see below */ + void *pm_u2w; /* (u) Local->Unicode iconv handle */ + void *pm_w2u; /* (u) Unicode->Local iconv handle */ + void *pm_u2d; /* (u) Unicode->DOS iconv handle */ + void *pm_d2u; /* (u) DOS->Local iconv handle */ + u_int32_t pm_nfileno; /* (*) next 32-bit fileno */ + RB_HEAD(msdosfs_filenotree, msdosfs_fileno) pm_filenos; /* (*) 64<->32-bit fileno mapping */ }; /* --- //depot/projects/smpng/sys/fs/ntfs/ntfs_vfsops.c 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/fs/ntfs/ntfs_vfsops.c 2009/05/13 17:57:30 @@ -181,7 +181,8 @@ * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. 
*/ - NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, curthread); + NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, from, + curthread); err = namei(&ndp); if (err) { /* can't get devvp!*/ --- //depot/projects/smpng/sys/fs/nullfs/null_vfsops.c 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/fs/nullfs/null_vfsops.c 2009/05/13 17:57:30 @@ -115,7 +115,8 @@ /* * Find lower node */ - NDINIT(ndp, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, target, curthread); + NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, + target, curthread); error = namei(ndp); /* * Re-lock vnode. --- //depot/projects/smpng/sys/fs/udf/udf_vfsops.c 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/fs/udf/udf_vfsops.c 2009/05/13 17:57:30 @@ -225,7 +225,8 @@ /* Check that the mount device exists */ if (fspec == NULL) return (EINVAL); - NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); + NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, fspec, + td); if ((error = namei(ndp))) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); --- //depot/projects/smpng/sys/kern/kern_acct.c 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/kern/kern_acct.c 2009/05/13 17:57:30 @@ -412,7 +412,6 @@ /* (8) The boolean flags that tell how the process terminated, etc. */ acct.ac_flagx = p->p_acflag; - PROC_UNLOCK(p); /* Setup ancillary structure fields. */ acct.ac_flagx |= ANVER; @@ -423,14 +422,17 @@ /* * Eliminate any file size rlimit. 
*/ - newlim = lim_alloc(); - PROC_LOCK(p); - oldlim = p->p_limit; - lim_copy(newlim, oldlim); - newlim->pl_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - p->p_limit = newlim; + if (p->p_limit->pl_rlimit[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) { + PROC_UNLOCK(p); + newlim = lim_alloc(); + PROC_LOCK(p); + oldlim = p->p_limit; + lim_copy(newlim, oldlim); + newlim->pl_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + p->p_limit = newlim; + lim_free(oldlim); + } PROC_UNLOCK(p); - lim_free(oldlim); /* * Write the accounting information to the file. --- //depot/projects/smpng/sys/kern/kern_condvar.c 2009/02/27 15:49:22 +++ //depot/user/jhb/lock/kern/kern_condvar.c 2009/02/27 16:32:01 @@ -122,7 +122,7 @@ sleepq_lock(cvp); - cvp->cv_waiters++; + cvp->cv_waiters = 1; if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); @@ -184,7 +184,7 @@ sleepq_lock(cvp); - cvp->cv_waiters++; + cvp->cv_waiters = 1; DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); @@ -239,7 +239,7 @@ sleepq_lock(cvp); - cvp->cv_waiters++; + cvp->cv_waiters = 1; if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); @@ -305,7 +305,7 @@ sleepq_lock(cvp); - cvp->cv_waiters++; + cvp->cv_waiters = 1; if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); @@ -372,7 +372,7 @@ sleepq_lock(cvp); - cvp->cv_waiters++; + cvp->cv_waiters = 1; if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); @@ -417,10 +417,8 @@ wakeup_swapper = 0; sleepq_lock(cvp); - if (cvp->cv_waiters > 0) { - cvp->cv_waiters--; + if (cvp->cv_waiters > 0) wakeup_swapper = sleepq_signal(cvp, SLEEPQ_CONDVAR, 0, 0); - } sleepq_release(cvp); if (wakeup_swapper) kick_proc0(); --- //depot/projects/smpng/sys/kern/kern_mtxpool.c 2008/11/03 21:11:59 +++ //depot/user/jhb/lock/kern/kern_mtxpool.c 2009/05/10 13:05:31 @@ -71,16 +71,20 @@ int mtxpool_mask; int mtxpool_shift; int mtxpool_next; -}; +} __aligned(CACHE_LINE_SIZE); + +union mtx_c { + 
struct mtx mtx; +} __aligned(CACHE_LINE_SIZE); struct mtx_pool { struct mtxpool_header mtx_pool_header; - struct mtx mtx_pool_ary[1]; + union mtx_c mtx_pool_ary[1]; }; static struct mtx_pool_lockbuilder { struct mtxpool_header mtx_pool_header; - struct mtx mtx_pool_ary[MTX_POOL_LOCKBUILDER_SIZE]; + union mtx_c mtx_pool_ary[MTX_POOL_LOCKBUILDER_SIZE]; } lockbuilder_pool; #define mtx_pool_size mtx_pool_header.mtxpool_size @@ -117,7 +121,7 @@ */ p = ((HASH_MULTIPLIER * (uintptr_t)ptr) >> pool->mtx_pool_shift) & pool->mtx_pool_mask; - return (&pool->mtx_pool_ary[p]); + return (&pool->mtx_pool_ary[p].mtx); } static void @@ -133,7 +137,7 @@ pool->mtx_pool_shift = POINTER_BITS - maskbits; pool->mtx_pool_next = 0; for (i = 0; i < pool_size; ++i) - mtx_init(&pool->mtx_pool_ary[i], mtx_name, NULL, opts); + mtx_init(&pool->mtx_pool_ary[i].mtx, mtx_name, NULL, opts); } struct mtx_pool * @@ -147,7 +151,7 @@ pool_size = 128; } pool = malloc(sizeof (struct mtx_pool) + - ((pool_size - 1) * sizeof (struct mtx)), + ((pool_size - 1) * sizeof (union mtx_c)), M_MTXPOOL, M_WAITOK | M_ZERO); mtx_pool_initialize(pool, mtx_name, pool_size, opts); return pool; @@ -160,7 +164,7 @@ struct mtx_pool *pool = *poolp; for (i = pool->mtx_pool_size - 1; i >= 0; --i) - mtx_destroy(&pool->mtx_pool_ary[i]); + mtx_destroy(&pool->mtx_pool_ary[i].mtx); free(pool, M_MTXPOOL); *poolp = NULL; } @@ -199,7 +203,7 @@ */ i = pool->mtx_pool_next; pool->mtx_pool_next = (i + 1) & pool->mtx_pool_mask; - return (&pool->mtx_pool_ary[i]); + return (&pool->mtx_pool_ary[i].mtx); } /* --- //depot/projects/smpng/sys/kern/kern_mutex.c 2009/03/19 16:52:48 +++ //depot/user/jhb/lock/kern/kern_mutex.c 2009/03/19 17:44:47 @@ -369,21 +369,20 @@ */ if (v == MTX_UNOWNED) { turnstile_cancel(ts); - cpu_spinwait(); continue; } - MPASS(v != MTX_CONTESTED); - #ifdef ADAPTIVE_MUTEXES /* - * If the current owner of the lock is executing on another - * CPU quit the hard path and try to spin. 
+ * The current lock owner might have started executing + * on another CPU (or the lock could have changed + * owners) while we were waiting on the turnstile + * chain lock. If so, drop the turnstile lock and try + * again. */ owner = (struct thread *)(v & ~MTX_FLAGMASK); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); - cpu_spinwait(); continue; } #endif @@ -396,7 +395,6 @@ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) { turnstile_cancel(ts); - cpu_spinwait(); continue; } --- //depot/projects/smpng/sys/kern/kern_rwlock.c 2009/03/19 16:52:48 +++ //depot/user/jhb/lock/kern/kern_rwlock.c 2009/03/19 17:44:47 @@ -322,7 +322,6 @@ (void *)(v + RW_ONE_READER)); break; } - cpu_spinwait(); continue; } lock_profile_obtain_lock_failed(&rw->lock_object, @@ -380,14 +379,16 @@ #ifdef ADAPTIVE_RWLOCKS /* - * If the current owner of the lock is executing on another - * CPU quit the hard path and try to spin. + * The current lock owner might have started executing + * on another CPU (or the lock could have changed + * owners) while we were waiting on the turnstile + * chain lock. If so, drop the turnstile lock and try + * again. */ if ((v & RW_LOCK_READ) == 0) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); - cpu_spinwait(); continue; } } @@ -408,7 +409,6 @@ if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_READ_WAITERS)) { turnstile_cancel(ts); - cpu_spinwait(); continue; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) @@ -650,14 +650,16 @@ #ifdef ADAPTIVE_RWLOCKS /* - * If the current owner of the lock is executing on another - * CPU quit the hard path and try to spin. + * The current lock owner might have started executing + * on another CPU (or the lock could have changed + * owners) while we were waiting on the turnstile + * chain lock. If so, drop the turnstile lock and try + * again. 
*/ if (!(v & RW_LOCK_READ)) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); - cpu_spinwait(); continue; } } @@ -680,7 +682,6 @@ break; } turnstile_cancel(ts); - cpu_spinwait(); continue; } /* @@ -692,7 +693,6 @@ if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_WRITE_WAITERS)) { turnstile_cancel(ts); - cpu_spinwait(); continue; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) @@ -797,6 +797,13 @@ _rw_assert(rw, RA_RLOCKED, file, line); /* + * If we have multiple readers, just fail without doing any + * atomic operations. + */ + if (RW_READERS(rw->rw_lock) > 1) + return (0); + + /* * Attempt to switch from one reader to a writer. If there * are any write waiters, then we will have to lock the * turnstile first to prevent races with another writer --- //depot/projects/smpng/sys/kern/kern_shutdown.c 2009/04/07 17:48:51 +++ //depot/user/jhb/lock/kern/kern_shutdown.c 2009/04/07 19:19:09 @@ -115,6 +115,12 @@ SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); +#ifdef INVARIANT_SUPPORT +int debugger_on_caught_panic = 0; +SYSCTL_INT(_debug, OID_AUTO, debugger_on_caught_panic, CTLFLAG_RW, + &debugger_on_caught_panic, 0, "Run debugger on caught kernel panic"); +#endif + SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment"); /* @@ -533,6 +539,21 @@ if (panicstr) bootopt |= RB_NOSYNC; else { +#ifdef INVARIANT_SUPPORT + if (td->td_pflags & TDP_CATCHPANIC) { + td->td_pflags &= ~TDP_CATCHPANIC; + td->td_caught_panic = fmt; + if (debugger_on_caught_panic) { + va_start(ap, fmt); + printf("caught panic: "); + vprintf(fmt, ap); + printf("\n"); + va_end(ap); + kdb_enter(NULL, "caught panic"); + } + longjmp(td->td_panic_buf, 1); + } +#endif panicstr = fmt; newpanic = 1; } --- //depot/projects/smpng/sys/kern/kern_thread.c 2009/03/19 16:52:48 +++ //depot/user/jhb/lock/kern/kern_thread.c 2009/03/19 17:44:47 @@ -35,6 +35,7 @@ #include #include #include +#include 
#include #include #include @@ -48,6 +49,7 @@ #include #include +#include #include #include @@ -164,6 +166,9 @@ td->td_sched = (struct td_sched *)&td[1]; umtx_thread_init(td); td->td_kstack = 0; +#ifdef INVARIANT_SUPPORT + td->td_panic_buf = malloc(sizeof(struct _jmp_buf), M_SUBPROC, M_WAITOK); +#endif return (0); } @@ -181,6 +186,9 @@ sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); seltdfini(td); +#ifdef INVARIANT_SUPPORT + free(td->td_panic_buf, M_SUBPROC); +#endif } /* --- //depot/projects/smpng/sys/kern/kern_time.c 2009/04/14 19:06:19 +++ //depot/user/jhb/lock/kern/kern_time.c 2009/04/14 19:48:26 @@ -364,6 +364,7 @@ timespecadd(&ts, rqt); TIMESPEC_TO_TIMEVAL(&tv, rqt); for (;;) { + /* XXX: pause_catch()? */ error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp", tvtohz(&tv)); getnanouptime(&ts2); --- //depot/projects/smpng/sys/kern/sched_4bsd.c 2009/01/26 15:26:58 +++ //depot/user/jhb/lock/kern/sched_4bsd.c 2009/05/01 19:05:35 @@ -1534,6 +1534,7 @@ sched_idletd(void *dummy) { + THREAD_NO_SLEEPING(); for (;;) { mtx_assert(&Giant, MA_NOTOWNED); --- //depot/projects/smpng/sys/kern/sched_ule.c 2009/05/08 11:53:25 +++ //depot/user/jhb/lock/kern/sched_ule.c 2009/05/08 14:17:47 @@ -2537,6 +2537,7 @@ mtx_assert(&Giant, MA_NOTOWNED); td = curthread; tdq = TDQ_SELF(); + THREAD_NO_SLEEPING(); for (;;) { #ifdef SMP if (tdq_idled(tdq) == 0) --- //depot/projects/smpng/sys/kern/subr_turnstile.c 2008/09/17 20:27:47 +++ //depot/user/jhb/lock/kern/subr_turnstile.c 2009/05/01 19:05:35 @@ -678,6 +678,7 @@ if (owner) MPASS(owner->td_proc->p_magic == P_MAGIC); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); + KASSERT(!TD_IS_IDLETHREAD(td), ("idle threads cannot block on locks")); /* * If the lock does not already have a turnstile, use this thread's --- //depot/projects/smpng/sys/kern/subr_witness.c 2009/05/20 17:20:32 +++ //depot/user/jhb/lock/kern/subr_witness.c 2009/05/20 17:51:14 @@ -824,7 +824,7 @@ * it to the pending_locks list. 
If it is not too early, then enroll * the lock now. */ - if (witness_watch < 1 || panicstr != NULL || + if (witness_watch == -1 || panicstr != NULL || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { @@ -1481,7 +1481,8 @@ register_t s; int i, j; - if (witness_cold || lock->lo_witness == NULL || panicstr != NULL) + if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL || + panicstr != NULL) return; td = curthread; class = LOCK_CLASS(lock); @@ -1500,17 +1501,8 @@ goto found; } - /* - * When disabling WITNESS through witness_watch we could end up in - * having registered locks in the td_sleeplocks queue. - * We have to make sure we flush these queues, so just search for - * eventual register locks and remove them. - */ - if (witness_watch > 0) - panic("lock (%s) %s not locked @ %s:%d", class->lc_name, - lock->lo_name, file, line); - else - return; + panic("lock (%s) %s not locked @ %s:%d", class->lc_name, + lock->lo_name, file, line); found: /* First, check for shared/exclusive mismatches. */ @@ -1585,7 +1577,7 @@ int i, n; lle = td->td_sleeplocks; - if (lle == NULL || panicstr != NULL) + if (lle == NULL || witness_watch == -1 || panicstr != NULL) return; if (lle->ll_count != 0) { for (n = 0; lle != NULL; lle = lle->ll_next) --- //depot/projects/smpng/sys/kern/uipc_sockbuf.c 2009/05/08 11:53:25 +++ //depot/user/jhb/lock/kern/uipc_sockbuf.c 2009/05/08 14:17:47 @@ -610,45 +610,36 @@ /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header - * with total length. Returns 0 if no space in sockbuf or insufficient - * mbufs. + * with total length. Returns 0 if no space in sockbuf. The sockaddr + * should be present in the MT_SONAME mbuf 'sa'. 
*/ int -sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, - struct mbuf *m0, struct mbuf *control) +sbappendaddr_mbuf(struct sockbuf *sb, struct mbuf *sa, struct mbuf *m0, + struct mbuf *control) { - struct mbuf *m, *n, *nlast; - int space = asa->sa_len; + struct mbuf *n, *nlast; + int space = sa->m_len; SOCKBUF_LOCK_ASSERT(sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) - panic("sbappendaddr_locked"); + panic("sbappendaddr_mbuf"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &n); if (space > sbspace(sb)) return (0); -#if MSIZE <= 256 - if (asa->sa_len > MLEN) - return (0); -#endif - MGET(m, M_DONTWAIT, MT_SONAME); - if (m == 0) - return (0); - m->m_len = asa->sa_len; - bcopy(asa, mtod(m, caddr_t), asa->sa_len); if (n) n->m_next = m0; /* concatenate data to control */ else control = m0; - m->m_next = control; - for (n = m; n->m_next != NULL; n = n->m_next) + sa->m_next = control; + for (n = sa; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; - SBLINKRECORD(sb, m); + SBLINKRECORD(sb, sa); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); @@ -664,6 +655,25 @@ * mbufs. */ int +sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, + struct mbuf *m0, struct mbuf *control) +{ + struct mbuf *m; + + SOCKBUF_LOCK_ASSERT(sb); + m = soputsockaddr(asa); + if (m == NULL) + return (0); + return (sbappendaddr_mbuf(sb, m, m0, control)); +} + +/* + * Append address and data, and optionally, control (ancillary) data to the + * receive queue of a socket. If present, m0 must include a packet header + * with total length. Returns 0 if no space in sockbuf or insufficient + * mbufs. + */ +int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { --- //depot/projects/smpng/sys/kern/uipc_socket.c 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/kern/uipc_socket.c 2009/05/13 17:57:30 @@ -3146,6 +3146,26 @@ } /* + * Make a copy of a sockaddr in an mbuf. 
+ */ +struct mbuf * +soputsockaddr(const struct sockaddr *asa) +{ + struct mbuf *m; + +#if MSIZE <= 256 + if (asa->sa_len > MLEN) + return (NULL); +#endif + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == NULL) + return (NULL); + m->m_len = asa->sa_len; + bcopy(asa, mtod(m, caddr_t), asa->sa_len); + return (m); +} + +/* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate --- //depot/projects/smpng/sys/modules/crash/crash.c 2008/08/07 20:34:04 +++ //depot/user/jhb/lock/modules/crash/crash.c 2009/02/02 18:40:33 @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,8 @@ static struct thread *kthread; static int event; +SYSCTL_NODE(_debug, OID_AUTO, crash, CTLFLAG_RD, 0, "crash tree"); + static int mod_event(struct module *module, int cmd, void *arg); static int load(void *arg); static int unload(void *arg); @@ -90,6 +93,86 @@ /* Events. 
*/ static void +sysctl_tree(void) +{ + struct sysctl_ctx_list ctx; + struct sysctl_oid *oidp; + int i; + + for (i = 0; i < 1000000; i++) { + sysctl_ctx_init(&ctx); + oidp = SYSCTL_ADD_NODE(&ctx, + SYSCTL_STATIC_CHILDREN(_debug_crash), OID_AUTO, "tree", + CTLFLAG_RD, NULL, "sysctl_tree tree"); + SYSCTL_ADD_INT(&ctx, SYSCTL_CHILDREN(oidp), OID_AUTO, "max", + CTLFLAG_RD, &event_max, 0, "event_max"); + DELAY(100000); + sysctl_ctx_free(&ctx); + } +} +CRASH_EVENT("add and remove dynamic sysctls", sysctl_tree); + +#ifdef WITNESS +static void +witness_interlock(void) +{ + struct lock lk; + struct mtx ilock; + + mtx_init(&ilock, "ilock", NULL, MTX_DEF | MTX_RECURSE); + lockinit(&lk, PZERO, "lk", 0, 0); + + printf("This should not cause a LOR report (interlock)...\n"); + mtx_lock(&ilock); + lockmgr(&lk, LK_EXCLUSIVE | LK_INTERLOCK, &ilock); + lockmgr(&lk, LK_RELEASE, 0); + + printf("This should cause a LOR report (no interlock)...\n"); + mtx_lock(&ilock); + lockmgr(&lk, LK_EXCLUSIVE, NULL); + mtx_unlock(&ilock); + lockmgr(&lk, LK_RELEASE, 0); + + printf("This should cause a LOR report (recursed interlock)...\n"); + mtx_lock(&ilock); + mtx_lock(&ilock); + lockmgr(&lk, LK_EXCLUSIVE | LK_INTERLOCK, &ilock); + mtx_unlock(&ilock); + lockmgr(&lk, LK_RELEASE, 0); + + lockdestroy(&lk); + mtx_destroy(&ilock); +} +CRASH_EVENT("test handling of lockmgr interlocks with witness", + witness_interlock); +#endif + +static int Giant_wchan; + +static void +sleeping_with_unlocked_Giant_interlock(void) +{ + struct cv cv; + + cv_init(&cv, "Giant"); + PANIC_TRY { + cv_timedwait(&cv, &Giant, 5); + } PANIC_CATCH { + sleepq_release(&cv); + IGNORE_PANIC_STARTS_WITH("mutex %s not owned"); + } PANIC_END; + cv_destroy(&cv); + PANIC_TRY { + mtx_sleep(&Giant_wchan, &Giant, 0, "Giant", 5); + } PANIC_CATCH { + sleepq_release(&Giant_wchan); + IGNORE_PANIC_STARTS_WITH("mutex %s not owned"); + } PANIC_END; +} +CRASH_EVENT("using unlocked Giant as sleep interlock", + sleeping_with_unlocked_Giant_interlock); + 
+static void broadcast(void *cv) { @@ -113,8 +196,6 @@ } CRASH_EVENT("try to cv_wait_unlock() with Giant", wait_unlock_Giant); -static int Giant_wchan; - static void pdrop_Giant(void) { @@ -768,7 +849,7 @@ { sx_slock(&foo); if (sx_try_upgrade(&foo) == 0) { - printf("crash: umm, upgrade failed?\n"); + panic("crash: umm, upgrade failed?\n"); sx_sunlock(&foo); } else sx_xunlock(&foo); @@ -788,7 +869,14 @@ upgrade_excl_foo(void) { sx_xlock(&foo); - sx_try_upgrade(&foo); + PANIC_TRY { + sx_try_upgrade(&foo); + } PANIC_CATCH { + IGNORE_PANIC_STARTS_WITH("Lock %s not %slocked"); + /* WITNESS */ + IGNORE_PANIC_STARTS_WITH("Lock (%s) %s exclusively locked"); + } PANIC_END; + sx_xunlock(&foo); } CRASH_EVENT("xlock foo, upgrade", upgrade_excl_foo); @@ -796,7 +884,14 @@ downgrade_shared_foo(void) { sx_slock(&foo); - sx_downgrade(&foo); + PANIC_TRY { + sx_downgrade(&foo); + } PANIC_CATCH { + IGNORE_PANIC_STARTS_WITH("Lock %s not %slocked"); + /* WITNESS */ + IGNORE_PANIC_STARTS_WITH("Lock (%s) %s not exclusively locked"); + } PANIC_END; + sx_sunlock(&foo); } CRASH_EVENT("slock foo, downgrade", downgrade_shared_foo); @@ -805,7 +900,14 @@ { sx_slock(&foo); sx_try_upgrade(&foo); - sx_sunlock(&foo); + PANIC_TRY { + sx_sunlock(&foo); + } PANIC_CATCH { + IGNORE_PANIC_STARTS_WITH("Lock %s not %slocked"); + /* WITNESS */ + IGNORE_PANIC_STARTS_WITH("Lock (%s) %s exclusively locked"); + } PANIC_END; + sx_xunlock(&foo); } CRASH_EVENT("slock foo, upgrade, sunlock", sunlock_upgraded_foo); @@ -814,24 +916,37 @@ { sx_xlock(&foo); sx_downgrade(&foo); - sx_xunlock(&foo); + PANIC_TRY { + sx_xunlock(&foo); + } PANIC_CATCH { + IGNORE_PANIC_STARTS_WITH("Lock %s not %slocked"); + /* WITNESS */ + IGNORE_PANIC_STARTS_WITH("Lock (%s) %s not exclusively locked"); + } PANIC_END; + sx_sunlock(&foo); + } CRASH_EVENT("xlock foo, downgrade, xunlock", xunlock_downgraded_foo); static void double_mtx_init(void) { - kdb_enter(KDB_WHY_CRASH, "about to init again"); - mtx_init(&test_mtx, "test", NULL, MTX_DEF); - 
kdb_enter(KDB_WHY_CRASH, "if we haven't panic'd by now, ouch. :("); - mtx_destroy(&test_mtx); + PANIC_TRY { + mtx_init(&test_mtx, "test", NULL, MTX_DEF); + } PANIC_CATCH { + IGNORE_PANIC("lock \"%s\" %p already initialized"); + } PANIC_END; } CRASH_EVENT("re-init of test_mtx", double_mtx_init); static void test_mtx_assert(void) { - mtx_assert(&Giant, MA_OWNED); + PANIC_TRY { + mtx_assert(&Giant, MA_OWNED); + } PANIC_CATCH { + IGNORE_PANIC_STARTS_WITH("mutex %s not owned"); + } PANIC_END; } CRASH_EVENT("assert that Giant is locked while it is unlocked", test_mtx_assert); @@ -840,7 +955,13 @@ test_sx_assert_slocked(void) { sx_xlock(&foo); - sx_assert(&foo, SX_SLOCKED); + PANIC_TRY { + sx_assert(&foo, SX_SLOCKED); + } PANIC_CATCH { + IGNORE_PANIC_STARTS_WITH("Lock %s not %slocked"); + /* WITNESS */ + IGNORE_PANIC_STARTS_WITH("Lock (%s) %s exclusively locked"); + } PANIC_END; sx_xunlock(&foo); } CRASH_EVENT("assert that foo is slocked while it is xlocked", @@ -857,30 +978,6 @@ } CRASH_EVENT("lock test, slock foo, sunlock foo, unlock test", test_sx_and_mtx_order); - -static void -test_witness_removal(void) -{ - bzero(&test1_mtx, sizeof(test1_mtx)); - bzero(&test2_mtx, sizeof(test2_mtx)); - mtx_init(&test1_mtx, "test1", NULL, MTX_DEF); - mtx_init(&test2_mtx, "test2", NULL, MTX_DEF); - kdb_enter(KDB_WHY_CRASH, "no order yet"); - mtx_lock(&Giant); - mtx_lock(&test1_mtx); - mtx_lock(&test2_mtx); - mtx_unlock(&test2_mtx); - mtx_unlock(&test1_mtx); - mtx_unlock(&Giant); - kdb_enter(KDB_WHY_CRASH, "test1 and test2 should be ordered"); - mtx_destroy(&test1_mtx); - kdb_enter(KDB_WHY_CRASH, - "test1 should be gone, test2 should be after Giant"); - mtx_destroy(&test2_mtx); - kdb_enter(KDB_WHY_CRASH, "test1 and test2 should be gone"); -} -CRASH_EVENT("use test1 and test2 mutexes to test witness removal", - test_witness_removal); #endif static void @@ -1035,7 +1132,11 @@ printf("Should panic\n"); THREAD_NO_SLEEPING(); - pause("sleep", 1); + PANIC_TRY { + pause("sleep", 1); + } 
PANIC_CATCH { + IGNORE_PANIC("Trying sleep, but thread marked as sleeping prohibited"); + } PANIC_END; THREAD_SLEEPING_OK(); } CRASH_EVENT("sleep while sleeping is prohibited", test_no_sleeping); @@ -1047,13 +1148,31 @@ printf("Should panic\n"); bzero(&test1_mtx, sizeof(test1_mtx)); mtx_init(&test1_mtx, "test1", NULL, MTX_SPIN | MTX_RECURSE); - if (mtx_trylock(&test1_mtx)) - printf("Hmm, locked!\n"); - else - printf("Not locked\n"); + PANIC_TRY { + if (mtx_trylock(&test1_mtx)) + printf("Hmm, locked!\n"); + else + printf("Not locked\n"); + } PANIC_CATCH { + IGNORE_PANIC_STARTS_WITH("mtx_trylock() of spin mutex"); + } PANIC_END; + mtx_destroy(&test1_mtx); } CRASH_EVENT("try lock on a spin mutex", test_trylock_spin); +static void +test_passert(void) +{ + + printf("Should panic on \"foo\"\n"); + PANIC_TRY { + panic("foo"); + } PANIC_CATCH { + IGNORE_PANIC("foo"); + } PANIC_END; +} +CRASH_EVENT("basic test of catching panics", test_passert); + /* Help event should be last so that it is always event 1. 
*/ static void @@ -1078,8 +1197,6 @@ } CRASH_EVENT(NULL, nop); -SYSCTL_NODE(_debug, OID_AUTO, crash, CTLFLAG_RD, 0, "crash tree"); - static int sysctl_debug_crash_test(SYSCTL_HANDLER_ARGS) { --- //depot/projects/smpng/sys/netinet/udp_usrreq.c 2009/05/08 11:53:25 +++ //depot/user/jhb/lock/netinet/udp_usrreq.c 2009/05/08 14:17:47 @@ -207,7 +207,7 @@ { struct sockaddr *append_sa; struct socket *so; - struct mbuf *opts = 0; + struct mbuf *opts = 0, *msa; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif @@ -229,15 +229,6 @@ return; } #endif - if (inp->inp_flags & INP_CONTROLOPTS || - inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6) - (void)ip6_savecontrol_v4(inp, n, &opts, NULL); - else -#endif - ip_savecontrol(inp, &opts, ip, n); - } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); @@ -248,14 +239,29 @@ } else #endif append_sa = (struct sockaddr *)udp_in; + msa = soputsockaddr(append_sa); + if (msa == NULL) { + m_freem(n); + return; + } + if (inp->inp_flags & INP_CONTROLOPTS || + inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + (void)ip6_savecontrol_v4(inp, n, &opts, NULL); + else +#endif + ip_savecontrol(inp, &opts, ip, n); + } m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); - if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { + if (sbappendaddr_mbuf(&so->so_rcv, msa, n, opts) == 0) { INIT_VNET_INET(so->so_vnet); SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); + m_freem(msa); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); --- //depot/projects/smpng/sys/notes 2009/02/18 22:05:55 +++ //depot/user/jhb/lock/notes 2009/03/11 22:14:12 @@ -73,3 +73,214 @@ - jhb_socket - socket hacking Space reserved for child branches: +- Fix show witness output to use paging. +- Add a MTX_NEW flag to bypass the LO_INITIALIZED check. Here be dragons. +- Add a MTX_LEAF flag with LO_LEAF flag in WITNESS. 
+- Add a set of mtx_lock_if_sleep()/mtx_unlock_if_sleep() pair of functions + that use a per-thread array (until the array gets full, at which point + it does the normal unlock/lock). This would be useful to do things like + + mtx_unlock_if_sleep(&foo); + malloc(blah, M_WAITOK); + mtx_lock_if_sleep(&foo); + + rather than: + + mtx_unlock(&foo); + malloc(blah, M_WAITOK); + mtx_lock(&foo); + + where foo is only unlocked if malloc() actually blocks. This is purely + an optimization. +- Allow witness to output witness tree via sysctl or some such +- Allow witness to output witness tree via sysctl in dot friendly format +- Split critical section from spinlocks + - inline critical_enter? + - move prototypes to sys/proc.h if so +- Try an experiment on UP where we replace normal mutexes with critical_* + similar to using spinlock_* for spin mutexes thus effectively giving us + a two-layer spl scheme. +- Reader/writer locks + - add "owner of record" where first read locker gets priority propagations + until they drop the lock +- Stephan's VFS shared locking changes: + - shared locks should be ok for VOP_ACCESS() and VOP_GETATTR() + + VOP_READLINK()? + + cd9660 + + zfs + + udf + + nfs + + ffs + + VOP_READDIR()? (getdirentries()) + + cd9660 + + zfs + + udf + + nfs + + ffs + - VOP_VPTOFH()? (lgetfh() and getfh()) + - VOP_FHTOVP() should return a share-locked vnode or take a locking flag? + - system calls + - lookups + - vnode locks + - VOP_MARKATIME() for zfs can just use ZFS_ACCESSTIME_STAMP() +- Lock some filesystems + - devfs + - shared vnode locks + - VFS_ROOT + - VFS_VGET + + VOP_READ + + VOP_GETDIRENTRIES + + VOP_ACCESS + + VOP_GETATTR + + VOP_READLINK + - see if there are places dm_lock can be shared instead of xlocked + - devfs_vptocn + - LOOKUP_SHARED + - EXTENDED_SHARED? + - msdos + - MPSAFE + - denode (v_data) fields + - mount fields + - FAT related fields + - pm_fmod? 
+ - pm_fsinfo + - pm_nextfree + - pm_inusemap[] + - fileno RB tree + - pm_nfileno + - pm_filenos + + msdosfs_conv.c + - msdosfs_denode.c + - msdosfs_fat.c + - msdosfs_fileno.c + + msdosfs_iconv.c + - msdosfs_lookup.c + - msdosfs_vfsops.c + - msdosfs_vnops.c + - LOOKUP_SHARED + - EXTENDED_SHARED +- NFS fun + + caching credentials across UIDs + - needs testing + - v_dd (kan has patch for this) +- Make all filesystems use vfs_getnewfsid() +- Locking notes on namei() and lookup(): + - namei(): + VREF(dp) + for (;;) { + ni_startdir = dp; + lookup(); + if (error) + no references or locks + if (!symlink) + return (0) + read symlink and build path + if (error) + break + vput(ni_vp) + dp = ni_dvp + } + vput(ni_vp) + vrele(ni_dvp) + - so lookup() is called with ni_startdir referenced, but not locked. On + return, ni_startdir's reference is always released. On error no other + vnodes are referenced or locked. On success, ni_vp is locked and + referenced and ni_dvp is referenced (symlink). + - lookup(): + dp = ni_startdir + ni_startdir = NULL + ni_dvp = NULL + vn_lock(dp) + if (doomed) goto bad + dirloop: + if (degenerate name) + if (error) goto bad; + if (wantparent) + ni_dvp = dp + VREF(dp); + ni_vp = dp; + if (!LOCKPARENT | LOCKLEAF) + VOP_UNLOCK(dp) + /* ni_dvp is ref'd (locked if LOCKPARENT), ni_vp is ref'd and locked (if LOCKLEAF) */ + goto success + if (ISDOTDOT) + for (;;) + if (hitroot) + ni_dvp = dp + VREF(dp) + ni_vp = dp + goto nextname + if (!mountpoint root) + break + update dp for mountpoint below vnode + if (doomed) goto bad + unionlookup: + ni_dvp = dp + ni_vp = NULL + if (ISLASTCN) + upgrade dp + if (doomed) goto bad + VOP_LOOKUP(dp, &ni_vp) + if (error) + if (ENOENT && union case) + switch dp to covered vnode + if (doomed) goto bad + goto unionlookup; + if (error) goto bad + if (!LOCKPARENT) + VOP_UNLOCK(dp) /* == ni_dvp */ + /* ni_vp is NULL, ni_dvp (dp) is ref'd (locked if LOCKPARENT) */ + goto success; + dp = ni_vp + if (dp is mountpoint) + vput(dp) + 
vput(ni_dvp) + ni_dvp = vp_crossmp + vn_lock(ni_dvp) + VFS_ROOT(&tdp) + if (error) + /* ni_dvp is ref'd and locked, but ni_vp and dp are not and invalid */ + dpunlocked = 1 + goto bad2 + ni_vp = dp = tdp + if (symlink) + if (error) goto bad2 + if (ni_dvp != ni_vp) + VOP_UNLOCK(ni_dvp) + /* ni_dvp is ref'd, ni_vp is locked and ref'd. ni_dvp is never locked + because symlink code in namei() assumes it isn't */ + goto success + nextname: + /* ni_vp (dp) is locked and ref'd, ni_dvp is ref'd and locked if + ni_dvp != dp */ + if (another component) + vput(ni_dvp) + goto dirloop /* dp (ni_vp) is now the parent dir for next lookup */ + if (EROFS) goto bad + if (SAVESTART) + ni_startdir = ni_dvp + VREF(ni_startdir) + if (!wantparent) + vput(ni_dvp) + if (!LOCKPARENT and dp != ni_dvp) + VOP_UNLOCK(ni_dvp) + if (!LOCKLEAF) + VOP_UNLOCK(dp) /* ni_vp */ + + success: + /* + * If ni_dvp is != NULL, should be ref'd and locked if LOCKPARENT. + * If ni_vp is != NULL, should be ref'd and locked if LOCKLEAF. + * dp == ni_vp unless ni_vp == NULL (CREATE) when dp == ni_dvp + */ + if (need upgrade of leaf) + vn_lock(dp) /* ni_vp */ + if (error) + /* XXX: leak ni_startdir if SAVESTART? */ + goto bad + return (0) + + + - VOP_LOOKUP(dp, &ni_vp) is called with dp locked. On error, nothing + changes. On success, ni_vp is locked and referenced and dp remains + unchanged. --- //depot/projects/smpng/sys/sys/proc.h 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/sys/proc.h 2009/05/13 17:57:30 @@ -168,6 +168,7 @@ struct kdtrace_proc; struct kdtrace_thread; struct cpuset; +struct _jmp_buf; /* * Kernel runnable context (thread). @@ -234,6 +235,7 @@ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ struct osd td_osd; /* (k) Object specific data. */ + const char *td_caught_panic; /* (k) Caught panic. */ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall(). 
*/ @@ -277,6 +279,7 @@ int td_errno; /* Error returned by last syscall. */ struct vnet *td_vnet; /* (*) Effective vnet. */ const char *td_vnet_lpush; /* (*) Debugging vnet push / pop. */ + struct _jmp_buf *td_panic_buf; /* (k) Jump buffer for PANIC_CATCH(). */ }; struct mtx *thread_lock_block(struct thread *); @@ -369,6 +372,7 @@ #define TDP_CALLCHAIN 0x00400000 /* Capture thread's callchain */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ +#define TDP_CATCHPANIC 0x02000000 /* Catching panics with PANIC_CATCH */ /* * Reasons that the current thread can not be run yet. --- //depot/projects/smpng/sys/sys/sockbuf.h 2008/08/05 21:26:01 +++ //depot/user/jhb/lock/sys/sockbuf.h 2008/09/30 18:25:15 @@ -121,6 +121,8 @@ void sbappend_locked(struct sockbuf *sb, struct mbuf *m); void sbappendstream(struct sockbuf *sb, struct mbuf *m); void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m); +int sbappendaddr_mbuf(struct sockbuf *sb, struct mbuf *sa, struct mbuf *m0, + struct mbuf *control); int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, --- //depot/projects/smpng/sys/sys/socketvar.h 2009/05/08 11:53:25 +++ //depot/user/jhb/lock/sys/socketvar.h 2009/05/08 14:17:47 @@ -335,6 +335,7 @@ struct thread *td); int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td); +struct mbuf *soputsockaddr(const struct sockaddr *asa); int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_dgram(struct socket *so, struct sockaddr **paddr, --- //depot/projects/smpng/sys/sys/systm.h 2009/05/19 13:40:43 +++ //depot/user/jhb/lock/sys/systm.h 2009/05/20 17:51:14 @@ -71,6 +71,46 @@ if (__predict_false(!(exp))) \ panic msg; \ } while (0) + 
+#define PANIC_TRY do { \ + switch (setjmp(curthread->td_panic_buf)) { \ + case 0: \ + curthread->td_pflags |= TDP_CATCHPANIC; + +#define PANIC_CATCH \ + curthread->td_pflags &= ~TDP_CATCHPANIC; \ + panic("Expected panic did not trigger"); \ + break; \ + case 1: \ + curthread->td_pflags &= ~TDP_CATCHPANIC; + +#define IGNORE_PANIC(panicstr) \ + if (strcmp(curthread->td_caught_panic, (panicstr)) == \ + 0) { \ + printf("Caught expected panic '%s'\n", \ + curthread->td_caught_panic); \ + break; \ + } + +#define IGNORE_PANIC_STARTS_WITH(panicstr) \ + if (strncmp(curthread->td_caught_panic, (panicstr), \ + strlen((panicstr))) == 0) { \ + printf("Caught expected panic '%s'\n", \ + curthread->td_caught_panic); \ + break; \ + } + +#define PANIC_END \ + panic("Unexpected panic '%s'", \ + curthread->td_caught_panic); \ + break; \ + default: \ + curthread->td_pflags &= ~TDP_CATCHPANIC; \ + panic("Unexpected return value from setjmp()"); \ + } \ + curthread->td_pflags &= ~TDP_CATCHPANIC; \ +} while (0) + #define VNASSERT(exp, vp, msg) do { \ if (__predict_false(!(exp))) { \ vn_printf(vp, "VNASSERT failed\n"); \ @@ -81,6 +121,14 @@ #define KASSERT(exp,msg) do { \ } while (0) +#define PANIC_TRY \ + while (0) { +#define PANIC_CATCH +#define IGNORE_PANIC(panicstr) /* discards its argument */ +#define IGNORE_PANIC_STARTS_WITH(panicstr) /* discards its argument */ +#define PANIC_END \ + } + #define VNASSERT(exp, vp, msg) do { \ } while (0) #endif --- //depot/projects/smpng/sys/ufs/ffs/ffs_vfsops.c 2009/05/13 13:56:17 +++ //depot/user/jhb/lock/ufs/ffs/ffs_vfsops.c 2009/05/13 17:57:30 @@ -278,7 +278,7 @@ * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. 
*/ - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(devvp, LK_SHARED | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) --- //depot/projects/smpng/sys/ufs/ufs/ufs_lookup.c 2009/05/08 11:53:25 +++ //depot/user/jhb/lock/ufs/ufs/ufs_lookup.c 2009/05/08 14:17:47 @@ -472,6 +472,7 @@ */ if (i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) { ufs_dirbad(dp, i_offset, "i_size too small"); + /* XXX: This needs an exclusive lock, but we panic above. */ dp->i_size = i_offset + DIRSIZ(OFSFMT(vdp), ep); DIP_SET(dp, i_size, dp->i_size); dp->i_flag |= IN_CHANGE | IN_UPDATE;