diff -urN /Projects/clean/src/lib/libc/gen/lockf.c /Projects/M4/src/lib/libc/gen/lockf.c --- /Projects/clean/src/lib/libc/gen/lockf.c 2008-01-19 15:54:31.000000000 +0000 +++ /Projects/M4/src/lib/libc/gen/lockf.c 2008-02-12 09:57:16.000000000 +0000 @@ -74,7 +74,7 @@ fl.l_type = F_WRLCK; if (_fcntl(filedes, F_GETLK, &fl) == -1) return (-1); - if (fl.l_type == F_UNLCK || fl.l_pid == getpid()) + if (fl.l_type == F_UNLCK || (fl.l_sysid == 0 && fl.l_pid == getpid())) return (0); errno = EAGAIN; return (-1); diff -urN /Projects/clean/src/lib/libc/sys/fcntl.2 /Projects/M4/src/lib/libc/sys/fcntl.2 --- /Projects/clean/src/lib/libc/sys/fcntl.2 2008-01-19 15:54:32.000000000 +0000 +++ /Projects/M4/src/lib/libc/sys/fcntl.2 2008-02-12 09:57:18.000000000 +0000 @@ -177,6 +177,7 @@ pid_t l_pid; /* lock owner */ short l_type; /* lock type: read/write, etc. */ short l_whence; /* type of l_start */ + int l_sysid; /* remote system id or zero for local */ }; .Ed The commands available for advisory record locking are as follows: @@ -264,9 +265,13 @@ means end edge of the region. The .Fa l_pid -field is only used with +and +.Fa l_sysid +fields are only used with .Dv F_GETLK -to return the process ID of the process holding a blocking lock. +to return the process ID of the process holding a blocking lock and +the system ID of the system that owns that process. +Locks created by the local system will have a system ID of zero. After a successful .Dv F_GETLK request, the value of diff -urN /Projects/clean/src/sys/compat/linux/linux_file.c /Projects/M4/src/sys/compat/linux/linux_file.c --- /Projects/clean/src/sys/compat/linux/linux_file.c 2008-01-19 15:54:38.000000000 +0000 +++ /Projects/M4/src/sys/compat/linux/linux_file.c 2008-02-12 09:56:43.000000000 +0000 @@ -1051,6 +1051,7 @@ bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; + bsd_flock->l_sysid = 0; } static void @@ -1107,6 +1108,7 @@ bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; + bsd_flock->l_sysid = 0; } static void diff -urN /Projects/clean/src/sys/compat/svr4/svr4_fcntl.c /Projects/M4/src/sys/compat/svr4/svr4_fcntl.c --- /Projects/clean/src/sys/compat/svr4/svr4_fcntl.c 2008-01-19 15:54:40.000000000 +0000 +++ /Projects/M4/src/sys/compat/svr4/svr4_fcntl.c 2008-02-12 09:56:44.000000000 +0000 @@ -191,7 +191,7 @@ oflp->l_start = (off_t) iflp->l_start; oflp->l_len = (off_t) iflp->l_len; oflp->l_pid = (pid_t) iflp->l_pid; - + oflp->l_sysid = iflp->l_sysid; } static void @@ -217,7 +217,7 @@ oflp->l_whence = (short) iflp->l_whence; oflp->l_start = (svr4_off64_t) iflp->l_start; oflp->l_len = (svr4_off64_t) iflp->l_len; - oflp->l_sysid = 0; + oflp->l_sysid = iflp->l_sysid; oflp->l_pid = (svr4_pid_t) iflp->l_pid; } diff -urN /Projects/clean/src/sys/conf/options /Projects/M4/src/sys/conf/options --- /Projects/clean/src/sys/conf/options 2008-01-19 15:43:49.000000000 +0000 +++ /Projects/M4/src/sys/conf/options 2008-02-12 09:56:44.000000000 +0000 @@ -54,6 +54,7 @@ KDB_TRACE opt_kdb.h KDB_UNATTENDED opt_kdb.h SYSCTL_DEBUG opt_sysctl.h +ADVLOCKASYNC_TESTING opt_global.h NO_SYSCTL_DESCR opt_global.h diff -urN /Projects/clean/src/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c /Projects/M4/src/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c --- /Projects/clean/src/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 2008-01-19 15:43:55.000000000 +0000 +++ 
/Projects/M4/src/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 2008-02-12 09:56:50.000000000 +0000 @@ -3547,6 +3547,25 @@ return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size)); } +/* + * Advisory record locking support + */ +static int +zfs_freebsd_advlockasync(ap) + struct vop_advlockasync_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + struct task *a_task; + } */ *ap; +{ + znode_t *zp = VTOZ(ap->a_vp); + + return (lf_advlockasync(ap, &(zp->z_lockf), zp->z_phys->zp_size)); +} + struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; @@ -3580,6 +3599,7 @@ .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_advlock = zfs_freebsd_advlock, + .vop_advlockasync = zfs_freebsd_advlockasync, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = VOP_EOPNOTSUPP, .vop_fid = zfs_freebsd_fid, diff -urN /Projects/clean/src/sys/fs/msdosfs/msdosfs_vnops.c /Projects/M4/src/sys/fs/msdosfs/msdosfs_vnops.c --- /Projects/clean/src/sys/fs/msdosfs/msdosfs_vnops.c 2008-01-19 15:44:13.000000000 +0000 +++ /Projects/M4/src/sys/fs/msdosfs/msdosfs_vnops.c 2008-02-12 09:57:01.000000000 +0000 @@ -83,6 +83,7 @@ * Prototypes for MSDOSFS vnode operations */ static vop_advlock_t msdosfs_advlock; +static vop_advlockasync_t msdosfs_advlockasync; static vop_create_t msdosfs_create; static vop_mknod_t msdosfs_mknod; static vop_open_t msdosfs_open; @@ -1963,6 +1964,22 @@ } static int +msdosfs_advlockasync(ap) + struct vop_advlockasync_args /* { + struct vnode *a_vp; + u_char a_id; + int a_op; + struct flock *a_fl; + int a_flags; + struct task *a_task; + } */ *ap; +{ + struct denode *dep = VTODE(ap->a_vp); + + return (lf_advlockasync(ap, &dep->de_lockf, dep->de_FileSize)); +} + +static int msdosfs_vptofh(ap) struct vop_vptofh_args /* { struct vnode *a_vp; @@ -1987,6 +2004,7 @@ .vop_access = msdosfs_access, .vop_advlock = msdosfs_advlock, + .vop_advlockasync = msdosfs_advlockasync, .vop_bmap = msdosfs_bmap, .vop_cachedlookup = msdosfs_lookup, .vop_open = msdosfs_open, diff -urN /Projects/clean/src/sys/fs/tmpfs/tmpfs_vnops.c /Projects/M4/src/sys/fs/tmpfs/tmpfs_vnops.c --- /Projects/clean/src/sys/fs/tmpfs/tmpfs_vnops.c 2008-01-19 15:44:13.000000000 +0000 +++ /Projects/M4/src/sys/fs/tmpfs/tmpfs_vnops.c 2008-02-12 09:57:01.000000000 +0000 @@ -1446,6 +1446,20 @@ /* --------------------------------------------------------------------- */ static int +tmpfs_advlockasync(struct vop_advlockasync_args *v) +{ + struct vnode *vp = v->a_vp; + + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + + return lf_advlockasync(v, &node->tn_lockf, node->tn_size); +} + +/* --------------------------------------------------------------------- */ + +static int tmpfs_vptofh(struct vop_vptofh_args *ap) { struct tmpfs_fid *tfhp; @@ -1493,6 +1507,7 @@ .vop_print = tmpfs_print, .vop_pathconf = tmpfs_pathconf, .vop_advlock = tmpfs_advlock, + .vop_advlockasync = tmpfs_advlockasync, .vop_vptofh = tmpfs_vptofh, .vop_bmap = VOP_EOPNOTSUPP, }; diff -urN /Projects/clean/src/sys/i386/ibcs2/ibcs2_fcntl.c /Projects/M4/src/sys/i386/ibcs2/ibcs2_fcntl.c --- /Projects/clean/src/sys/i386/ibcs2/ibcs2_fcntl.c 2008-01-19 15:54:41.000000000 +0000 +++ /Projects/M4/src/sys/i386/ibcs2/ibcs2_fcntl.c 2008-02-12 09:57:03.000000000 +0000 @@ -93,7 +93,7 @@ iflp->l_whence = (short)flp->l_whence; iflp->l_start = (ibcs2_off_t)flp->l_start; iflp->l_len = (ibcs2_off_t)flp->l_len; - iflp->l_sysid = 0; + iflp->l_sysid = flp->l_sysid; iflp->l_pid = (ibcs2_pid_t)flp->l_pid; } @@ -127,6 +127,7 @@ 
break; } flp->l_whence = iflp->l_whence; + flp->l_sysid = iflp->l_sysid; } /* convert iBCS2 mode into NetBSD mode */ diff -urN /Projects/clean/src/sys/kern/kern_descrip.c /Projects/M4/src/sys/kern/kern_descrip.c --- /Projects/clean/src/sys/kern/kern_descrip.c 2008-01-19 15:54:42.000000000 +0000 +++ /Projects/M4/src/sys/kern/kern_descrip.c 2008-02-12 09:57:05.000000000 +0000 @@ -69,6 +69,9 @@ #include #include #include +#ifdef ADVLOCKASYNC_TESTING +#include <sys/taskqueue.h> /* XXX for async lock testing */ +#endif #include #include #include @@ -316,28 +319,67 @@ fcntl(struct thread *td, struct fcntl_args *uap) { struct flock fl; + struct oflock ofl; intptr_t arg; int error; + int cmd; error = 0; + cmd = uap->cmd; switch (uap->cmd) { - case F_GETLK: - case F_SETLK: - case F_SETLKW: - error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); + case F_OGETLK: + case F_OSETLK: + case F_OSETLKW: + /* + * Convert old flock structure to new. + */ + error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl)); + fl.l_start = ofl.l_start; + fl.l_len = ofl.l_len; + fl.l_pid = ofl.l_pid; + fl.l_type = ofl.l_type; + fl.l_whence = ofl.l_whence; + fl.l_sysid = 0; + + switch (uap->cmd) { + case F_OGETLK: + cmd = F_GETLK; + break; + case F_OSETLK: + cmd = F_SETLK; + break; + case F_OSETLKW: + cmd = F_SETLKW; + break; + } arg = (intptr_t)&fl; break; + case F_GETLK: + case F_SETLK: + case F_SETLKW: + case F_SETLK_REMOTE: + error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); + arg = (intptr_t)&fl; + break; default: arg = uap->arg; break; } if (error) return (error); - error = kern_fcntl(td, uap->fd, uap->cmd, arg); + error = kern_fcntl(td, uap->fd, cmd, arg); if (error) return (error); - if (uap->cmd == F_GETLK) + if (uap->cmd == F_OGETLK) { + ofl.l_start = fl.l_start; + ofl.l_len = fl.l_len; + ofl.l_pid = fl.l_pid; + ofl.l_type = fl.l_type; + ofl.l_whence = fl.l_whence; + error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl)); + } else if (uap->cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); + } return (error); } @@ -353,11 +395,61 @@ return (fp); } +#ifdef ADVLOCKASYNC_TESTING + +struct async_flock { + struct task af_task; + struct vnode *af_vp; + struct proc *af_proc; + struct flock af_fl; + int af_error; +}; + +/* + * This async callback happens when a lock which was blocking an async + * lock request is removed. We re-attempt the lock and if it succeeds, + * wakeup the client's thread. 
+ */ +extern void kern_fcntl_callback(void *arg, int pending); +void +kern_fcntl_callback(void *arg, int pending) +{ + struct async_flock *af = (struct async_flock *) arg; + struct vnode *vp; + int error; + + mtx_pool_lock(mtxpool_sleep, af); + vp = af->af_vp; + + if (!vp) { + af->af_error = ECANCELED; + mtx_pool_unlock(mtxpool_sleep, af); + return; + } + + mtx_pool_unlock(mtxpool_sleep, af); + + error = VOP_ADVLOCKASYNC(af->af_vp, (caddr_t)af->af_proc, + F_SETLK, &af->af_fl, F_POSIX, &af->af_task); + + mtx_pool_lock(mtxpool_sleep, af); + af->af_error = error; + mtx_pool_unlock(mtxpool_sleep, af); + + if (error != EINPROGRESS) + wakeup(af); +} + +#endif + int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; +#ifdef ADVLOCKASYNC_TESTING + struct async_flock *af = 0; +#endif struct file *fp; struct proc *p; char *pop; @@ -490,11 +582,16 @@ fdrop(fp, td); break; + case F_SETLK_REMOTE: + flg = F_REMOTE; + goto do_setlk; + case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: + do_setlk: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); @@ -524,6 +621,21 @@ fhold(fp); FILEDESC_SUNLOCK(fdp); vp = fp->f_vnode; +#ifdef ADVLOCKASYNC_TESTING + if (flg & F_WAIT) { + /* + * XXX temporary support for testing async lock + * infrastructure. + */ + af = malloc(sizeof(struct async_flock), + M_TEMP, M_WAITOK); + TASK_INIT(&af->af_task, 0, kern_fcntl_callback, af); + af->af_vp = vp; + af->af_proc = p->p_leader; + af->af_fl = *flp; + flg &= ~F_WAIT; + } +#endif vfslocked = VFS_LOCK_GIANT(vp->v_mount); switch (flp->l_type) { case F_RDLCK: @@ -534,8 +646,13 @@ PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); +#ifdef ADVLOCKASYNC_TESTING + error = VOP_ADVLOCKASYNC(vp, (caddr_t)p->p_leader, + F_SETLK, flp, flg, af ? &af->af_task: NULL); +#else error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); +#endif break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { @@ -545,17 +662,134 @@ PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); +#ifdef ADVLOCKASYNC_TESTING + error = VOP_ADVLOCKASYNC(vp, (caddr_t)p->p_leader, + F_SETLK, flp, flg, af ? &af->af_task: NULL); +#else error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); +#endif break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, - flp, F_POSIX); + flp, flg); + break; + case F_UNLCKSYS: + /* + * Temporary api for testing remote lock + * infrastructure. + */ + if (flg != F_REMOTE) { + error = EINVAL; + break; + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } +#ifdef ADVLOCKASYNC_TESTING + /* + * XXX temporary support for testing async lock + * infrastructure. + */ + if (error == EINPROGRESS) { + struct mtx *m = mtx_pool_find(mtxpool_sleep, af); + mtx_lock(m); + error = msleep(af, m, PCATCH, "F_SETLK", 0); + if (error == EINTR) { + /* + * Cancel our async request. This is + * slightly complicated by a potential + * race with our own callback. We deal + * with this as follows: + * + * First, we set af_vp to null - this + * restricts the number of times we + * have to compete with + * kern_fcntl_callback to at most + * twice. + * + * Second, we attempt to cancel the + * lock. Since the vnode interlock + * protects both the cancel and the + * callback trigger in the locking + * code, we are guaranteed that either + * we successfully cancel or that our + * callback has been triggered. 
+ * + * We handle any failure to cancel by + * first ensuring that the callback + * has finished by calling + * taskqueue_drain. We can then + * examine the value of af_error to + * figure out whether we need to + * re-attempt the cancel. + */ + int e; + af->af_vp = NULL; + retry_cancel: + mtx_unlock(m); + e = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_CANCEL, flp, flg); + if (e) { + /* + * We failed to cancel - make + * sure our callback has + * completed before we + * continue. + */ + taskqueue_drain(taskqueue_thread, + &af->af_task); + + mtx_lock(m); + + /* + * If the value of af_error is + * EINPROGRESS, our callback + * has re-registered the async + * lock with the lock manager + * so we must re-attempt the + * cancel. + */ + if (af->af_error == EINPROGRESS) { + goto retry_cancel; + } + + /* + * If we managed to set af_vp + * to null before the + * callback, we will get a + * value of ECANCELED in + * af_error. This means we + * successfully cancelled and + * can report EINTR to the + * caller. + * + * Any other value of af_error + * should be reported to the + * user as it represents the + * success or failure of the + * lock request. + */ + if (af->af_error != ECANCELED) + error = af->af_error; + mtx_unlock(m); + } + } else { + /* + * We were woken up by the callback - + * take our return value from + * af_error. + */ + error = af->af_error; + mtx_unlock(m); + } + free(af, M_TEMP); + } +#endif VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; /* Check for race with close */ diff -urN /Projects/clean/src/sys/kern/kern_lockf.c /Projects/M4/src/sys/kern/kern_lockf.c --- /Projects/clean/src/sys/kern/kern_lockf.c 2008-01-19 15:54:43.000000000 +0000 +++ /Projects/M4/src/sys/kern/kern_lockf.c 2008-02-12 09:57:05.000000000 +0000 @@ -39,6 +39,7 @@ #include #include +#include <sys/hash.h> #include #include #include @@ -50,6 +51,7 @@ #include #include #include +#include <sys/taskqueue.h> /* * This variable controls the maximum number of processes that will @@ -57,6 +59,7 @@ */ static int maxlockdepth = MAXDEPTH; +#define LOCKF_DEBUG #ifdef LOCKF_DEBUG #include @@ -80,36 +83,215 @@ lf_getblock(struct lockf *); static int lf_getlock(struct lockf *, struct flock *); static int lf_setlock(struct lockf *, struct vnode *, struct lockf **); -static void lf_split(struct lockf *, struct lockf *, struct lockf **); -static void lf_wakelock(struct lockf *); +/*static*/ int lf_cancel(struct lockf *, struct lockf **); +static void lf_split(struct lockf *, struct lockf *, struct lockf **, + int addlock); +static void lf_wakelock(struct lockf *, struct lockf **); +static void lf_clearremotesys(int sysid); #ifdef LOCKF_DEBUG static void lf_print(char *, struct lockf *); static void lf_printlist(char *, struct lockf *); +static void lf_print_owner(struct lock_owner *); #endif /* + * This structure is used to keep track of both local and remote lock + * owners. The lf_owner field of the struct lockf points back at the + * lock owner structure. Each possible lock owner (local proc for + * POSIX fcntl locks, local file for BSD flock locks or <pid,sysid> + * pair for remote locks) is represented by a unique instance of + * struct lock_owner. 
+ * + * Locks: + * (l) locked by lf_lock_owners_mutex + * (p) locked by mtx_pool_lock(mtxpool_sleep, lo) + * (c) const until freeing + */ +#define LOCK_OWNER_HASH_SIZE 256 + +struct lock_owner { + LIST_ENTRY(lock_owner) lo_link; /* (l) hash chain */ + int lo_refs; /* (l) Number of locks referring to this */ + int lo_flags; /* (c) Flags passed to lf_advlock */ + caddr_t lo_id; /* (c) Id value passed to lf_advlock */ + pid_t lo_pid; /* (c) Process Id of the lock owner */ + int lo_sysid; /* (c) System Id of the lock owner */ + struct locklist lo_active; /* (p) Active locks for this owner */ + struct locklist lo_pending; /* (p) Pending locks for this owner */ +}; + +LIST_HEAD(lock_owner_list, lock_owner); + +static struct mtx lf_lock_owners_mutex; +static struct lock_owner_list lf_lock_owners[LOCK_OWNER_HASH_SIZE]; /* (l) */ + +/* + * Initialise the lock owner structures. + */ +static void +lf_init(void *dummy) +{ + int i; + + mtx_init(&lf_lock_owners_mutex, "lock owners lock", NULL, MTX_DEF); + for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) + LIST_INIT(&lf_lock_owners[i]); +} +SYSINIT(lf_init, SI_SUB_LOCK, SI_ORDER_FIRST, lf_init, NULL) + +/* + * Generate a hash value for a lock owner. + */ +static int +lf_hash_owner(caddr_t id, struct flock *fl, int flags) +{ + uint32_t h; + + if (flags & F_REMOTE) { + h = HASHSTEP(0, fl->l_pid); + h = HASHSTEP(h, fl->l_sysid); + } else if (flags & F_FLOCK) { + h = ((uintptr_t) id) >> 7; + } else { + struct proc *p = (struct proc *) id; + h = HASHSTEP(0, p->p_pid); + h = HASHSTEP(h, 0); + } + + return (h % LOCK_OWNER_HASH_SIZE); +} + +/* + * Return true if a lock owner matches the details passed to + * lf_advlock. + */ +static int +lf_owner_matches(struct lock_owner *lo, caddr_t id, struct flock *fl, + int flags) +{ + if (flags & F_REMOTE) { + return lo->lo_pid == fl->l_pid + && lo->lo_sysid == fl->l_sysid; + } else { + return lo->lo_id == id; + } +} + +static void +lf_cleanup_lock(struct lockf *lock) +{ + /* + * Adjust the lock_owner reference count and + * reclaim the entry if this is the last lock + * for that owner. + */ + struct lock_owner *lo = lock->lf_owner; + if (lo) { + mtx_lock(&lf_lock_owners_mutex); + lo->lo_refs--; + if (lo->lo_refs == 0) { +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + printf("lf_cleanup_lock: freeing lock owner %p\n", + lo); +#endif + KASSERT(TAILQ_EMPTY(&lo->lo_active), + ("freeing lock owner with active locks")); + KASSERT(TAILQ_EMPTY(&lo->lo_pending), + ("freeing lock owner with pending locks")); + LIST_REMOVE(lo, lo_link); + free(lo, M_LOCKF); + } + mtx_unlock(&lf_lock_owners_mutex); + } +} + +/* * Advisory record locking support */ int -lf_advlock(ap, head, size) - struct vop_advlock_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - } */ *ap; - struct lockf **head; - u_quad_t size; +lf_advlockasync(struct vop_advlockasync_args *ap, struct lockf **head, u_quad_t size) { struct flock *fl = ap->a_fl; struct lockf *lock; struct vnode *vp = ap->a_vp; + caddr_t id = ap->a_id; + int flags = ap->a_flags; + int hash; + struct lock_owner *lo; off_t start, end, oadd; struct lockf *clean, *n; int error; /* + * Handle the F_UNLCKSYS case first - no need to mess about + * creating a lock owner for this one. + */ + if (ap->a_op == F_UNLCKSYS) { + lf_clearremotesys(fl->l_sysid); + return (0); + } + + /* + * Map our arguments to an existing lock owner or create one + * if this is the first time we have seen this owner. 
+ */ + hash = lf_hash_owner(id, fl, flags); + mtx_lock(&lf_lock_owners_mutex); + LIST_FOREACH(lo, &lf_lock_owners[hash], lo_link) + if (lf_owner_matches(lo, id, fl, flags)) + break; + if (!lo) { + /* + * We initialise the lock with a reference + * count of one which refers to the new lockf + * structure created below. + */ + lo = malloc(sizeof(struct lock_owner), + M_LOCKF, M_NOWAIT); + if (!lo) { + mtx_unlock(&lf_lock_owners_mutex); + return (ENOMEM); + } + + lo->lo_refs = 1; + lo->lo_flags = flags; + lo->lo_id = id; + if (flags & F_REMOTE) { + lo->lo_pid = fl->l_pid; + lo->lo_sysid = fl->l_sysid; + } else if (flags & F_FLOCK) { + lo->lo_pid = -1; + lo->lo_sysid = 0; + } else { + struct proc *p = (struct proc *) id; + lo->lo_pid = p->p_pid; + lo->lo_sysid = 0; + } + TAILQ_INIT(&lo->lo_active); + TAILQ_INIT(&lo->lo_pending); + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + printf("lf_advlockasync: new lock owner %p ", lo); + lf_print_owner(lo); + printf("\n"); + } +#endif + + LIST_INSERT_HEAD(&lf_lock_owners[hash], + lo, lo_link); + } else { + /* + * We have seen this lock owner before, + * increase its reference count to account for + * the new lockf struct we create below. + */ + lo->lo_refs++; + } + mtx_unlock(&lf_lock_owners_mutex); + + /* * Convert the flock structure into a start and end. */ switch (fl->l_whence) { @@ -165,6 +347,7 @@ clean = NULL; if (ap->a_op == F_SETLK || ap->a_op == F_UNLCK) { MALLOC(clean, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + clean->lf_owner = 0; clean->lf_next = NULL; } /* @@ -173,7 +356,8 @@ MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); lock->lf_start = start; lock->lf_end = end; - lock->lf_id = ap->a_id; + lock->lf_owner = lo; + lock->lf_vnode = vp; /* * XXX The problem is that VTOI is ufs specific, so it will * break LOCKF_DEBUG for all other FS's other than UFS because @@ -185,7 +369,9 @@ lock->lf_head = head; lock->lf_next = (struct lockf *)0; TAILQ_INIT(&lock->lf_blkhd); + lock->lf_async_task = ap->a_task; lock->lf_flags = ap->a_flags; + /* * Do the requested operation. */ @@ -207,6 +393,12 @@ clean = lock; break; + case F_CANCEL: + error = lf_cancel(lock, &clean); + lock->lf_next = clean; + clean = lock; + break; + default: lock->lf_next = clean; clean = lock; @@ -215,6 +407,7 @@ } VI_UNLOCK(vp); for (lock = clean; lock != NULL; ) { + lf_cleanup_lock(lock); n = lock->lf_next; free(lock, M_LOCKF); lock = n; @@ -222,14 +415,26 @@ return (error); } +int +lf_advlock(struct vop_advlock_args *ap, struct lockf **head, u_quad_t size) +{ + struct vop_advlockasync_args a; + + a.a_vp = ap->a_vp; + a.a_id = ap->a_id; + a.a_op = ap->a_op; + a.a_fl = ap->a_fl; + a.a_flags = ap->a_flags; + a.a_task = NULL; + + return (lf_advlockasync(&a, head, size)); +} + /* * Set a byte-range lock. */ static int -lf_setlock(lock, vp, clean) - struct lockf *lock; - struct vnode *vp; - struct lockf **clean; +lf_setlock(struct lockf *lock, struct vnode *vp, struct lockf **clean) { struct lockf *block; struct lockf **head = lock->lf_head; @@ -256,7 +461,8 @@ /* * Free the structure and return if nonblocking. */ - if ((lock->lf_flags & F_WAIT) == 0) { + if ((lock->lf_flags & F_WAIT) == 0 + && lock->lf_async_task == NULL) { lock->lf_next = *clean; *clean = lock; return (EAGAIN); @@ -267,48 +473,57 @@ * For byte-range locks we must check for deadlock. * * Deadlock detection is done by looking through the - * wait channels to see if there are any cycles that - * involve us. MAXDEPTH is set just to make sure we - * do not go off into neverland. 
+ * lock owner pending lists to see if there are any + * cycles that involve us. MAXDEPTH is set just to + * make sure we do not go off into neverland. + * + * This algorithm is simplistic - it only considers + * the first blocking lock and it doesn't follow all + * paths through the lock graph. */ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) { - struct proc *wproc; - struct proc *nproc; - struct thread *td; struct lockf *waitblock; - int i = 0; - - /* The block is waiting on something */ - wproc = (struct proc *)block->lf_id; -restart: - nproc = NULL; - PROC_SLOCK(wproc); - FOREACH_THREAD_IN_PROC(wproc, td) { - thread_lock(td); - while (td->td_wchan && - (td->td_wmesg == lockstr) && - (i++ < maxlockdepth)) { - waitblock = (struct lockf *)td->td_wchan; - /* Get the owner of the blocking lock */ - waitblock = waitblock->lf_next; - if ((waitblock->lf_flags & F_POSIX) == 0) - break; - nproc = (struct proc *)waitblock->lf_id; - if (nproc == (struct proc *)lock->lf_id) { - PROC_SUNLOCK(wproc); - thread_unlock(td); + struct lockf *nblock; + struct lock_owner *lo; + struct lock_owner *nlo; + int i; + + lo = block->lf_owner; + i = 0; + while (lo) { + if (i++ == maxlockdepth) + break; + mtx_pool_lock(mtxpool_sleep, lo); + nlo = NULL; + TAILQ_FOREACH(waitblock, &lo->lo_pending, + lf_olock) { + /* + * Get the owner of the + * blocking lock. + * + * XXX this is unsafe - if + * waitblock is on a different + * vnode to this one, our + * vnode interlock will not + * protect us against changes + * to waitblock->lf_next. + */ + nblock = waitblock->lf_next; + if ((nblock->lf_flags & F_POSIX) == 0) + continue; + nlo = nblock->lf_owner; + if (nlo == lock->lf_owner) { + mtx_pool_unlock(mtxpool_sleep, + lo); lock->lf_next = *clean; *clean = lock; return (EDEADLK); } } - thread_unlock(td); + mtx_pool_unlock(mtxpool_sleep, lo); + lo = nlo; } - PROC_SUNLOCK(wproc); - wproc = nproc; - if (wproc) - goto restart; } /* * For flock type locks, we must first remove @@ -327,12 +542,26 @@ */ lock->lf_next = block; TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_pending, lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); #ifdef LOCKF_DEBUG if (lockf_debug & 1) { lf_print("lf_setlock: blocking on", block); lf_printlist("lf_setlock", block); } #endif /* LOCKF_DEBUG */ + + if ((lock->lf_flags & F_WAIT) == 0) { + /* + * The caller requested async notification - + * this callback happens when the blocking + * lock is released, allowing the caller to + * make another attempt to take the lock. 
+ */ + return (EINPROGRESS); + } + error = msleep(lock, VI_MTX(vp), priority, lockstr, 0); /* * We may have been awakened by a signal and/or by a @@ -344,6 +573,10 @@ */ if (lock->lf_next) { TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_REMOVE(&lock->lf_owner->lo_pending, lock, + lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); lock->lf_next = NOLOCKF; } if (error) { @@ -381,6 +614,10 @@ if (needtolink) { *prev = lock; lock->lf_next = overlap; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } break; @@ -391,7 +628,7 @@ */ if (lock->lf_type == F_RDLCK && overlap->lf_type == F_WRLCK) - lf_wakelock(overlap); + lf_wakelock(overlap, clean); overlap->lf_type = lock->lf_type; lock->lf_next = *clean; *clean = lock; @@ -412,9 +649,13 @@ *prev = lock; lock->lf_next = overlap; overlap->lf_start = lock->lf_end + 1; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } else - lf_split(overlap, lock, clean); - lf_wakelock(overlap); + lf_split(overlap, lock, clean, TRUE); + lf_wakelock(overlap, clean); break; case 3: /* lock contains overlap */ @@ -424,7 +665,7 @@ */ if (lock->lf_type == F_RDLCK && overlap->lf_type == F_WRLCK) { - lf_wakelock(overlap); + lf_wakelock(overlap, clean); } else { while (!TAILQ_EMPTY(&overlap->lf_blkhd)) { ltmp = TAILQ_FIRST(&overlap->lf_blkhd); @@ -438,13 +679,21 @@ /* * Add the new lock if necessary and delete the overlap. */ + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + KASSERT(lock->lf_owner == overlap->lf_owner, + ("unexpected lock owner for overlap")); if (needtolink) { *prev = lock; lock->lf_next = overlap->lf_next; prev = &lock->lf_next; + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); needtolink = 0; } else *prev = overlap->lf_next; + TAILQ_REMOVE(&lock->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); overlap->lf_next = *clean; *clean = overlap; continue; @@ -457,7 +706,11 @@ overlap->lf_next = lock; overlap->lf_end = lock->lf_start - 1; prev = &lock->lf_next; - lf_wakelock(overlap); + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); + lf_wakelock(overlap, clean); needtolink = 0; continue; @@ -468,9 +721,13 @@ if (needtolink) { *prev = lock; lock->lf_next = overlap; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } overlap->lf_start = lock->lf_end + 1; - lf_wakelock(overlap); + lf_wakelock(overlap, clean); break; } break; @@ -491,12 +748,10 @@ * and remove it (or shrink it), then wakeup anyone we can. */ static int -lf_clearlock(unlock, clean) - struct lockf *unlock; - struct lockf **clean; +lf_clearlock(struct lockf *unlock, struct lockf **clean) { struct lockf **head = unlock->lf_head; - register struct lockf *lf = *head; + struct lockf *lf = *head; struct lockf *overlap, **prev; int ovcase; @@ -513,7 +768,7 @@ /* * Wakeup the list of locks to be retried. 
*/ - lf_wakelock(overlap); + lf_wakelock(overlap, clean); switch (ovcase) { @@ -521,6 +776,10 @@ *prev = overlap->lf_next; overlap->lf_next = *clean; *clean = overlap; + mtx_pool_lock(mtxpool_sleep, overlap->lf_owner); + TAILQ_REMOVE(&overlap->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, overlap->lf_owner); break; case 2: /* overlap contains lock: split it */ @@ -528,8 +787,7 @@ overlap->lf_start = unlock->lf_end + 1; break; } - lf_split(overlap, unlock, clean); - overlap->lf_next = unlock->lf_next; + lf_split(overlap, unlock, clean, FALSE); break; case 3: /* lock contains overlap */ @@ -537,6 +795,10 @@ lf = overlap->lf_next; overlap->lf_next = *clean; *clean = overlap; + mtx_pool_lock(mtxpool_sleep, overlap->lf_owner); + TAILQ_REMOVE(&overlap->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, overlap->lf_owner); continue; case 4: /* overlap starts before lock */ @@ -563,11 +825,9 @@ * and if so return its process identifier. */ static int -lf_getlock(lock, fl) - register struct lockf *lock; - register struct flock *fl; +lf_getlock(struct lockf *lock, struct flock *fl) { - register struct lockf *block; + struct lockf *block; #ifdef LOCKF_DEBUG if (lockf_debug & 1) @@ -582,10 +842,8 @@ fl->l_len = 0; else fl->l_len = block->lf_end - block->lf_start + 1; - if (block->lf_flags & F_POSIX) - fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; - else - fl->l_pid = -1; + fl->l_pid = block->lf_owner->lo_pid; + fl->l_sysid = block->lf_owner->lo_sysid; } else { fl->l_type = F_UNLCK; } @@ -593,12 +851,65 @@ } /* + * Cancel an async lock request. + */ +/*static*/ int +lf_cancel(struct lockf *lock, struct lockf **clean) +{ + struct lock_owner *lo = lock->lf_owner; + struct lockf *reallock; + + /* + * We need to match this request with an existing lock + * request. We need to take the pool mutex to protect the + * lock owner's lists. + */ + mtx_pool_lock(mtxpool_sleep, lo); + + TAILQ_FOREACH(reallock, &lo->lo_pending, lf_olock) { + if (reallock->lf_vnode == lock->lf_vnode + && reallock->lf_start == lock->lf_start + && reallock->lf_end == lock->lf_end) { + /* + * Make sure this lock was async and then just + * remove it from its wait lists. + */ + if (!reallock->lf_async_task) { + mtx_pool_unlock(mtxpool_sleep, lo); + return (ENOENT); + } + + /* + * Note that since any other thread must take + * the vnode interlock before it can possibly + * trigger the async callback, we are safe + * from a race with lf_wakelock, i.e. we + * can free the lock (actually our caller does + * this). + */ + TAILQ_REMOVE(&reallock->lf_next->lf_blkhd, reallock, lf_block); + TAILQ_REMOVE(&lo->lo_pending, reallock, lf_olock); + reallock->lf_next = *clean; + *clean = reallock; + mtx_pool_unlock(mtxpool_sleep, lo); + return (0); + } + } + + mtx_pool_unlock(mtxpool_sleep, lo); + + /* + * We didn't find a matching lock - not much we can do here. + */ + return (ENOENT); +} + +/* * Walk the list of locks for an inode and * return the first blocking lock. */ static struct lockf * -lf_getblock(lock) - register struct lockf *lock; +lf_getblock(struct lockf *lock) { struct lockf **prev, *overlap, *lf = *(lock->lf_head); int ovcase; @@ -627,12 +938,8 @@ * may be more than one. 
*/ static int -lf_findoverlap(lf, lock, type, prev, overlap) - register struct lockf *lf; - struct lockf *lock; - int type; - struct lockf ***prev; - struct lockf **overlap; +lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, + struct lockf ***prev, struct lockf **overlap) { off_t start, end; @@ -646,8 +953,8 @@ start = lock->lf_start; end = lock->lf_end; while (lf != NOLOCKF) { - if (((type & SELF) && lf->lf_id != lock->lf_id) || - ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + if (((type & SELF) && lf->lf_owner != lock->lf_owner) || + ((type & OTHERS) && lf->lf_owner == lock->lf_owner)) { *prev = &lf->lf_next; *overlap = lf = lf->lf_next; continue; @@ -733,14 +1040,13 @@ } /* - * Split a lock and a contained region into - * two or three locks as necessary. + * Split a lock and a contained region into two or three locks as + * necessary. If addlock is TRUE, lock2 is being set so it must be + * added to the list, otherwise it is being cleared. */ static void -lf_split(lock1, lock2, split) - struct lockf *lock1; - struct lockf *lock2; - struct lockf **split; +lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **split, + int addlock) { struct lockf *splitlock; @@ -755,13 +1061,16 @@ */ if (lock1->lf_start == lock2->lf_start) { lock1->lf_start = lock2->lf_end + 1; - lock2->lf_next = lock1; + if (addlock) + lock2->lf_next = lock1; return; } if (lock1->lf_end == lock2->lf_end) { lock1->lf_end = lock2->lf_start - 1; - lock2->lf_next = lock1->lf_next; - lock1->lf_next = lock2; + if (addlock) { + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + } return; } /* @@ -773,6 +1082,15 @@ KASSERT(splitlock != NULL, ("no split")); *split = splitlock->lf_next; bcopy(lock1, splitlock, sizeof *splitlock); + + /* + * Update the lock owner reference count to account for the + * new lock. + */ + mtx_lock(&lf_lock_owners_mutex); + splitlock->lf_owner->lo_refs++; + mtx_unlock(&lf_lock_owners_mutex); + splitlock->lf_start = lock2->lf_end + 1; TAILQ_INIT(&splitlock->lf_blkhd); lock1->lf_end = lock2->lf_start - 1; @@ -780,46 +1098,144 @@ * OK, now link it in */ splitlock->lf_next = lock1->lf_next; - lock2->lf_next = splitlock; - lock1->lf_next = lock2; + mtx_pool_lock(mtxpool_sleep, lock1->lf_owner); + TAILQ_INSERT_TAIL(&lock1->lf_owner->lo_active, splitlock, lf_olock); + if (addlock) { + KASSERT(lock1->lf_owner == lock2->lf_owner, + ("unexpected lock owner for split")); + TAILQ_INSERT_TAIL(&lock1->lf_owner->lo_active, lock2, + lf_olock); + lock2->lf_next = splitlock; + lock1->lf_next = lock2; + } else { + lock1->lf_next = splitlock; + } + mtx_pool_unlock(mtxpool_sleep, lock1->lf_owner); } /* * Wakeup a blocklist */ static void -lf_wakelock(listhead) - struct lockf *listhead; +lf_wakelock(struct lockf *listhead, struct lockf **clean) { - register struct lockf *wakelock; + struct lockf *wakelock; while (!TAILQ_EMPTY(&listhead->lf_blkhd)) { wakelock = TAILQ_FIRST(&listhead->lf_blkhd); TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); wakelock->lf_next = NOLOCKF; + mtx_pool_lock(mtxpool_sleep, wakelock->lf_owner); + TAILQ_REMOVE(&wakelock->lf_owner->lo_pending, wakelock, + lf_olock); + mtx_pool_unlock(mtxpool_sleep, wakelock->lf_owner); #ifdef LOCKF_DEBUG if (lockf_debug & 2) lf_print("lf_wakelock: awakening", wakelock); #endif /* LOCKF_DEBUG */ - wakeup(wakelock); + if (wakelock->lf_async_task) { + /* + * Perform async notification to allow a + * caller to re-attempt the lock. 
+ */ + taskqueue_enqueue(taskqueue_thread, + wakelock->lf_async_task); + wakelock->lf_next = *clean; + *clean = wakelock; + } else { + wakeup(wakelock); + } + } +} + +struct clearlock { + STAILQ_ENTRY(clearlock) link; + struct vnode *vp; + struct flock fl; +}; +STAILQ_HEAD(clearlocklist, clearlock); + +static void +lf_clearremotesys(int sysid) +{ + int i; + struct lock_owner *lo; + struct lockf *lf; + struct clearlock *cl; + struct clearlocklist locks; + + KASSERT(sysid != 0, ("Can't clear local locks with F_UNLCKSYS")); + + /* + * In order to keep the locking simple, we iterate over the + * active lock lists to build a list of locks that need + * releasing. We then call VOP_ADVLOCK for each one in turn. + */ + STAILQ_INIT(&locks); + mtx_lock(&lf_lock_owners_mutex); + for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) { + LIST_FOREACH(lo, &lf_lock_owners[i], lo_link) { + if (lo->lo_sysid != sysid) + continue; + + mtx_pool_lock(mtxpool_sleep, lo); + TAILQ_FOREACH(lf, &lo->lo_active, lf_olock) { + cl = malloc(sizeof(struct clearlock), + M_LOCKF, M_NOWAIT); + if (!cl) + continue; + cl->vp = lf->lf_vnode; + cl->fl.l_start = lf->lf_start; + if (lf->lf_end == -1) + cl->fl.l_len = 0; + else + cl->fl.l_len = + lf->lf_end - lf->lf_start + 1; + cl->fl.l_whence = SEEK_SET; + cl->fl.l_type = F_UNLCK; + cl->fl.l_pid = lo->lo_pid; + cl->fl.l_sysid = sysid; + STAILQ_INSERT_TAIL(&locks, cl, link); + } + mtx_pool_unlock(mtxpool_sleep, lo); + } + } + mtx_unlock(&lf_lock_owners_mutex); + + while ((cl = STAILQ_FIRST(&locks)) != NULL) { + STAILQ_REMOVE_HEAD(&locks, link); + VOP_ADVLOCK(cl->vp, 0, F_UNLCK, &cl->fl, F_REMOTE); + free(cl, M_LOCKF); + } } #ifdef LOCKF_DEBUG /* + * Print description of a lock owner + */ +static void +lf_print_owner(struct lock_owner *lo) +{ + + if (lo->lo_flags & F_REMOTE) { + printf("remote pid %d, system %d", + lo->lo_pid, lo->lo_sysid); + } else if (lo->lo_flags & F_FLOCK) { + printf("file %p", lo->lo_id); + } else { + printf("local pid %d", lo->lo_pid); + } +} + +/* * Print out a lock. */ static void -lf_print(tag, lock) - char *tag; - register struct lockf *lock; +lf_print(char *tag, struct lockf *lock) { printf("%s: lock %p for ", tag, (void *)lock); - if (lock->lf_flags & F_POSIX) - printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid); - else - printf("id %p", (void *)lock->lf_id); + lf_print_owner(lock->lf_owner); if (lock->lf_inode != (struct inode *)0) printf(" in ino %ju on dev <%s>, %s, start %jd, end %jd", (uintmax_t)lock->lf_inode->i_number, @@ -841,11 +1257,9 @@ } static void -lf_printlist(tag, lock) - char *tag; - struct lockf *lock; +lf_printlist(char *tag, struct lockf *lock) { - register struct lockf *lf, *blk; + struct lockf *lf, *blk; if (lock->lf_inode == (struct inode *)0) return; @@ -855,11 +1269,7 @@ devtoname(lock->lf_inode->i_dev)); for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { printf("\tlock %p for ",(void *)lf); - if (lf->lf_flags & F_POSIX) - printf("proc %ld", - (long)((struct proc *)lf->lf_id)->p_pid); - else - printf("id %p", (void *)lf->lf_id); + lf_print_owner(lf->lf_owner); printf(", %s, start %jd, end %jd", lf->lf_type == F_RDLCK ? "shared" : lf->lf_type == F_WRLCK ? 
"exclusive" : @@ -867,11 +1277,7 @@ "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end); TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { printf("\n\t\tlock request %p for ", (void *)blk); - if (blk->lf_flags & F_POSIX) - printf("proc %ld", - (long)((struct proc *)blk->lf_id)->p_pid); - else - printf("id %p", (void *)blk->lf_id); + lf_print_owner(blk->lf_owner); printf(", %s, start %jd, end %jd", blk->lf_type == F_RDLCK ? "shared" : blk->lf_type == F_WRLCK ? "exclusive" : diff -urN /Projects/clean/src/sys/kern/vnode_if.src /Projects/M4/src/sys/kern/vnode_if.src --- /Projects/clean/src/sys/kern/vnode_if.src 2008-01-19 15:44:20.000000000 +0000 +++ /Projects/M4/src/sys/kern/vnode_if.src 2008-02-12 09:57:05.000000000 +0000 @@ -438,6 +438,18 @@ }; +%% advlockasync vp U U U + +vop_advlockasync { + IN struct vnode *vp; + IN void *id; + IN int op; + IN struct flock *fl; + IN int flags; + IN struct task *task; +}; + + %% reallocblks vp E E E vop_reallocblks { diff -urN /Projects/clean/src/sys/nfs4client/nfs4_vnops.c /Projects/M4/src/sys/nfs4client/nfs4_vnops.c --- /Projects/clean/src/sys/nfs4client/nfs4_vnops.c 2008-01-19 15:44:36.000000000 +0000 +++ /Projects/M4/src/sys/nfs4client/nfs4_vnops.c 2008-02-12 09:57:11.000000000 +0000 @@ -157,6 +157,7 @@ static vop_readlink_t nfs4_readlink; static vop_print_t nfs4_print; static vop_advlock_t nfs4_advlock; +static vop_advlockasync_t nfs4_advlockasync; /* * Global vfs data structures for nfs @@ -165,6 +166,7 @@ .vop_default = &default_vnodeops, .vop_access = nfs4_access, .vop_advlock = nfs4_advlock, + .vop_advlockasync = nfs4_advlockasync, .vop_close = nfs4_close, .vop_create = nfs4_create, .vop_fsync = nfs4_fsync, @@ -2777,6 +2779,22 @@ } /* + * NFS advisory byte-level locks. + */ +static int +nfs4_advlockasync(struct vop_advlockasync_args *ap) +{ + return (EPERM); + + if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) { + struct nfsnode *np = VTONFS(ap->a_vp); + + return (lf_advlockasync(ap, &(np->n_lockf), np->n_size)); + } + return (EOPNOTSUPP); +} + +/* * Print out the contents of an nfsnode. */ static int diff -urN /Projects/clean/src/sys/nfsclient/nfs_vnops.c /Projects/M4/src/sys/nfsclient/nfs_vnops.c --- /Projects/clean/src/sys/nfsclient/nfs_vnops.c 2008-01-19 15:44:36.000000000 +0000 +++ /Projects/M4/src/sys/nfsclient/nfs_vnops.c 2008-02-12 09:57:11.000000000 +0000 @@ -129,6 +129,7 @@ static vop_readlink_t nfs_readlink; static vop_print_t nfs_print; static vop_advlock_t nfs_advlock; +static vop_advlockasync_t nfs_advlockasync; /* * Global vfs data structures for nfs @@ -137,6 +138,7 @@ .vop_default = &default_vnodeops, .vop_access = nfs_access, .vop_advlock = nfs_advlock, + .vop_advlockasync = nfs_advlockasync, .vop_close = nfs_close, .vop_create = nfs_create, .vop_fsync = nfs_fsync, @@ -3057,6 +3059,27 @@ } /* + * NFS advisory byte-level locks. + */ +static int +nfs_advlockasync(struct vop_advlockasync_args *ap) +{ + int error; + + mtx_lock(&Giant); + if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) { + struct nfsnode *np = VTONFS(ap->a_vp); + + error = lf_advlockasync(ap, &(np->n_lockf), np->n_size); + goto out; + } + error = EOPNOTSUPP; +out: + mtx_unlock(&Giant); + return (error); +} + +/* * Print out the contents of an nfsnode. 
*/ static int diff -urN /Projects/clean/src/sys/sys/fcntl.h /Projects/M4/src/sys/sys/fcntl.h --- /Projects/clean/src/sys/sys/fcntl.h 2008-01-19 15:54:44.000000000 +0000 +++ /Projects/M4/src/sys/sys/fcntl.h 2008-02-12 09:57:13.000000000 +0000 @@ -178,9 +178,13 @@ #define F_GETOWN 5 /* get SIGIO/SIGURG proc/pgrp */ #define F_SETOWN 6 /* set SIGIO/SIGURG proc/pgrp */ #endif -#define F_GETLK 7 /* get record locking information */ -#define F_SETLK 8 /* set record locking information */ -#define F_SETLKW 9 /* F_SETLK; wait if blocked */ +#define F_OGETLK 7 /* get record locking information */ +#define F_OSETLK 8 /* set record locking information */ +#define F_OSETLKW 9 /* F_SETLK; wait if blocked */ +#define F_GETLK 10 /* get record locking information */ +#define F_SETLK 11 /* set record locking information */ +#define F_SETLKW 12 /* F_SETLK; wait if blocked */ +#define F_SETLK_REMOTE 13 /* debugging support for remote locks */ /* file descriptor flags (F_GETFD, F_SETFD) */ #define FD_CLOEXEC 1 /* close-on-exec flag */ @@ -189,10 +193,13 @@ #define F_RDLCK 1 /* shared or read lock */ #define F_UNLCK 2 /* unlock */ #define F_WRLCK 3 /* exclusive or write lock */ +#define F_UNLCKSYS 4 /* purge locks for a given system ID */ +#define F_CANCEL 5 /* cancel an async lock request */ #ifdef _KERNEL #define F_WAIT 0x010 /* Wait until lock is granted */ #define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ #define F_POSIX 0x040 /* Use POSIX semantics for lock */ +#define F_REMOTE 0x080 /* Lock owner is remote NFS client */ #endif /* @@ -205,6 +212,19 @@ pid_t l_pid; /* lock owner */ short l_type; /* lock type: read/write, etc. */ short l_whence; /* type of l_start */ + int l_sysid; /* remote system id or zero for local */ +}; + +/* + * Old advisory file segment locking data type, + * before adding l_sysid. + */ +struct oflock { + off_t l_start; /* starting offset */ + off_t l_len; /* len = 0 means until end of file */ + pid_t l_pid; /* lock owner */ + short l_type; /* lock type: read/write, etc. */ + short l_whence; /* type of l_start */ }; diff -urN /Projects/clean/src/sys/sys/lockf.h /Projects/M4/src/sys/sys/lockf.h --- /Projects/clean/src/sys/sys/lockf.h 2008-01-19 15:44:41.000000000 +0000 +++ /Projects/M4/src/sys/sys/lockf.h 2008-02-12 09:57:14.000000000 +0000 @@ -39,6 +39,7 @@ #include struct vop_advlock_args; +struct vop_advlockasync_args; /* * The lockf structure is a kernel structure which contains the information @@ -53,17 +54,21 @@ short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ off_t lf_start; /* Byte # of the start of the lock */ off_t lf_end; /* Byte # of the end of the lock (-1=EOF) */ - caddr_t lf_id; /* Id of the resource holding the lock */ + struct lock_owner *lf_owner; /* Owner of the lock */ + struct vnode *lf_vnode; /* File being locked (only valid for active lock) */ struct lockf **lf_head; /* Back pointer to the head of the lockf list */ struct inode *lf_inode; /* Back pointer to the inode */ struct lockf *lf_next; /* Pointer to the next lock on this inode */ struct locklist lf_blkhd; /* List of requests blocked on this lock */ + struct task *lf_async_task;/* Async lock callback */ TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */ + TAILQ_ENTRY(lockf) lf_olock;/* Linkage for owner lock lists */ }; /* Maximum length of sleep chains to traverse to try and detect deadlock. 
*/ #define MAXDEPTH 50 int lf_advlock(struct vop_advlock_args *, struct lockf **, u_quad_t); +int lf_advlockasync(struct vop_advlockasync_args *, struct lockf **, u_quad_t); #endif /* !_SYS_LOCKF_H_ */ diff -urN /Projects/clean/src/sys/ufs/ufs/ufs_vnops.c /Projects/M4/src/sys/ufs/ufs/ufs_vnops.c --- /Projects/clean/src/sys/ufs/ufs/ufs_vnops.c 2008-01-19 15:44:30.000000000 +0000 +++ /Projects/M4/src/sys/ufs/ufs/ufs_vnops.c 2008-02-12 09:57:09.000000000 +0000 @@ -92,6 +92,7 @@ static vop_access_t ufs_access; static vop_advlock_t ufs_advlock; +static vop_advlockasync_t ufs_advlockasync; static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ufs_close; @@ -2182,6 +2183,25 @@ } /* + * Advisory record locking support + */ +static int +ufs_advlockasync(ap) + struct vop_advlockasync_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + struct task *a_task; + } */ *ap; +{ + struct inode *ip = VTOI(ap->a_vp); + + return (lf_advlockasync(ap, &(ip->i_lockf), ip->i_size)); +} + +/* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ @@ -2449,6 +2469,7 @@ .vop_write = VOP_PANIC, .vop_access = ufs_access, .vop_advlock = ufs_advlock, + .vop_advlockasync = ufs_advlockasync, .vop_bmap = ufs_bmap, .vop_cachedlookup = ufs_lookup, .vop_close = ufs_close, diff -urN /Projects/clean/src/tools/regression/file/flock/Makefile /Projects/M4/src/tools/regression/file/flock/Makefile --- /Projects/clean/src/tools/regression/file/flock/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ /Projects/M4/src/tools/regression/file/flock/Makefile 2008-02-12 09:57:19.000000000 +0000 @@ -0,0 +1,8 @@ +# $FreeBSD$ + +PROG= flock +NO_MAN= +WARNS?= 6 +DEBUG_FLAGS= -g -O0 + +.include <bsd.prog.mk> diff -urN /Projects/clean/src/tools/regression/file/flock/flock.c /Projects/M4/src/tools/regression/file/flock/flock.c --- /Projects/clean/src/tools/regression/file/flock/flock.c 1970-01-01 01:00:00.000000000 +0100 +++ /Projects/M4/src/tools/regression/file/flock/flock.c 2008-02-12 09:57:19.000000000 +0000 @@ -0,0 +1,1098 @@ +/*- + * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ + * Authors: Doug Rabson <dfr@rabson.org> + * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/types.h> +#include <sys/wait.h> + +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#ifdef __FreeBSD__ +#define HAVE_SYSID +#include <sys/cdefs.h> +#else +#define __unused +#endif + +static int +make_file(const char *dir, off_t sz) +{ + const char *template = "/flocktempXXXXXX"; + size_t len; + char *filename; + int fd; + + len = strlen(dir) + strlen(template) + 1; + filename = malloc(len); + strcpy(filename, dir); + strcat(filename, template); + fd = mkstemp(filename); + if (fd < 0) + err(1, "mkstemp"); + if (ftruncate(fd, sz) < 0) + err(1, "ftruncate"); + if (unlink(filename) < 0) + err(1, "unlink"); + free(filename); + + return (fd); +} + +static void +ignore_alarm(int __unused sig) +{ +} + +#define FAIL(test) \ + do { \ + if (test) { \ + printf("FAIL (%s)\n", #test); \ + return -1; \ + } \ + } while (0) + +#define SUCCEED \ + do { printf("SUCCEED\n"); return 0; } while (0) + +/* + * Test 1 - F_GETLK on unlocked region + * + * If no lock is found that would prevent this lock from being + * created, the structure is left unchanged by this function call + * except for the lock type which is set to F_UNLCK. + */ +static int +test1(int fd) +{ + struct flock fl1, fl2; + + memset(&fl1, 1, sizeof(fl1)); + fl1.l_type = F_WRLCK; + fl1.l_whence = SEEK_SET; + fl2 = fl1; + + if (fcntl(fd, F_GETLK, &fl1) < 0) + err(1, "F_GETLK"); + + printf("1 - F_GETLK on unlocked region: "); + FAIL(fl1.l_start != fl2.l_start); + FAIL(fl1.l_len != fl2.l_len); + FAIL(fl1.l_pid != fl2.l_pid); + FAIL(fl1.l_type != F_UNLCK); + FAIL(fl1.l_whence != fl2.l_whence); +#ifdef HAVE_SYSID + FAIL(fl1.l_sysid != fl2.l_sysid); +#endif + + SUCCEED; +} + +/* + * Test 2 - F_SETLK on locked region + * + * If a shared or exclusive lock cannot be set, fcntl returns + * immediately with EACCES or EAGAIN. + */ +static int +test2(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when it's done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should return -1 with errno set to either EACCES or + * EAGAIN. 
+ */ + printf("2 - F_SETLK on locked region: "); + res = fcntl(fd, F_SETLK, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + SUCCEED; +} + +/* + * Test 3 - F_SETLKW on locked region + * + * If a shared or exclusive lock is blocked by other locks, the + * process waits until the request can be satisfied. + * + * XXX this test hangs on FreeBSD NFS filesystems due to limitations + * in FreeBSD's client (and server) lockd implementation. + */ +static int +test3(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("3 - F_SETLKW on locked region: "); + + alarm(1); + + res = fcntl(fd, F_SETLKW, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + FAIL(res == 0); + FAIL(errno != EINTR); + + SUCCEED; +} + +/* + * Test 4 - F_GETLK on locked region + * + * Get the first lock that blocks the lock. + */ +static int +test4(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 99; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should return a lock structure reflecting the lock we + * made in the child process. + */ + if (fcntl(fd, F_GETLK, &fl) < 0) + err(1, "F_GETLK"); + + printf("4 - F_GETLK on locked region: "); + FAIL(fl.l_start != 0); + FAIL(fl.l_len != 99); + FAIL(fl.l_type != F_WRLCK); + FAIL(fl.l_pid != pid); +#ifdef HAVE_SYSID + FAIL(fl.l_sysid != 0); +#endif + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + SUCCEED; +} + +/* + * Test 5 - F_SETLKW simple deadlock + * + * If a blocking shared lock request would cause a deadlock (i.e. 
the + * lock request is blocked by a process which is itself blocked on a + * lock currently owned by the process making the new request), + * EDEADLK is returned. + */ +static int +test5(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. Because our test relies on the child process being + * blocked on the parent's lock, we can't easily use a pipe to + * synchronize so we just sleep in the parent to give the + * child a chance to set up. + * + * To create the deadlock condition, we arrange for the parent + * to lock the first byte of the file and the child to lock + * the second byte. After locking the second byte, the child + * will attempt to lock the first byte of the file, and + * block. The parent will then attempt to lock the second byte + * (owned by the child) which should cause deadlock. + */ + int pid; + struct flock fl; + int res; + + /* + * Lock the first byte in the parent. + */ + fl.l_start = 0; + fl.l_len = 1; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK 1 (parent)"); + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * Lock the second byte in the child and then block on + * the parent's lock. + */ + fl.l_start = 1; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + fl.l_start = 0; + if (fcntl(fd, F_SETLKW, &fl) < 0) + err(1, "F_SETLKW (child)"); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + sleep(1); + + /* + * fcntl should immediately return -1 with errno set to EDEADLK. + */ + printf("5 - F_SETLKW simple deadlock: "); + + fl.l_start = 1; + res = fcntl(fd, F_SETLKW, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + + FAIL(res == 0); + FAIL(errno != EDEADLK); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_UNLCK"); + + SUCCEED; +} + +/* + * Test 6 - F_SETLKW complex deadlock. + * + * This test involves three processes, P, C1 and C2. We set things up so + * that P locks byte zero, C1 locks byte 1 and C2 locks byte 2. We + * also block C2 by attempting to lock byte zero. Lastly, P attempts + * to lock a range including bytes 1 and 2. This represents a deadlock + * (due to C2's blocking attempt to lock byte zero). + */ +static int +test6(int fd) +{ + /* + * Because our test relies on the child process being blocked + * on the parent's lock, we can't easily use a pipe to + * synchronize so we just sleep in the parent to give the + * children a chance to set up. + */ + int pid1, pid2; + struct flock fl; + int res; + + /* + * Lock the first byte in the parent. + */ + fl.l_start = 0; + fl.l_len = 1; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK 1 (parent)"); + + pid1 = fork(); + if (pid1 < 0) + err(1, "fork"); + + if (pid1 == 0) { + /* + * C1 + * Lock the second byte in the child and then sleep + */ + fl.l_start = 1; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child1)"); + pause(); + exit(0); + } + + pid2 = fork(); + if (pid2 < 0) + err(1, "fork"); + + if (pid2 == 0) { + /* + * C2 + * Lock the third byte in the child and then block on + * the parent's lock. + */ + fl.l_start = 2; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child2)"); + fl.l_start = 0; + if (fcntl(fd, F_SETLKW, &fl) < 0) + err(1, "F_SETLKW (child2)"); + exit(0); + } + + /* + * Wait until the children have set their locks and then + * perform the test. 
+/*
+ * Test 6 - F_SETLKW complex deadlock.
+ *
+ * This test involves three processes, P, C1 and C2. We set things up
+ * so that P locks byte zero, C1 locks byte 1 and C2 locks byte 2. We
+ * also block C2 by attempting to lock byte zero. Lastly, P attempts
+ * to lock a range including bytes 1 and 2. This represents a deadlock
+ * (due to C2's blocking attempt to lock byte zero).
+ */
+static int
+test6(int fd)
+{
+	/*
+	 * Because our test relies on the child processes being blocked
+	 * on the parent's lock, we can't easily use a pipe to
+	 * synchronize, so we just sleep in the parent to give the
+	 * children a chance to set up.
+	 */
+	int pid1, pid2;
+	struct flock fl;
+	int res;
+
+	/*
+	 * Lock the first byte in the parent.
+	 */
+	fl.l_start = 0;
+	fl.l_len = 1;
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+	if (fcntl(fd, F_SETLK, &fl) < 0)
+		err(1, "F_SETLK 1 (parent)");
+
+	pid1 = fork();
+	if (pid1 < 0)
+		err(1, "fork");
+
+	if (pid1 == 0) {
+		/*
+		 * C1
+		 * Lock the second byte in the child and then sleep.
+		 */
+		fl.l_start = 1;
+		if (fcntl(fd, F_SETLK, &fl) < 0)
+			err(1, "F_SETLK (child1)");
+		pause();
+		exit(0);
+	}
+
+	pid2 = fork();
+	if (pid2 < 0)
+		err(1, "fork");
+
+	if (pid2 == 0) {
+		/*
+		 * C2
+		 * Lock the third byte in the child and then block on
+		 * the parent's lock.
+		 */
+		fl.l_start = 2;
+		if (fcntl(fd, F_SETLK, &fl) < 0)
+			err(1, "F_SETLK (child2)");
+		fl.l_start = 0;
+		if (fcntl(fd, F_SETLKW, &fl) < 0)
+			err(1, "F_SETLKW (child2)");
+		exit(0);
+	}
+
+	/*
+	 * Wait until the children have set their locks and then
+	 * perform the test.
+	 */
+	sleep(1);
+
+	/*
+	 * fcntl should immediately return -1 with errno set to
+	 * EDEADLK. If the alarm fires, we failed to detect the
+	 * deadlock.
+	 */
+	alarm(1);
+	printf("6 - F_SETLKW complex deadlock: ");
+
+	fl.l_start = 1;
+	fl.l_len = 2;
+	res = fcntl(fd, F_SETLKW, &fl);
+	kill(pid1, SIGTERM);
+	if (waitpid(pid1, 0, 0) != pid1)
+		err(1, "waitpid");
+	kill(pid2, SIGTERM);
+	if (waitpid(pid2, 0, 0) != pid2)
+		err(1, "waitpid");
+
+	fl.l_start = 0;
+	fl.l_len = 0;
+	fl.l_type = F_UNLCK;
+	if (fcntl(fd, F_SETLK, &fl) < 0)
+		err(1, "F_UNLCK");
+
+	FAIL(res == 0);
+	FAIL(errno != EDEADLK);
+
+	/*
+	 * Cancel the alarm to avoid confusing later tests.
+	 */
+	alarm(0);
+
+	SUCCEED;
+}
+
+/*
+ * Test 7 - F_SETLK shared lock on exclusive locked region
+ *
+ * If a shared or exclusive lock cannot be set, fcntl returns
+ * immediately with EACCES or EAGAIN.
+ */
+static int
+test7(int fd)
+{
+	/*
+	 * We create a child process to hold the lock which we will
+	 * test. We use a pipe to communicate with the child.
+	 */
+	int pid;
+	int pfd[2];
+	struct flock fl;
+	char ch;
+	int res;
+
+	if (pipe(pfd) < 0)
+		err(1, "pipe");
+
+	fl.l_start = 0;
+	fl.l_len = 0;
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+
+	pid = fork();
+	if (pid < 0)
+		err(1, "fork");
+
+	if (pid == 0) {
+		/*
+		 * We are the child. We set a write lock and then
+		 * write one byte back to the parent to tell it. The
+		 * parent will kill us when it's done.
+		 */
+		if (fcntl(fd, F_SETLK, &fl) < 0)
+			err(1, "F_SETLK (child)");
+		if (write(pfd[1], "a", 1) < 0)
+			err(1, "writing to pipe (child)");
+		pause();
+		exit(0);
+	}
+
+	/*
+	 * Wait until the child has set its lock and then perform the
+	 * test.
+	 */
+	if (read(pfd[0], &ch, 1) != 1)
+		err(1, "reading from pipe (parent)");
+
+	/*
+	 * fcntl should return -1 immediately with errno set to
+	 * EACCES or EAGAIN, since the child holds an exclusive lock
+	 * on the region.
+	 */
+	printf("7 - F_SETLK shared lock on exclusive locked region: ");
+
+	fl.l_type = F_RDLCK;
+	res = fcntl(fd, F_SETLK, &fl);
+	kill(pid, SIGTERM);
+	if (waitpid(pid, 0, 0) != pid)
+		err(1, "waitpid");
+	close(pfd[0]);
+	close(pfd[1]);
+
+	FAIL(res == 0);
+	FAIL(errno != EACCES && errno != EAGAIN);
+
+	SUCCEED;
+}
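Tests 2 and 7 through 9 between them cover the full compatibility matrix for POSIX record locks held by different processes. The summary below is editorial, added for orientation, and the one-line helper is merely a restatement of the table, not part of the patch.

#include <fcntl.h>

/*
 * Compatibility of a new lock request against an overlapping lock
 * held by another process:
 *
 *                      held F_RDLCK       held F_WRLCK
 *   request F_RDLCK    granted (test 8)   denied (test 7)
 *   request F_WRLCK    denied  (test 9)   denied (test 2)
 *
 * Only two shared (read) locks are compatible.
 */
static int
compatible(short held, short requested)
{
	return (held == F_RDLCK && requested == F_RDLCK);
}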
+ */ + printf("8 - F_SETLK shared lock on share locked region: "); + + fl.l_type = F_RDLCK; + res = fcntl(fd, F_SETLK, &fl); + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_UNLCK"); + + FAIL(res != 0); + + SUCCEED; +} + +/* + * Test 9 - F_SETLK exclusive lock on share locked region + * + * If a shared or exclusive lock cannot be set, fcntl returns + * immediately with EACCES or EAGAIN. + */ +static int +test9(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("9 - F_SETLK exclusive lock on share locked region: "); + + fl.l_type = F_WRLCK; + res = fcntl(fd, F_SETLK, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + SUCCEED; +} + +/* + * Test 10 - trying to set bogus pid or sysid values + * + * The l_pid and l_sysid fields are only used with F_GETLK to return + * the process ID of the process holding a blocking lock and the + * system ID of the system that owns that process + */ +static int +test10(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_pid = 9999; + fl.l_sysid = 9999; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + printf("10 - trying to set bogus pid or sysid values: "); + + if (fcntl(fd, F_GETLK, &fl) < 0) + err(1, "F_GETLK"); + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + FAIL(fl.l_pid != pid); +#ifdef HAVE_SYSID + FAIL(fl.l_sysid != 0); +#endif + + SUCCEED; +} + +/* + * Test 11 - remote locks + * + * XXX temporary interface which will be removed when the kernel lockd + * is added. 
+/*
+ * Test 11 - remote locks
+ *
+ * XXX temporary interface which will be removed when the kernel lockd
+ * is added.
+ */
+static int
+test11(int fd)
+{
+	struct flock fl;
+	int res;
+
+	fl.l_start = 0;
+	fl.l_len = 0;
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+	fl.l_pid = 9999;
+	fl.l_sysid = 1;
+
+	printf("11 - remote locks: ");
+
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	fl.l_sysid = 2;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res == 0);
+	FAIL(errno != EACCES && errno != EAGAIN);
+
+	res = fcntl(fd, F_GETLK, &fl);
+	FAIL(res != 0);
+	FAIL(fl.l_pid != 9999);
+	FAIL(fl.l_sysid != 1);
+
+	fl.l_type = F_UNLCK;
+	fl.l_sysid = 1;
+	fl.l_start = 0;
+	fl.l_len = 0;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	fl.l_pid = 1234;
+	fl.l_sysid = 1;
+	fl.l_start = 0;
+	fl.l_len = 1;
+	fl.l_whence = SEEK_SET;
+	fl.l_type = F_RDLCK;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	fl.l_sysid = 2;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	fl.l_type = F_UNLCKSYS;
+	fl.l_sysid = 1;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	fl.l_type = F_WRLCK;
+	res = fcntl(fd, F_GETLK, &fl);
+	FAIL(res != 0);
+	FAIL(fl.l_pid != 1234);
+	FAIL(fl.l_sysid != 2);
+
+	fl.l_type = F_UNLCKSYS;
+	fl.l_sysid = 2;
+	res = fcntl(fd, F_SETLK_REMOTE, &fl);
+	FAIL(res != 0);
+
+	SUCCEED;
+}
+
+/*
+ * Test 12 - F_SETLKW on locked region which is then unlocked
+ *
+ * If a shared or exclusive lock is blocked by other locks, the
+ * process waits until the request can be satisfied.
+ */
+static int
+test12(int fd)
+{
+	/*
+	 * We create a child process to hold the lock which we will
+	 * test. We use a pipe to communicate with the child.
+	 */
+	int pid;
+	int pfd[2];
+	struct flock fl;
+	char ch;
+	int res;
+
+	if (pipe(pfd) < 0)
+		err(1, "pipe");
+
+	fl.l_start = 0;
+	fl.l_len = 0;
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+
+	pid = fork();
+	if (pid < 0)
+		err(1, "fork");
+
+	if (pid == 0) {
+		/*
+		 * We are the child. We set a write lock and then
+		 * write one byte back to the parent to tell it. We
+		 * hold the lock for a second and then exit, which
+		 * releases it.
+		 */
+		if (fcntl(fd, F_SETLK, &fl) < 0)
+			err(1, "F_SETLK (child)");
+		if (write(pfd[1], "a", 1) < 0)
+			err(1, "writing to pipe (child)");
+
+		sleep(1);
+		exit(0);
+	}
+
+	/*
+	 * Wait until the child has set its lock and then perform the
+	 * test.
+	 */
+	if (read(pfd[0], &ch, 1) != 1)
+		err(1, "reading from pipe (parent)");
+
+	/*
+	 * The child exits after a second, releasing its lock; fcntl
+	 * should block until then and succeed.
+	 */
+	printf("12 - F_SETLKW on locked region which is then unlocked: ");
+
+	res = fcntl(fd, F_SETLKW, &fl);
+	kill(pid, SIGTERM);
+	if (waitpid(pid, 0, 0) != pid)
+		err(1, "waitpid");
+	close(pfd[0]);
+	close(pfd[1]);
+	FAIL(res != 0);
+
+	fl.l_start = 0;
+	fl.l_len = 0;
+	fl.l_type = F_UNLCK;
+	if (fcntl(fd, F_SETLK, &fl) < 0)
+		err(1, "F_UNLCK");
+
+	SUCCEED;
+}
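Test 12 passes because a process's record locks are released when it exits. The same ownership rule has a well-known pitfall worth noting here: closing any descriptor for a file releases all of the process's locks on it, not just those taken through that descriptor. A sketch of the trap; the path name is invented and the snippet is not part of the patch.

#include <fcntl.h>
#include <unistd.h>

/*
 * Pitfall sketch: the open/close of fd2 silently releases the lock
 * taken through fd, because POSIX record locks belong to the
 * process, not to the file descriptor.
 */
static void
lock_lost_on_close(void)
{
	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
	int fd, fd2;

	fd = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		return;
	fcntl(fd, F_SETLK, &fl);	/* l_start/l_len zero: whole file */
	fd2 = open("/tmp/lockdemo", O_RDONLY);
	close(fd2);			/* the lock taken via fd is gone */
	close(fd);
}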
+/*
+ * Test 13 - F_SETLKW on locked region, race with owner
+ *
+ * If a shared or exclusive lock is blocked by other locks, the
+ * process waits until the request can be satisfied. This test
+ * repeatedly races a blocked request against the lock owner
+ * exiting (and thereby releasing its lock).
+ */
+static int
+test13(int fd)
+{
+	/*
+	 * We create a child process to hold the lock which we will
+	 * test. We use a pipe to communicate with the child.
+	 */
+	int i;
+	int pid;
+	int pfd[2];
+	struct flock fl;
+	char ch;
+	int res;
+	struct itimerval itv;
+
+	printf("13 - F_SETLKW on locked region, race with owner: ");
+	fflush(stdout);
+
+	for (i = 0; i < 100; i++) {
+		if (pipe(pfd) < 0)
+			err(1, "pipe");
+
+		fl.l_start = 0;
+		fl.l_len = 0;
+		fl.l_type = F_WRLCK;
+		fl.l_whence = SEEK_SET;
+
+		pid = fork();
+		if (pid < 0)
+			err(1, "fork");
+
+		if (pid == 0) {
+			/*
+			 * We are the child. We set a write lock and
+			 * then write one byte back to the parent to
+			 * tell it. We then exit almost immediately,
+			 * releasing the lock as we go.
+			 */
+			if (fcntl(fd, F_SETLK, &fl) < 0)
+				err(1, "F_SETLK (child)");
+			if (write(pfd[1], "a", 1) < 0)
+				err(1, "writing to pipe (child)");
+
+			usleep(1);
+			exit(0);
+		}
+
+		/*
+		 * Wait until the child has set its lock and then
+		 * perform the test.
+		 */
+		if (read(pfd[0], &ch, 1) != 1)
+			err(1, "reading from pipe (parent)");
+
+		/*
+		 * Arrange for a timer signal to arrive while we are
+		 * (probably) blocked in fcntl. The F_SETLKW must
+		 * either succeed (the child exited first) or fail
+		 * with EINTR (the timer fired first).
+		 */
+		itv.it_interval.tv_sec = 0;
+		itv.it_interval.tv_usec = 0;
+		itv.it_value.tv_sec = 0;
+		itv.it_value.tv_usec = 2;
+		setitimer(ITIMER_REAL, &itv, NULL);
+
+		res = fcntl(fd, F_SETLKW, &fl);
+		kill(pid, SIGTERM);
+		if (waitpid(pid, 0, 0) != pid)
+			err(1, "waitpid");
+		close(pfd[0]);
+		close(pfd[1]);
+		FAIL(!(res == 0 || (res == -1 && errno == EINTR)));
+
+		fl.l_start = 0;
+		fl.l_len = 0;
+		fl.l_type = F_UNLCK;
+		if (fcntl(fd, F_SETLK, &fl) < 0)
+			err(1, "F_UNLCK");
+	}
+	SUCCEED;
+}
+
+int
+main(int argc, const char *argv[])
+{
+	int fd;
+	struct sigaction sa;
+
+	if (argc != 2) {
+		errx(1, "usage: flock <filename>");
+	}
+
+	fd = make_file(argv[1], 1024);
+
+	sa.sa_handler = ignore_alarm;
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = 0;
+	sigaction(SIGALRM, &sa, 0);
+
+	test1(fd);
+	test2(fd);
+	test3(fd);
+	test4(fd);
+	test5(fd);
+	test6(fd);
+	test7(fd);
+	test8(fd);
+	test9(fd);
+	test10(fd);
+	test11(fd);
+	test12(fd);
+	test13(fd);
+
+	return 0;
+}
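A final note on main(): sa_flags is deliberately left at zero when installing the SIGALRM handler. If SA_RESTART were set, a blocked F_SETLKW would, on most systems, be transparently restarted after the handler ran, and tests 3 and 13, which expect EINTR, could hang. A sketch of the distinction, with an invented handler name; this is editorial, not part of the patch.

#include <signal.h>

static void
on_alarm(int sig)
{
	(void)sig;		/* nothing to do; we only want EINTR */
}

static void
install_alarm_handler(int restartable)
{
	struct sigaction sa;

	sa.sa_handler = on_alarm;
	sigemptyset(&sa.sa_mask);
	/*
	 * sa_flags == 0: a pending fcntl(F_SETLKW) fails with EINTR
	 * when SIGALRM arrives. With SA_RESTART the call is typically
	 * restarted and the caller never sees the interruption.
	 */
	sa.sa_flags = restartable ? SA_RESTART : 0;
	sigaction(SIGALRM, &sa, NULL);
}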