diff -urN /Projects/clean/src/lib/libc/gen/lockf.c /Projects/M3/src/lib/libc/gen/lockf.c
--- /Projects/clean/src/lib/libc/gen/lockf.c	2008-01-19 15:54:31.000000000 +0000
+++ /Projects/M3/src/lib/libc/gen/lockf.c	2008-01-30 10:35:45.000000000 +0000
@@ -74,7 +74,7 @@
 	fl.l_type = F_WRLCK;
 	if (_fcntl(filedes, F_GETLK, &fl) == -1)
 		return (-1);
-	if (fl.l_type == F_UNLCK || fl.l_pid == getpid())
+	if (fl.l_type == F_UNLCK || (fl.l_sysid == 0 && fl.l_pid == getpid()))
 		return (0);
 	errno = EAGAIN;
 	return (-1);
diff -urN /Projects/clean/src/lib/libc/sys/fcntl.2 /Projects/M3/src/lib/libc/sys/fcntl.2
--- /Projects/clean/src/lib/libc/sys/fcntl.2	2008-01-19 15:54:32.000000000 +0000
+++ /Projects/M3/src/lib/libc/sys/fcntl.2	2008-01-30 10:35:48.000000000 +0000
@@ -177,6 +177,7 @@
 	pid_t	l_pid;		/* lock owner */
 	short	l_type;		/* lock type: read/write, etc. */
 	short	l_whence;	/* type of l_start */
+	int	l_sysid;	/* remote system id or zero for local */
 };
 .Ed
 The commands available for advisory record locking are as follows:
@@ -264,9 +265,13 @@
 means end edge of the region.
 The
 .Fa l_pid
-field is only used with
+and
+.Fa l_sysid
+fields are only used with
 .Dv F_GETLK
-to return the process ID of the process holding a blocking lock.
+to return the process ID of the process holding a blocking lock and
+the system ID of the system that owns that process.
+Locks created by the local system will have a system ID of zero.
 After a successful
 .Dv F_GETLK
 request, the value of
diff -urN /Projects/clean/src/sys/compat/linux/linux_file.c /Projects/M3/src/sys/compat/linux/linux_file.c
--- /Projects/clean/src/sys/compat/linux/linux_file.c	2008-01-19 15:54:38.000000000 +0000
+++ /Projects/M3/src/sys/compat/linux/linux_file.c	2008-01-30 10:35:05.000000000 +0000
@@ -1051,6 +1051,7 @@
 	bsd_flock->l_start = (off_t)linux_flock->l_start;
 	bsd_flock->l_len = (off_t)linux_flock->l_len;
 	bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
+	bsd_flock->l_sysid = 0;
 }
 
 static void
@@ -1107,6 +1108,7 @@
 	bsd_flock->l_start = (off_t)linux_flock->l_start;
 	bsd_flock->l_len = (off_t)linux_flock->l_len;
 	bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
+	bsd_flock->l_sysid = 0;
 }
 
 static void
diff -urN /Projects/clean/src/sys/compat/svr4/svr4_fcntl.c /Projects/M3/src/sys/compat/svr4/svr4_fcntl.c
--- /Projects/clean/src/sys/compat/svr4/svr4_fcntl.c	2008-01-19 15:54:40.000000000 +0000
+++ /Projects/M3/src/sys/compat/svr4/svr4_fcntl.c	2008-01-30 10:35:06.000000000 +0000
@@ -191,7 +191,7 @@
 	oflp->l_start = (off_t) iflp->l_start;
 	oflp->l_len = (off_t) iflp->l_len;
 	oflp->l_pid = (pid_t) iflp->l_pid;
-
+	oflp->l_sysid = iflp->l_sysid;
 }
 
 static void
@@ -217,7 +217,7 @@
 	oflp->l_whence = (short) iflp->l_whence;
 	oflp->l_start = (svr4_off64_t) iflp->l_start;
 	oflp->l_len = (svr4_off64_t) iflp->l_len;
-	oflp->l_sysid = 0;
+	oflp->l_sysid = iflp->l_sysid;
 	oflp->l_pid = (svr4_pid_t) iflp->l_pid;
 }
 
diff -urN /Projects/clean/src/sys/i386/ibcs2/ibcs2_fcntl.c /Projects/M3/src/sys/i386/ibcs2/ibcs2_fcntl.c
--- /Projects/clean/src/sys/i386/ibcs2/ibcs2_fcntl.c	2008-01-19 15:54:41.000000000 +0000
+++ /Projects/M3/src/sys/i386/ibcs2/ibcs2_fcntl.c	2008-01-30 10:35:27.000000000 +0000
@@ -93,7 +93,7 @@
 	iflp->l_whence = (short)flp->l_whence;
 	iflp->l_start = (ibcs2_off_t)flp->l_start;
 	iflp->l_len = (ibcs2_off_t)flp->l_len;
-	iflp->l_sysid = 0;
+	iflp->l_sysid = flp->l_sysid;
 	iflp->l_pid = (ibcs2_pid_t)flp->l_pid;
 }
 
@@ -127,6 +127,7 @@
 		break;
 	}
 	flp->l_whence = iflp->l_whence;
+	flp->l_sysid = iflp->l_sysid;
 }
 
 /* convert iBCS2 mode into NetBSD mode */
diff -urN 
/Projects/clean/src/sys/kern/kern_descrip.c /Projects/M3/src/sys/kern/kern_descrip.c --- /Projects/clean/src/sys/kern/kern_descrip.c 2008-01-19 15:54:42.000000000 +0000 +++ /Projects/M3/src/sys/kern/kern_descrip.c 2008-01-30 10:35:31.000000000 +0000 @@ -316,28 +316,67 @@ fcntl(struct thread *td, struct fcntl_args *uap) { struct flock fl; + struct oflock ofl; intptr_t arg; int error; + int cmd; error = 0; + cmd = uap->cmd; switch (uap->cmd) { - case F_GETLK: - case F_SETLK: - case F_SETLKW: - error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); + case F_OGETLK: + case F_OSETLK: + case F_OSETLKW: + /* + * Convert old flock structure to new. + */ + error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl)); + fl.l_start = ofl.l_start; + fl.l_len = ofl.l_len; + fl.l_pid = ofl.l_pid; + fl.l_type = ofl.l_type; + fl.l_whence = ofl.l_whence; + fl.l_sysid = 0; + + switch (uap->cmd) { + case F_OGETLK: + cmd = F_GETLK; + break; + case F_OSETLK: + cmd = F_SETLK; + break; + case F_OSETLKW: + cmd = F_SETLKW; + break; + } arg = (intptr_t)&fl; break; + case F_GETLK: + case F_SETLK: + case F_SETLKW: + case F_SETLK_REMOTE: + error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); + arg = (intptr_t)&fl; + break; default: arg = uap->arg; break; } if (error) return (error); - error = kern_fcntl(td, uap->fd, uap->cmd, arg); + error = kern_fcntl(td, uap->fd, cmd, arg); if (error) return (error); - if (uap->cmd == F_GETLK) + if (uap->cmd == F_OGETLK) { + ofl.l_start = fl.l_start; + ofl.l_len = fl.l_len; + ofl.l_pid = fl.l_pid; + ofl.l_type = fl.l_type; + ofl.l_whence = fl.l_whence; + error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl)); + } else if (uap->cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); + } return (error); } @@ -490,11 +529,16 @@ fdrop(fp, td); break; + case F_SETLK_REMOTE: + flg = F_REMOTE; + goto do_setlk; + case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: + do_setlk: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); @@ -550,7 +594,19 @@ break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, - flp, F_POSIX); + flp, flg); + break; + case F_UNLCKSYS: + /* + * Temporary api for testing remote lock + * infrastructure. + */ + if (flg != F_REMOTE) { + error = EINVAL; + break; + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCKSYS, flp, flg); break; default: error = EINVAL; diff -urN /Projects/clean/src/sys/kern/kern_lockf.c /Projects/M3/src/sys/kern/kern_lockf.c --- /Projects/clean/src/sys/kern/kern_lockf.c 2008-01-19 15:54:43.000000000 +0000 +++ /Projects/M3/src/sys/kern/kern_lockf.c 2008-01-30 10:35:31.000000000 +0000 @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -57,6 +58,7 @@ */ static int maxlockdepth = MAXDEPTH; +#define LOCKF_DEBUG #ifdef LOCKF_DEBUG #include @@ -80,36 +82,185 @@ lf_getblock(struct lockf *); static int lf_getlock(struct lockf *, struct flock *); static int lf_setlock(struct lockf *, struct vnode *, struct lockf **); -static void lf_split(struct lockf *, struct lockf *, struct lockf **); +static void lf_split(struct lockf *, struct lockf *, struct lockf **, + int addlock); static void lf_wakelock(struct lockf *); +void lf_clearremotesys(int sysid); #ifdef LOCKF_DEBUG static void lf_print(char *, struct lockf *); static void lf_printlist(char *, struct lockf *); +static void lf_print_owner(struct lock_owner *); #endif /* + * This structure is used to keep track of both local and remote lock + * owners. 
The lf_owner field of the struct lockf points back at the + * lock owner structure. Each possible lock owner (local proc for + * POSIX fcntl locks, local file for BSD flock locks or + * pair for remote locks) is represented by a unique instance of + * struct lock_owner. + * + * Locks: + * (l) locked by lf_lock_owners_mutex + * (p) locked by mtx_pool_lock(mtxpool_sleep, lo) + * (c) const until freeing + */ +#define LOCK_OWNER_HASH_SIZE 256 + +struct lock_owner { + LIST_ENTRY(lock_owner) lo_link; /* (l) hash chain */ + int lo_refs; /* (l) Number of locks referring to this */ + int lo_flags; /* (c) Flags passwd to lf_advlock */ + caddr_t lo_id; /* (c) Id value passed to lf_advlock */ + pid_t lo_pid; /* (c) Process Id of the lock owner */ + int lo_sysid; /* (c) System Id of the lock owner */ + struct locklist lo_active; /* (p) Active locks for this owner */ + struct locklist lo_pending; /* (p) Pending locks for this owner */ +}; + +LIST_HEAD(lock_owner_list, lock_owner); + +static struct mtx lf_lock_owners_mutex; +static struct lock_owner_list lf_lock_owners[LOCK_OWNER_HASH_SIZE]; /* (l) */ + +/* + * Initialise the lock owner structures. + */ +static void +lf_init(void *dummy) +{ + int i; + + mtx_init(&lf_lock_owners_mutex, "lock owners lock", NULL, MTX_DEF); + for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) + LIST_INIT(&lf_lock_owners[i]); +} +SYSINIT(lf_init, SI_SUB_LOCK, SI_ORDER_FIRST, lf_init, NULL) + +/* + * Generate a hash value for a lock owner. + */ +static int +lf_hash_owner(caddr_t id, struct flock *fl, int flags) +{ + uint32_t h; + + if (flags & F_REMOTE) { + h = HASHSTEP(0, fl->l_pid); + h = HASHSTEP(h, fl->l_sysid); + } else if (flags & F_FLOCK) { + h = ((uintptr_t) id) >> 7; + } else { + struct proc *p = (struct proc *) id; + h = HASHSTEP(0, p->p_pid); + h = HASHSTEP(h, 0); + } + + return (h % LOCK_OWNER_HASH_SIZE); +} + +/* + * Return true if a lock owner matches the details passed to + * lf_advlock. + */ +static int +lf_owner_matches(struct lock_owner *lo, caddr_t id, struct flock *fl, + int flags) +{ + if (flags & F_REMOTE) { + return lo->lo_pid == fl->l_pid + && lo->lo_sysid == fl->l_sysid; + } else { + return lo->lo_id == id; + } +} + +/* * Advisory record locking support */ int -lf_advlock(ap, head, size) - struct vop_advlock_args /* { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - } */ *ap; - struct lockf **head; - u_quad_t size; +lf_advlock(struct vop_advlock_args *ap, struct lockf **head, u_quad_t size) { struct flock *fl = ap->a_fl; struct lockf *lock; struct vnode *vp = ap->a_vp; + caddr_t id = ap->a_id; + int flags = ap->a_flags; + int hash; + struct lock_owner *lo; off_t start, end, oadd; struct lockf *clean, *n; int error; /* + * Handle the F_UNLKSYS case first - no need to mess about + * creating a lock owner for this one. + */ + if (ap->a_op == F_UNLCKSYS) { + lf_clearremotesys(fl->l_sysid); + return (0); + } + + /* + * Map our arguments to an existing lock owner or create one + * if this is the first time we have seen this owner. + */ + hash = lf_hash_owner(id, fl, flags); + mtx_lock(&lf_lock_owners_mutex); + LIST_FOREACH(lo, &lf_lock_owners[hash], lo_link) + if (lf_owner_matches(lo, id, fl, flags)) + break; + if (!lo) { + /* + * We initialise the lock with a reference + * count of one which refers to the new lockf + * structure created below. 
+ */ + lo = malloc(sizeof(struct lock_owner), + M_LOCKF, M_NOWAIT); + if (!lo) { + mtx_unlock(&lf_lock_owners_mutex); + return (ENOMEM); + } + + lo->lo_refs = 1; + lo->lo_flags = flags; + lo->lo_id = id; + if (flags & F_REMOTE) { + lo->lo_pid = fl->l_pid; + lo->lo_sysid = fl->l_sysid; + } else if (flags & F_FLOCK) { + lo->lo_pid = -1; + lo->lo_sysid = 0; + } else { + struct proc *p = (struct proc *) id; + lo->lo_pid = p->p_pid; + lo->lo_sysid = 0; + } + TAILQ_INIT(&lo->lo_active); + TAILQ_INIT(&lo->lo_pending); + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + printf("lf_advlock: new lock owner %p ", lo); + lf_print_owner(lo); + printf("\n"); + } +#endif + + LIST_INSERT_HEAD(&lf_lock_owners[hash], + lo, lo_link); + } else { + /* + * We have seen this lock owner before, + * increase its reference count to account for + * the new lockf struct we create below. + */ + lo->lo_refs++; + } + mtx_unlock(&lf_lock_owners_mutex); + + /* * Convert the flock structure into a start and end. */ switch (fl->l_whence) { @@ -165,6 +316,7 @@ clean = NULL; if (ap->a_op == F_SETLK || ap->a_op == F_UNLCK) { MALLOC(clean, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + clean->lf_owner = 0; clean->lf_next = NULL; } /* @@ -173,7 +325,8 @@ MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); lock->lf_start = start; lock->lf_end = end; - lock->lf_id = ap->a_id; + lock->lf_owner = lo; + lock->lf_vnode = vp; /* * XXX The problem is that VTOI is ufs specific, so it will * break LOCKF_DEBUG for all other FS's other than UFS because @@ -186,6 +339,7 @@ lock->lf_next = (struct lockf *)0; TAILQ_INIT(&lock->lf_blkhd); lock->lf_flags = ap->a_flags; + /* * Do the requested operation. */ @@ -215,6 +369,31 @@ } VI_UNLOCK(vp); for (lock = clean; lock != NULL; ) { + /* + * Adjust the lock_owner reference count and + * reclaim the entry if this is the last lock + * for that owner. + */ + struct lock_owner *lo = lock->lf_owner; + if (lo) { + mtx_lock(&lf_lock_owners_mutex); + lo->lo_refs--; + if (lo->lo_refs == 0) { +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + printf("lf_advlock: freeing lock owner %p\n", + lo); +#endif + KASSERT(TAILQ_EMPTY(&lo->lo_active), + ("freeing lock owner with active locks")); + KASSERT(TAILQ_EMPTY(&lo->lo_pending), + ("freeing lock owner with pending locks")); + LIST_REMOVE(lo, lo_link); + free(lo, M_LOCKF); + } + mtx_unlock(&lf_lock_owners_mutex); + } + n = lock->lf_next; free(lock, M_LOCKF); lock = n; @@ -226,10 +405,7 @@ * Set a byte-range lock. */ static int -lf_setlock(lock, vp, clean) - struct lockf *lock; - struct vnode *vp; - struct lockf **clean; +lf_setlock(struct lockf *lock, struct vnode *vp, struct lockf **clean) { struct lockf *block; struct lockf **head = lock->lf_head; @@ -267,48 +443,57 @@ * For byte-range locks we must check for deadlock. * * Deadlock detection is done by looking through the - * wait channels to see if there are any cycles that - * involve us. MAXDEPTH is set just to make sure we - * do not go off into neverland. + * lock owner pending lists to see if there are any + * cycles that involve us. MAXDEPTH is set just to + * make sure we do not go off into neverland. + * + * This algorithm is simplistic - it only considers + * the first blocking lock and it doesn't follow all + * paths through the lock graph. 
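+		 * The loop below starts at the owner of the blocking
+		 * lock and repeatedly follows one of that owner's
+		 * pending requests to the owner of the lock it is
+		 * waiting for; if the chain reaches our own lock owner
+		 * within maxlockdepth steps, the request fails with
+		 * EDEADLK.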
*/ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) { - struct proc *wproc; - struct proc *nproc; - struct thread *td; struct lockf *waitblock; - int i = 0; - - /* The block is waiting on something */ - wproc = (struct proc *)block->lf_id; -restart: - nproc = NULL; - PROC_SLOCK(wproc); - FOREACH_THREAD_IN_PROC(wproc, td) { - thread_lock(td); - while (td->td_wchan && - (td->td_wmesg == lockstr) && - (i++ < maxlockdepth)) { - waitblock = (struct lockf *)td->td_wchan; - /* Get the owner of the blocking lock */ - waitblock = waitblock->lf_next; - if ((waitblock->lf_flags & F_POSIX) == 0) - break; - nproc = (struct proc *)waitblock->lf_id; - if (nproc == (struct proc *)lock->lf_id) { - PROC_SUNLOCK(wproc); - thread_unlock(td); + struct lockf *nblock; + struct lock_owner *lo; + struct lock_owner *nlo; + int i; + + lo = block->lf_owner; + i = 0; + while (lo) { + if (i++ == maxlockdepth) + break; + mtx_pool_lock(mtxpool_sleep, lo); + nlo = NULL; + TAILQ_FOREACH(waitblock, &lo->lo_pending, + lf_olock) { + /* + * Get the owner of the + * blocking lock. + * + * XXX this is unsafe - if + * waitblock is on a different + * vnode to this one, our + * vnode interlock will not + * protect us against changes + * to waitblock->lf_next. + */ + nblock = waitblock->lf_next; + if ((nblock->lf_flags & F_POSIX) == 0) + continue; + nlo = nblock->lf_owner; + if (nlo == lock->lf_owner) { + mtx_pool_unlock(mtxpool_sleep, + lo); lock->lf_next = *clean; *clean = lock; return (EDEADLK); } } - thread_unlock(td); + mtx_pool_unlock(mtxpool_sleep, block->lf_owner); + lo = nlo; } - PROC_SUNLOCK(wproc); - wproc = nproc; - if (wproc) - goto restart; } /* * For flock type locks, we must first remove @@ -327,6 +512,9 @@ */ lock->lf_next = block; TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_pending, lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); #ifdef LOCKF_DEBUG if (lockf_debug & 1) { lf_print("lf_setlock: blocking on", block); @@ -344,6 +532,10 @@ */ if (lock->lf_next) { TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_REMOVE(&lock->lf_owner->lo_pending, lock, + lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); lock->lf_next = NOLOCKF; } if (error) { @@ -381,6 +573,10 @@ if (needtolink) { *prev = lock; lock->lf_next = overlap; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } break; @@ -412,8 +608,12 @@ *prev = lock; lock->lf_next = overlap; overlap->lf_start = lock->lf_end + 1; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } else - lf_split(overlap, lock, clean); + lf_split(overlap, lock, clean, TRUE); lf_wakelock(overlap); break; @@ -438,13 +638,21 @@ /* * Add the new lock if necessary and delete the overlap. 
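+		 * The overlap was found with the SELF flag set, so it
+		 * is owned by the same lock owner as the new lock (the
+		 * KASSERT below checks this) and a single owner mutex
+		 * covers both of the active list updates.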
*/ + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + KASSERT(lock->lf_owner == overlap->lf_owner, + ("unexpected lock owner for overlap")); if (needtolink) { *prev = lock; lock->lf_next = overlap->lf_next; prev = &lock->lf_next; + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); needtolink = 0; } else *prev = overlap->lf_next; + TAILQ_REMOVE(&lock->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); overlap->lf_next = *clean; *clean = overlap; continue; @@ -457,6 +665,10 @@ overlap->lf_next = lock; overlap->lf_end = lock->lf_start - 1; prev = &lock->lf_next; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); lf_wakelock(overlap); needtolink = 0; continue; @@ -468,6 +680,10 @@ if (needtolink) { *prev = lock; lock->lf_next = overlap; + mtx_pool_lock(mtxpool_sleep, lock->lf_owner); + TAILQ_INSERT_TAIL(&lock->lf_owner->lo_active, + lock, lf_olock); + mtx_pool_unlock(mtxpool_sleep, lock->lf_owner); } overlap->lf_start = lock->lf_end + 1; lf_wakelock(overlap); @@ -491,12 +707,10 @@ * and remove it (or shrink it), then wakeup anyone we can. */ static int -lf_clearlock(unlock, clean) - struct lockf *unlock; - struct lockf **clean; +lf_clearlock(struct lockf *unlock, struct lockf **clean) { struct lockf **head = unlock->lf_head; - register struct lockf *lf = *head; + struct lockf *lf = *head; struct lockf *overlap, **prev; int ovcase; @@ -521,6 +735,10 @@ *prev = overlap->lf_next; overlap->lf_next = *clean; *clean = overlap; + mtx_pool_lock(mtxpool_sleep, overlap->lf_owner); + TAILQ_REMOVE(&overlap->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, overlap->lf_owner); break; case 2: /* overlap contains lock: split it */ @@ -528,8 +746,7 @@ overlap->lf_start = unlock->lf_end + 1; break; } - lf_split(overlap, unlock, clean); - overlap->lf_next = unlock->lf_next; + lf_split(overlap, unlock, clean, FALSE); break; case 3: /* lock contains overlap */ @@ -537,6 +754,10 @@ lf = overlap->lf_next; overlap->lf_next = *clean; *clean = overlap; + mtx_pool_lock(mtxpool_sleep, overlap->lf_owner); + TAILQ_REMOVE(&overlap->lf_owner->lo_active, + overlap, lf_olock); + mtx_pool_unlock(mtxpool_sleep, overlap->lf_owner); continue; case 4: /* overlap starts before lock */ @@ -563,11 +784,9 @@ * and if so return its process identifier. */ static int -lf_getlock(lock, fl) - register struct lockf *lock; - register struct flock *fl; +lf_getlock(struct lockf *lock, struct flock *fl) { - register struct lockf *block; + struct lockf *block; #ifdef LOCKF_DEBUG if (lockf_debug & 1) @@ -582,10 +801,8 @@ fl->l_len = 0; else fl->l_len = block->lf_end - block->lf_start + 1; - if (block->lf_flags & F_POSIX) - fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; - else - fl->l_pid = -1; + fl->l_pid = block->lf_owner->lo_pid; + fl->l_sysid = block->lf_owner->lo_sysid; } else { fl->l_type = F_UNLCK; } @@ -597,8 +814,7 @@ * return the first blocking lock. */ static struct lockf * -lf_getblock(lock) - register struct lockf *lock; +lf_getblock(struct lockf *lock) { struct lockf **prev, *overlap, *lf = *(lock->lf_head); int ovcase; @@ -627,12 +843,8 @@ * may be more than one. 
*/ static int -lf_findoverlap(lf, lock, type, prev, overlap) - register struct lockf *lf; - struct lockf *lock; - int type; - struct lockf ***prev; - struct lockf **overlap; +lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, + struct lockf ***prev, struct lockf **overlap) { off_t start, end; @@ -646,8 +858,8 @@ start = lock->lf_start; end = lock->lf_end; while (lf != NOLOCKF) { - if (((type & SELF) && lf->lf_id != lock->lf_id) || - ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + if (((type & SELF) && lf->lf_owner != lock->lf_owner) || + ((type & OTHERS) && lf->lf_owner == lock->lf_owner)) { *prev = &lf->lf_next; *overlap = lf = lf->lf_next; continue; @@ -733,14 +945,13 @@ } /* - * Split a lock and a contained region into - * two or three locks as necessary. + * Split a lock and a contained region into two or three locks as + * necessary. If addlock is TRUE, lock2 is being set so it must be + * added to the list, otherwise it is being cleared. */ static void -lf_split(lock1, lock2, split) - struct lockf *lock1; - struct lockf *lock2; - struct lockf **split; +lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **split, + int addlock) { struct lockf *splitlock; @@ -755,13 +966,16 @@ */ if (lock1->lf_start == lock2->lf_start) { lock1->lf_start = lock2->lf_end + 1; - lock2->lf_next = lock1; + if (addlock) + lock2->lf_next = lock1; return; } if (lock1->lf_end == lock2->lf_end) { lock1->lf_end = lock2->lf_start - 1; - lock2->lf_next = lock1->lf_next; - lock1->lf_next = lock2; + if (addlock) { + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + } return; } /* @@ -773,6 +987,15 @@ KASSERT(splitlock != NULL, ("no split")); *split = splitlock->lf_next; bcopy(lock1, splitlock, sizeof *splitlock); + + /* + * Update the lock owner reference count to account for the + * new lock. 
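+	 * The reference is dropped again when the split-off entry is
+	 * eventually unlocked and released through the clean list in
+	 * lf_advlock().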
+ */ + mtx_lock(&lf_lock_owners_mutex); + splitlock->lf_owner->lo_refs++; + mtx_unlock(&lf_lock_owners_mutex); + splitlock->lf_start = lock2->lf_end + 1; TAILQ_INIT(&splitlock->lf_blkhd); lock1->lf_end = lock2->lf_start - 1; @@ -780,23 +1003,37 @@ * OK, now link it in */ splitlock->lf_next = lock1->lf_next; - lock2->lf_next = splitlock; - lock1->lf_next = lock2; + mtx_pool_lock(mtxpool_sleep, lock1->lf_owner); + TAILQ_INSERT_TAIL(&lock1->lf_owner->lo_active, splitlock, lf_olock); + if (addlock) { + KASSERT(lock1->lf_owner == lock2->lf_owner, + ("unexpected lock owner for split")); + TAILQ_INSERT_TAIL(&lock1->lf_owner->lo_active, lock2, + lf_olock); + lock2->lf_next = splitlock; + lock1->lf_next = lock2; + } else { + lock1->lf_next = splitlock; + } + mtx_pool_unlock(mtxpool_sleep, lock1->lf_owner); } /* * Wakeup a blocklist */ static void -lf_wakelock(listhead) - struct lockf *listhead; +lf_wakelock(struct lockf *listhead) { - register struct lockf *wakelock; + struct lockf *wakelock; while (!TAILQ_EMPTY(&listhead->lf_blkhd)) { wakelock = TAILQ_FIRST(&listhead->lf_blkhd); TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); wakelock->lf_next = NOLOCKF; + mtx_pool_lock(mtxpool_sleep, wakelock->lf_owner); + TAILQ_REMOVE(&wakelock->lf_owner->lo_pending, wakelock, + lf_olock); + mtx_pool_unlock(mtxpool_sleep, wakelock->lf_owner); #ifdef LOCKF_DEBUG if (lockf_debug & 2) lf_print("lf_wakelock: awakening", wakelock); @@ -805,21 +1042,94 @@ } } +struct clearlock { + STAILQ_ENTRY(clearlock) link; + struct vnode *vp; + struct flock fl; +}; +STAILQ_HEAD(clearlocklist, clearlock); + +void +lf_clearremotesys(int sysid) +{ + int i; + struct lock_owner *lo; + struct lockf *lf; + struct clearlock *cl; + struct clearlocklist locks; + + KASSERT(sysid != 0, ("Can't clear local locks with F_UNLCKSYS")); + + /* + * In order to keep the locking simple, we iterate over the + * active lock lists to build a list of locks that need + * releasing. We then call VOP_ADVLOCK for each one in turn. + */ + STAILQ_INIT(&locks); + mtx_lock(&lf_lock_owners_mutex); + for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) { + LIST_FOREACH(lo, &lf_lock_owners[i], lo_link) { + if (lo->lo_sysid != sysid) + continue; + + mtx_pool_lock(mtxpool_sleep, lo); + TAILQ_FOREACH(lf, &lo->lo_active, lf_olock) { + cl = malloc(sizeof(struct clearlock), + M_LOCKF, M_NOWAIT); + if (!cl) + continue; + cl->vp = lf->lf_vnode; + cl->fl.l_start = lf->lf_start; + if (lf->lf_end == -1) + cl->fl.l_len = 0; + else + cl->fl.l_len = + lf->lf_end - lf->lf_start; + cl->fl.l_whence = SEEK_SET; + cl->fl.l_type = F_UNLCK; + cl->fl.l_pid = lo->lo_pid; + cl->fl.l_sysid = sysid; + STAILQ_INSERT_TAIL(&locks, cl, link); + } + mtx_pool_unlock(mtxpool_sleep, lo); + } + } + mtx_unlock(&lf_lock_owners_mutex); + + while ((cl = STAILQ_FIRST(&locks)) != NULL) { + STAILQ_REMOVE_HEAD(&locks, link); + VOP_ADVLOCK(cl->vp, 0, F_UNLCK, &cl->fl, F_REMOTE); + free(cl, M_LOCKF); + } +} + #ifdef LOCKF_DEBUG /* + * Print description of a lock owner + */ +static void +lf_print_owner(struct lock_owner *lo) +{ + + if (lo->lo_flags & F_REMOTE) { + printf("remote pid %d, system %d", + lo->lo_pid, lo->lo_sysid); + } else if (lo->lo_flags & F_FLOCK) { + printf("file %p", lo->lo_id); + } else { + printf("local pid %d", lo->lo_pid); + } +} + +/* * Print out a lock. 
*/ static void -lf_print(tag, lock) - char *tag; - register struct lockf *lock; +lf_print(char *tag, struct lockf *lock) { printf("%s: lock %p for ", tag, (void *)lock); - if (lock->lf_flags & F_POSIX) - printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid); - else - printf("id %p", (void *)lock->lf_id); + lf_print_owner(lock->lf_owner); if (lock->lf_inode != (struct inode *)0) printf(" in ino %ju on dev <%s>, %s, start %jd, end %jd", (uintmax_t)lock->lf_inode->i_number, @@ -841,11 +1151,9 @@ } static void -lf_printlist(tag, lock) - char *tag; - struct lockf *lock; +lf_printlist(char *tag, struct lockf *lock) { - register struct lockf *lf, *blk; + struct lockf *lf, *blk; if (lock->lf_inode == (struct inode *)0) return; @@ -855,11 +1163,7 @@ devtoname(lock->lf_inode->i_dev)); for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { printf("\tlock %p for ",(void *)lf); - if (lf->lf_flags & F_POSIX) - printf("proc %ld", - (long)((struct proc *)lf->lf_id)->p_pid); - else - printf("id %p", (void *)lf->lf_id); + lf_print_owner(lock->lf_owner); printf(", %s, start %jd, end %jd", lf->lf_type == F_RDLCK ? "shared" : lf->lf_type == F_WRLCK ? "exclusive" : @@ -867,11 +1171,7 @@ "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end); TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { printf("\n\t\tlock request %p for ", (void *)blk); - if (blk->lf_flags & F_POSIX) - printf("proc %ld", - (long)((struct proc *)blk->lf_id)->p_pid); - else - printf("id %p", (void *)blk->lf_id); + lf_print_owner(blk->lf_owner); printf(", %s, start %jd, end %jd", blk->lf_type == F_RDLCK ? "shared" : blk->lf_type == F_WRLCK ? "exclusive" : diff -urN /Projects/clean/src/sys/sys/fcntl.h /Projects/M3/src/sys/sys/fcntl.h --- /Projects/clean/src/sys/sys/fcntl.h 2008-01-19 15:54:44.000000000 +0000 +++ /Projects/M3/src/sys/sys/fcntl.h 2008-01-30 10:35:43.000000000 +0000 @@ -178,9 +178,13 @@ #define F_GETOWN 5 /* get SIGIO/SIGURG proc/pgrp */ #define F_SETOWN 6 /* set SIGIO/SIGURG proc/pgrp */ #endif -#define F_GETLK 7 /* get record locking information */ -#define F_SETLK 8 /* set record locking information */ -#define F_SETLKW 9 /* F_SETLK; wait if blocked */ +#define F_OGETLK 7 /* get record locking information */ +#define F_OSETLK 8 /* set record locking information */ +#define F_OSETLKW 9 /* F_SETLK; wait if blocked */ +#define F_GETLK 10 /* get record locking information */ +#define F_SETLK 11 /* set record locking information */ +#define F_SETLKW 12 /* F_SETLK; wait if blocked */ +#define F_SETLK_REMOTE 13 /* debugging support for remote locks */ /* file descriptor flags (F_GETFD, F_SETFD) */ #define FD_CLOEXEC 1 /* close-on-exec flag */ @@ -189,10 +193,12 @@ #define F_RDLCK 1 /* shared or read lock */ #define F_UNLCK 2 /* unlock */ #define F_WRLCK 3 /* exclusive or write lock */ +#define F_UNLCKSYS 4 /* purge locks for a given system ID */ #ifdef _KERNEL #define F_WAIT 0x010 /* Wait until lock is granted */ #define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ #define F_POSIX 0x040 /* Use POSIX semantics for lock */ +#define F_REMOTE 0x080 /* Lock owner is remote NFS client */ #endif /* @@ -205,6 +211,19 @@ pid_t l_pid; /* lock owner */ short l_type; /* lock type: read/write, etc. */ short l_whence; /* type of l_start */ + int l_sysid; /* remote system id or zero for local */ +}; + +/* + * Old advisory file segment locking data type, + * before adding l_sysid. 
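+ * The F_OGETLK, F_OSETLK and F_OSETLKW commands still take this
+ * layout; the fcntl() code in kern_descrip.c converts it to and from
+ * the new struct flock so that existing binaries keep working.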
+ */ +struct oflock { + off_t l_start; /* starting offset */ + off_t l_len; /* len = 0 means until end of file */ + pid_t l_pid; /* lock owner */ + short l_type; /* lock type: read/write, etc. */ + short l_whence; /* type of l_start */ }; diff -urN /Projects/clean/src/sys/sys/lockf.h /Projects/M3/src/sys/sys/lockf.h --- /Projects/clean/src/sys/sys/lockf.h 2008-01-19 15:44:41.000000000 +0000 +++ /Projects/M3/src/sys/sys/lockf.h 2008-01-30 10:35:43.000000000 +0000 @@ -53,12 +53,14 @@ short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ off_t lf_start; /* Byte # of the start of the lock */ off_t lf_end; /* Byte # of the end of the lock (-1=EOF) */ - caddr_t lf_id; /* Id of the resource holding the lock */ + struct lock_owner *lf_owner; /* Owner of the lock */ + struct vnode *lf_vnode; /* File being locked (only valid for active lock) */ struct lockf **lf_head; /* Back pointer to the head of the lockf list */ struct inode *lf_inode; /* Back pointer to the inode */ struct lockf *lf_next; /* Pointer to the next lock on this inode */ struct locklist lf_blkhd; /* List of requests blocked on this lock */ TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */ + TAILQ_ENTRY(lockf) lf_olock;/* Linkage for owner lock lists */ }; /* Maximum length of sleep chains to traverse to try and detect deadlock. */ diff -urN /Projects/clean/src/tools/regression/file/flock/Makefile /Projects/M3/src/tools/regression/file/flock/Makefile --- /Projects/clean/src/tools/regression/file/flock/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ /Projects/M3/src/tools/regression/file/flock/Makefile 2008-01-30 10:35:49.000000000 +0000 @@ -0,0 +1,8 @@ +# $FreeBSD$ + +PROG= flock +NO_MAN= +WARNS?= 6 +DEBUG_FLAGS= -g -O0 + +.include diff -urN /Projects/clean/src/tools/regression/file/flock/flock.c /Projects/M3/src/tools/regression/file/flock/flock.c --- /Projects/clean/src/tools/regression/file/flock/flock.c 1970-01-01 01:00:00.000000000 +0100 +++ /Projects/M3/src/tools/regression/file/flock/flock.c 2008-01-30 10:35:49.000000000 +0000 @@ -0,0 +1,931 @@ +/*- + * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ + * Authors: Doug Rabson + * Developed with Red Inc: Alfred Perlstein + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#define HAVE_SYSID +#include +#else +#define __unused +#endif + +static int +make_file(const char *dir, off_t sz) +{ + const char *template = "/flocktempXXXXXX"; + size_t len; + char *filename; + int fd; + + len = strlen(dir) + strlen(template) + 1; + filename = malloc(len); + strcpy(filename, dir); + strcat(filename, template); + fd = mkstemp(filename); + if (fd < 0) + err(1, "mkstemp"); + if (ftruncate(fd, sz) < 0) + err(1, "ftruncate"); + if (unlink(filename) < 0) + err(1, "unlink"); + free(filename); + + return (fd); +} + +static void +ignore_alarm(int __unused sig) +{ +} + +#define FAIL(test) \ + do { \ + if (test) { \ + printf("FAIL (%s)\n", #test); \ + return -1; \ + } \ + } while (0) + +#define SUCCEED \ + do { printf("SUCCEED\n"); return 0; } while (0) + +/* + * Test 1 - F_GETLK on unlocked region + * + * If no lock is found that would prevent this lock from being + * created, the structure is left unchanged by this function call + * except for the lock type which is set to F_UNLCK. + */ +static int +test1(int fd) +{ + struct flock fl1, fl2; + + memset(&fl1, 1, sizeof(fl1)); + fl1.l_type = F_WRLCK; + fl1.l_whence = SEEK_SET; + fl2 = fl1; + + if (fcntl(fd, F_GETLK, &fl1) < 0) + err(1, "F_GETLK"); + + printf("1 - F_GETLK on unlocked region: "); + FAIL(fl1.l_start != fl2.l_start); + FAIL(fl1.l_len != fl2.l_len); + FAIL(fl1.l_pid != fl2.l_pid); + FAIL(fl1.l_type != F_UNLCK); + FAIL(fl1.l_whence != fl2.l_whence); +#ifdef HAVE_SYSID + FAIL(fl1.l_sysid != fl2.l_sysid); +#endif + + SUCCEED; +} + +/* + * Test 2 - F_SETLK on locked region + * + * If a shared or exclusive lock cannot be set, fcntl returns + * immediately with EACCES or EAGAIN. + */ +static int +test2(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should return -1 with errno set to either EACCES or + * EAGAIN. + */ + printf("2 - F_SETLK on locked region: "); + res = fcntl(fd, F_SETLK, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + SUCCEED; +} + +/* + * Test 3 - F_SETLKW on locked region + * + * If a shared or exclusive lock is blocked by other locks, the + * process waits until the request can be satisfied. + * + * XXX this test hangs on FreeBSD NFS filesystems due to limitations + * in FreeBSD's client (and server) lockd implementation. + */ +static int +test3(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. 
+ */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("3 - F_SETLKW on locked region: "); + + alarm(1); + + res = fcntl(fd, F_SETLKW, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + FAIL(res == 0); + FAIL(errno != EINTR); + + SUCCEED; +} + +/* + * Test 4 - F_GETLK on locked region + * + * Get the first lock that blocks the lock. + */ +static int +test4(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 99; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should return a lock structure reflecting the lock we + * made in the child process. + */ + if (fcntl(fd, F_GETLK, &fl) < 0) + err(1, "F_GETLK"); + + printf("4 - F_GETLK on locked region: "); + FAIL(fl.l_start != 0); + FAIL(fl.l_len != 99); + FAIL(fl.l_type != F_WRLCK); + FAIL(fl.l_pid != pid); +#ifdef HAVE_SYSID + FAIL(fl.l_sysid != 0); +#endif + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + SUCCEED; +} + +/* + * Test 5 - F_SETLKW simple deadlock + * + * If a blocking shared lock request would cause a deadlock (i.e. the + * lock request is blocked by a process which is itself blocked on a + * lock currently owned by the process making the new request), + * EDEADLK is returned. + */ +static int +test5(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. Because our test relies on the child process being + * blocked on the parent's lock, we can't easily use a pipe to + * synchronize so we just sleep in the parent to given the + * child a chance to setup. + * + * To create the deadlock condition, we arrange for the parent + * to lock the first byte of the file and the child to lock + * the second byte. After locking the second byte, the child + * will attempt to lock the first byte of the file, and + * block. The parent will then attempt to lock the second byte + * (owned by the child) which should cause deadlock. 
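+	 *
+	 * In short: the parent holds byte 0 and requests byte 1; the
+	 * child holds byte 1 and is blocked requesting byte 0.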
+ */ + int pid; + struct flock fl; + int res; + + /* + * Lock the first byte in the parent. + */ + fl.l_start = 0; + fl.l_len = 1; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK 1 (parent)"); + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * Lock the second byte in the child and then block on + * the parent's lock. + */ + fl.l_start = 1; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + fl.l_start = 0; + if (fcntl(fd, F_SETLKW, &fl) < 0) + err(1, "F_SETLKW (child)"); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + sleep(1); + + /* + * fcntl should immediately return -1 with errno set to EDEADLK. + */ + printf("5 - F_SETLKW simple deadlock: "); + + fl.l_start = 1; + res = fcntl(fd, F_SETLKW, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + + FAIL(res == 0); + FAIL(errno != EDEADLK); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_UNLCK"); + + SUCCEED; +} + +/* + * Test 6 - F_SETLKW complex deadlock. + * + * This test involves three process, P, C1 and C2. We set things up so + * that P locks byte zero, C1 locks byte 1 and C2 locks byte 2. We + * also block C2 by attempting to lock byte zero. Lastly, P attempts + * to lock a range including byte 1 and 2. This represents a deadlock + * (due to C2's blocking attempt to lock byte zero). + */ +static int +test6(int fd) +{ + /* + * Because our test relies on the child process being blocked + * on the parent's lock, we can't easily use a pipe to + * synchronize so we just sleep in the parent to given the + * children a chance to setup. + */ + int pid1, pid2; + struct flock fl; + int res; + + /* + * Lock the first byte in the parent. + */ + fl.l_start = 0; + fl.l_len = 1; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK 1 (parent)"); + + pid1 = fork(); + if (pid1 < 0) + err(1, "fork"); + + if (pid1 == 0) { + /* + * C1 + * Lock the second byte in the child and then sleep + */ + fl.l_start = 1; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child1)"); + pause(); + exit(0); + } + + pid2 = fork(); + if (pid2 < 0) + err(1, "fork"); + + if (pid2 == 0) { + /* + * C2 + * Lock the third byte in the child and then block on + * the parent's lock. + */ + fl.l_start = 2; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child2)"); + fl.l_start = 0; + if (fcntl(fd, F_SETLKW, &fl) < 0) + err(1, "F_SETLKW (child2)"); + exit(0); + } + + /* + * Wait until the children have set their locks and then + * perform the test. + */ + sleep(1); + + /* + * fcntl should immediately return -1 with errno set to + * EDEADLK. If the alarm fires, we failed to detect the + * deadlock. + */ + alarm(1); + printf("6 - F_SETLKW complex deadlock: "); + + fl.l_start = 1; + fl.l_len = 2; + res = fcntl(fd, F_SETLKW, &fl); + kill(pid1, SIGTERM); + if (waitpid(pid1, 0, 0) != pid1) + err(1, "waitpid"); + kill(pid2, SIGTERM); + if (waitpid(pid2, 0, 0) != pid2) + err(1, "waitpid"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_UNLCK"); + + FAIL(res == 0); + FAIL(errno != EDEADLK); + + /* + * Cancel the alarm to avoid confusing later tests. 
+ */ + alarm(0); + + SUCCEED; +} + +/* + * Test 7 - F_SETLK shared lock on exclusive locked region + * + * If a shared or exclusive lock cannot be set, fcntl returns + * immediately with EACCES or EAGAIN. + */ +static int +test7(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("7 - F_SETLK shared lock on exclusive locked region: "); + + fl.l_type = F_RDLCK; + res = fcntl(fd, F_SETLK, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + SUCCEED; +} + +/* + * Test 8 - F_SETLK shared lock on share locked region + * + * When a shared lock is set on a segment of a file, other processes + * shall be able to set shared locks on that segment or a portion of + * it. + */ +static int +test8(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("8 - F_SETLK shared lock on share locked region: "); + + fl.l_type = F_RDLCK; + res = fcntl(fd, F_SETLK, &fl); + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_UNLCK; + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_UNLCK"); + + FAIL(res != 0); + + SUCCEED; +} + +/* + * Test 9 - F_SETLK exclusive lock on share locked region + * + * If a shared or exclusive lock cannot be set, fcntl returns + * immediately with EACCES or EAGAIN. + */ +static int +test9(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. 
+ */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + int res; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_RDLCK; + fl.l_whence = SEEK_SET; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + /* + * fcntl should wait until the alarm and then return -1 with + * errno set to EINTR. + */ + printf("9 - F_SETLK exclusive lock on share locked region: "); + + fl.l_type = F_WRLCK; + res = fcntl(fd, F_SETLK, &fl); + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + SUCCEED; +} + +/* + * Test 10 - trying to set bogus pid or sysid values + * + * The l_pid and l_sysid fields are only used with F_GETLK to return + * the process ID of the process holding a blocking lock and the + * system ID of the system that owns that process + */ +static int +test10(int fd) +{ + /* + * We create a child process to hold the lock which we will + * test. We use a pipe to communicate with the child. + */ + int pid; + int pfd[2]; + struct flock fl; + char ch; + + if (pipe(pfd) < 0) + err(1, "pipe"); + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_pid = 9999; + fl.l_sysid = 9999; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + /* + * We are the child. We set a write lock and then + * write one byte back to the parent to tell it. The + * parent will kill us when its done. + */ + if (fcntl(fd, F_SETLK, &fl) < 0) + err(1, "F_SETLK (child)"); + if (write(pfd[1], "a", 1) < 0) + err(1, "writing to pipe (child)"); + pause(); + exit(0); + } + + /* + * Wait until the child has set its lock and then perform the + * test. + */ + if (read(pfd[0], &ch, 1) != 1) + err(1, "reading from pipe (child)"); + + printf("10 - trying to set bogus pid or sysid values: "); + + if (fcntl(fd, F_GETLK, &fl) < 0) + err(1, "F_GETLK"); + + kill(pid, SIGTERM); + if (waitpid(pid, 0, 0) != pid) + err(1, "waitpid"); + close(pfd[0]); + close(pfd[1]); + + FAIL(fl.l_pid != pid); +#ifdef HAVE_SYSID + FAIL(fl.l_sysid != 0); +#endif + + SUCCEED; +} + +/* + * Test 11 - remote locks + * + * XXX temporary interface which will be removed when the kernel lockd + * is added. 
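+ *
+ * The test registers locks for two fake system IDs with
+ * F_SETLK_REMOTE, checks that they conflict with each other and are
+ * reported by F_GETLK, and then removes them again with F_UNLCK and
+ * F_UNLCKSYS.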
+ */ +static int +test11(int fd) +{ + struct flock fl; + int res; + + fl.l_start = 0; + fl.l_len = 0; + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_pid = 9999; + fl.l_sysid = 1; + + printf("11 - remote locks: "); + + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + fl.l_sysid = 2; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res == 0); + FAIL(errno != EACCES && errno != EAGAIN); + + res = fcntl(fd, F_GETLK, &fl); + FAIL(res != 0); + FAIL(fl.l_pid != 9999); + FAIL(fl.l_sysid != 1); + + fl.l_type = F_UNLCK; + fl.l_sysid = 1; + fl.l_start = 0; + fl.l_len = 0; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + fl.l_pid = 1234; + fl.l_sysid = 1; + fl.l_start = 0; + fl.l_len = 1; + fl.l_whence = SEEK_SET; + fl.l_type = F_RDLCK; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + fl.l_sysid = 2; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + fl.l_type = F_UNLCKSYS; + fl.l_sysid = 1; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + fl.l_type = F_WRLCK; + res = fcntl(fd, F_GETLK, &fl); + FAIL(res != 0); + FAIL(fl.l_pid != 1234); + FAIL(fl.l_sysid != 2); + + fl.l_type = F_UNLCKSYS; + fl.l_sysid = 2; + res = fcntl(fd, F_SETLK_REMOTE, &fl); + FAIL(res != 0); + + SUCCEED; +} + +int +main(int argc, const char *argv[]) +{ + int fd; + struct sigaction sa; + + if (argc != 2) { + errx(1, "usage: flock "); + } + + fd = make_file(argv[1], 1024); + + sa.sa_handler = ignore_alarm; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGALRM, &sa, 0); + + test1(fd); + test2(fd); + test3(fd); + test4(fd); + test5(fd); + test6(fd); + test7(fd); + test8(fd); + test9(fd); + test10(fd); + test11(fd); + + return 0; +}
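
As a usage illustration only (not part of the patch), here is a minimal sketch of how an application might act on the extended F_GETLK result. It assumes the l_sysid field and the F_SETLK_REMOTE definition added to sys/fcntl.h above; the file name lockprobe.c and the use of F_SETLK_REMOTE as a feature-test macro are assumptions made for the example, not part of any API.

/* lockprobe.c - report who holds a write lock on a file (illustrative only) */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	struct flock fl;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: lockprobe filename\n");
		return (1);
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return (1);
	}

	/* Probe the whole file for a conflicting write lock. */
	fl.l_start = 0;
	fl.l_len = 0;
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	if (fcntl(fd, F_GETLK, &fl) < 0) {
		perror("F_GETLK");
		return (1);
	}

	if (fl.l_type == F_UNLCK) {
		printf("no conflicting lock\n");
	} else {
#ifdef F_SETLK_REMOTE	/* this tree has struct flock.l_sysid */
		if (fl.l_sysid != 0)
			printf("blocked by remote owner: pid %d, sysid %d\n",
			    (int)fl.l_pid, fl.l_sysid);
		else
#endif
			printf("blocked by local pid %d\n", (int)fl.l_pid);
	}
	close(fd);
	return (0);
}

Built against unpatched headers the #ifdef branch drops out and the program simply reports the owning pid, matching the pre-patch behaviour.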