Index: lib/libc/gen/Makefile.inc =================================================================== RCS file: /usr/cvs/src/lib/libc/gen/Makefile.inc,v retrieving revision 1.128 diff -u -r1.128 Makefile.inc --- lib/libc/gen/Makefile.inc 28 Sep 2007 02:22:56 -0000 1.128 +++ lib/libc/gen/Makefile.inc 15 Oct 2007 21:20:06 -0000 @@ -21,7 +21,7 @@ initgroups.c isatty.c isinf.c isnan.c jrand48.c lcong48.c \ lockf.c lrand48.c mrand48.c nftw.c nice.c \ nlist.c nrand48.c opendir.c \ - pause.c pmadvise.c popen.c posixshm.c pselect.c \ + pause.c pmadvise.c popen.c pselect.c \ psignal.c pw_scan.c pwcache.c \ raise.c readdir.c readpassphrase.c rewinddir.c \ scandir.c seed48.c seekdir.c sem.c semctl.c \ Index: lib/libc/gen/shm_open.3 =================================================================== RCS file: /usr/cvs/src/lib/libc/gen/shm_open.3,v retrieving revision 1.12 diff -u -r1.12 shm_open.3 --- lib/libc/gen/shm_open.3 20 Jan 2005 09:17:02 -0000 1.12 +++ lib/libc/gen/shm_open.3 1 May 2007 20:09:04 -0000 @@ -28,8 +28,8 @@ .\" .\" $FreeBSD: src/lib/libc/gen/shm_open.3,v 1.12 2005/01/20 09:17:02 ru Exp $ .\" -.Dd March 24, 2000 -.Dt SHM_OPEN 3 +.Dd March 20, 2007 +.Dt SHM_OPEN 2 .Os .Sh NAME .Nm shm_open , shm_unlink @@ -46,62 +46,104 @@ .Sh DESCRIPTION The .Fn shm_open -function opens (or optionally creates) a +system call opens (or optionally creates) a .Tn POSIX shared memory object named .Fa path . The -.Fn shm_unlink -function removes a shared memory object named -.Fa path . -.Pp -In the -.Fx -implementation, -.Tn POSIX -shared memory objects are implemented as ordinary files. -The -.Fn shm_open -and -.Fn shm_unlink -act as wrappers around the -.Xr open 2 -and -.Xr unlink 2 -routines, and -.Fa path , -.Fa flags , +.Fa flags +argument contains a subset of the flags used by +.Xr open 2 . +An access mode of either +.Dv O_RDONLY +or +.Dv O_RDWR +must be included in +.Fa flags . +The optional flags +.Dv O_CREAT , +.Dv O_EXCL , and +.Dv O_TRUNC +may also be specified. 
+.Pp +If +.Dv O_CREAT +is specified, +then a new shared memory object named +.Fa path +will be created if it does not exist. +In this case, +the shared memory object is created with mode .Fa mode -arguments are as specified for those functions. -The -.Fa flags -argument is checked to ensure that the access mode specified is not -.Dv O_WRONLY -(which is not defined for shared memory objects). -.Pp -In addition, the -.Fx -implementation causes -.Fn mmap -of a descriptor returned by +subject to the process' umask value. +If both the +.Dv O_CREAT +and +.Dv O_EXCL +flags are specified and a shared memory object named +.Fa path +already exists, +then .Fn shm_open -to behave as if the -.Dv MAP_NOSYNC -flag had been specified to -.Xr mmap 2 . -(It does so by setting a special file flag using -.Xr fcntl 2 . ) +will fail with +.Er EEXIST . +.Pp +Newly created objects start off with a size of zero. +If an existing shared memory object is opened with +.Dv O_RDWR +and the +.Dv O_TRUNC +flag is specified, +then the shared memory object will be truncated to a size of zero. +The size of the object can be adjusted via +.Xr ftruncate 2 +and queried via +.Xr fstat 2 . +.Pp +The new descriptor is set to close during +.Xr execve 2 +system calls; +see +.Xr close 2 +and +.Xr fcntl 2 . +.Pp +As a FreeBSD extension, +the constant +.Dv SHM_ANON +may be used for the +.Fa path +argument to +.Fn shm_open . +In this case, an anonymous, unnamed shared memory object is created. +Since the object has no name, +it cannot be removed via a subsequent call to +.Fn shm_unlink . +Instead, +the shared memory object will be garbage collected when the last reference to +the shared memory object is removed. +The shared memory object may be shared with other processes by sharing the +file descriptor via +.Xr fork 2 +or +.Xr sendmsg 2 . +Attempting to open an anonymous shared memory object with +.Dv O_RDONLY +will fail with +.Er EINVAL . +All other flags are ignored. 
.Pp The .Fn shm_unlink -function makes no effort to ensure that -.Fa path -refers to a shared memory object. +system call removes a shared memory object named +.Fa path . +.Pp .Sh RETURN VALUES If successful, .Fn shm_open -returns a non-negative integer; +returns a non-negative integer, +and .Fn shm_unlink returns zero. Both functions return -1 on failure, and set @@ -110,8 +152,8 @@ .Sh COMPATIBILITY The .Fa path -argument does not necessarily represent a pathname (although it does in this -and most other implementations). +argument does not necessarily represent a pathname (although it does in +most other implementations). Two processes opening the same .Fa path are guaranteed to access the same shared memory object if and only if @@ -139,37 +181,82 @@ is undefined. It is also undefined whether the shared memory object itself, or its contents, persist across reboots. -.Sh ERRORS -The -.Fn shm_open -and -.Fn shm_unlink -functions can fail with any error defined for -.Fn open +.Pp +In FreeBSD, +.Xr read 2 and -.Fn unlink , -respectively. -In addition, the following errors are defined for +.Xr write 2 +on a shared memory object will fail with +.Er EOPNOTSUPP +and neither shared memory objects nor their contents persist across reboots. +.Sh ERRORS +The following errors are defined for .Fn shm_open : .Bl -tag -width Er .It Bq Er EINVAL -The object named by +A flag other than +.Dv O_RDONLY , +.Dv O_RDWR , +.Dv O_CREAT , +.Dv O_EXCL , +or +.Dv O_TRUNC +was included in +.Fa flags . +.It Bq Er EMFILE +The process has already reached its limit for open file descriptors. +.It Bq Er ENFILE +The system file table is full. +.It Bq Er EINVAL +.Dv O_RDONLY +was specified while creating an anonymous shared memory object via +.Dv SHM_ANON . +.It Bq Er EFAULT +The .Fa path -is not a shared memory object -(i.e., it is not a regular file). +argument points outside the process' allocated address space. +.It Bq Er ENAMETOOLONG +The entire pathname exceeded 1023 characters. 
.It Bq Er EINVAL The -.Fa flags -argument to -.Fn shm_open -specifies an access mode of -.Dv O_WRONLY . +.Fa path +does not begin with a slash +.Pq Ql \&/ +character. +.It Bq Er ENOENT +.Dv O_CREAT +is not specified and the named shared memory object does not exist. +.It Bq Er EEXIST +.Dv O_CREAT +and +.Dv O_EXCL +are specified and the named shared memory object does exist. +.It Bq Er EACCES +The required permissions (for reading or reading and writing) are denied. +.El +.Pp +The following errors are defined for +.Fn shm_unlink : +.Bl -tag -width Er +.It Bq Er EFAULT +The +.Fa path +argument points outside the process' allocated address space. +.It Bq Er ENAMETOOLONG +The entire pathname exceeded 1023 characters. +.It Bq Er ENOENT +The named shared memory object does not exist. +.It Bq Er EACCES +The required permissions are denied. +.Fn shm_unlink +requires write permission to the shared memory object. .El .Sh SEE ALSO +.Xr close 2 , +.Xr ftruncate 2 , +.Xr fstat 2 , .Xr mmap 2 , -.Xr munmap 2 , -.Xr open 2 , -.Xr unlink 2 +.Xr munmap 2 .Sh STANDARDS The .Fn shm_open @@ -184,6 +271,9 @@ .Fn shm_unlink functions first appeared in .Fx 4.3 . +The functions were reimplemented as system calls using shared memory objects +directly rather than files in +.Fx 7.0 . .Sh AUTHORS .An Garrett A. 
Wollman Aq wollman@FreeBSD.org (C library support and this manual page) Index: sys/compat/freebsd32/syscalls.master =================================================================== RCS file: /usr/cvs/src/sys/compat/freebsd32/syscalls.master,v retrieving revision 1.91 diff -u -r1.91 syscalls.master --- sys/compat/freebsd32/syscalls.master 16 Aug 2007 05:30:04 -0000 1.91 +++ sys/compat/freebsd32/syscalls.master 15 Oct 2007 21:24:16 -0000 @@ -795,3 +795,6 @@ 480 AUE_FTRUNCATE STD { int freebsd32_ftruncate(int fd, \ u_int32_t lengthlo, u_int32_t lengthhi); } 481 AUE_KILL NOPROTO { int thr_kill2(pid_t pid, long id, int sig); } +482 AUE_NULL NOPROTO { int shm_open(const char *path, int flags, \ + mode_t mode); } +483 AUE_NULL NOPROTO { int shm_unlink(const char *path); } Index: sys/conf/files =================================================================== RCS file: /usr/cvs/src/sys/conf/files,v retrieving revision 1.1245 diff -u -r1.1245 files --- sys/conf/files 14 Oct 2007 10:55:49 -0000 1.1245 +++ sys/conf/files 15 Oct 2007 21:20:37 -0000 @@ -1534,6 +1534,7 @@ kern/uipc_mbuf2.c standard kern/uipc_mqueue.c optional p1003_1b_mqueue kern/uipc_sem.c optional p1003_1b_semaphores +kern/uipc_shm.c standard kern/uipc_sockbuf.c standard kern/uipc_socket.c standard kern/uipc_syscalls.c standard Index: sys/kern/kern_descrip.c =================================================================== RCS file: /usr/cvs/src/sys/kern/kern_descrip.c,v retrieving revision 1.313 diff -u -r1.313 kern_descrip.c --- sys/kern/kern_descrip.c 6 Aug 2007 14:26:00 -0000 1.313 +++ sys/kern/kern_descrip.c 15 Oct 2007 21:21:53 -0000 @@ -2514,6 +2514,8 @@ return ("crpt"); case DTYPE_MQUEUE: return ("mque"); + case DTYPE_SHM: + return ("shm"); default: return ("unkn"); } Index: sys/kern/sys_generic.c =================================================================== RCS file: /usr/cvs/src/sys/kern/sys_generic.c,v retrieving revision 1.158 diff -u -r1.158 sys_generic.c --- sys/kern/sys_generic.c 4 
Jul 2007 22:57:21 -0000 1.158 +++ sys/kern/sys_generic.c 15 Oct 2007 21:21:54 -0000 @@ -69,6 +69,8 @@ #include #endif +#include + static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); @@ -502,6 +504,60 @@ return (error); } +/* + * Truncate a file given a file descriptor. + * + * Can't use fget_write() here, since must return EINVAL and not EBADF if the + * descriptor isn't writable. + */ +int +kern_ftruncate(td, fd, length) + struct thread *td; + int fd; + off_t length; +{ + struct file *fp; + int error; + + AUDIT_ARG(fd, fd); + if (length < 0) + return (EINVAL); + error = fget(td, fd, &fp); + if (error) + return (error); + AUDIT_ARG(file, td->td_proc, fp); + if (!(fp->f_flag & FWRITE)) { + fdrop(fp, td); + return (EINVAL); + } + if (!(fp->f_ops->fo_flags & DFLAG_TRUNCATABLE)) + error = EINVAL; + else + error = fo_truncate(fp, length, td->td_ucred, td); + fdrop(fp, td); + return (error); +} + +int +ftruncate(td, uap) + struct thread *td; + struct ftruncate_args *uap; +{ + + return (kern_ftruncate(td, uap->fd, uap->length)); +} + +#if defined(COMPAT_43) +int +oftruncate(td, uap) + struct thread *td; + struct oftruncate_args *uap; +{ + + return (kern_ftruncate(td, uap->fd, uap->length)); +} +#endif /* COMPAT_43 */ + #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; Index: sys/kern/syscalls.master =================================================================== RCS file: /usr/cvs/src/sys/kern/syscalls.master,v retrieving revision 1.233 diff -u -r1.233 syscalls.master --- sys/kern/syscalls.master 16 Aug 2007 05:26:41 -0000 1.233 +++ sys/kern/syscalls.master 15 Oct 2007 21:24:55 -0000 @@ -847,5 +847,8 @@ 479 AUE_TRUNCATE STD { int truncate(char *path, off_t length); } 480 AUE_FTRUNCATE STD { int ftruncate(int fd, off_t length); } 481 AUE_KILL STD { int thr_kill2(pid_t pid, long id, int sig); } +482 AUE_NULL STD { int shm_open(const char *path, int 
flags, \ + mode_t mode); } +483 AUE_NULL STD { int shm_unlink(const char *path); } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/uipc_shm.c =================================================================== RCS file: sys/kern/uipc_shm.c diff -N sys/kern/uipc_shm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/kern/uipc_shm.c 1 May 2007 20:09:25 -0000 @@ -0,0 +1,578 @@ +/*- + * Copyright (c) 2006 Robert N. M. Watson + * Copyright (c) 2007 John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Support for shared swap-backed anonymous memory objects via + * shm_open(2) and shm_unlink(2). 
While most of the implementation is + * here, vm_mmap.c contains mapping logic changes. + * + * TODO: + * + * (3) Resource limits? Does this need its own resource limits or are the + * existing limits in mmap(2) sufficient? + * + * (4) Partial page truncation. vnode_pager_setsize() will zero any parts + * of a partially mapped page as a result of ftruncate(2)/truncate(2). + * We can do the same (with the same pmap evil), but do we need to + * worry about the bits on disk if the page is swapped out or will the + * swapper zero the parts of a page that are invalid if the page is + * swapped back in for us? + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +struct shmfd { + size_t shm_size; + vm_object_t shm_object; + int shm_refs; + uid_t shm_uid; + gid_t shm_gid; + mode_t shm_mode; + + /* + * Values maintained solely to make this a better-behaved file + * descriptor for fstat() to run on. 
+ */ + struct timespec shm_atime; + struct timespec shm_mtime; + struct timespec shm_ctime; + struct timespec shm_birthtime; +}; + +struct shm_mapping { + char *sm_path; + Fnv32_t sm_fnv; + struct shmfd *sm_shmfd; + LIST_ENTRY(shm_mapping) sm_link; +}; + +static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); +static LIST_HEAD(, shm_mapping) *shm_dictionary; +static struct sx shm_dict_lock; +static struct mtx shm_timestamp_lock; +static u_long shm_hash; + +#define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) + +static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); +static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); +static void shm_dict_init(void *arg); +static void shm_drop(struct shmfd *shmfd); +static struct shmfd *shm_hold(struct shmfd *shmfd); +static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); +static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); +static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); +static void shm_dotruncate(struct shmfd *shmfd, off_t length); + +static fo_rdwr_t shm_read; +static fo_rdwr_t shm_write; +static fo_ioctl_t shm_ioctl; +static fo_poll_t shm_poll; +static fo_kqfilter_t shm_kqfilter; +static fo_stat_t shm_stat; +static fo_close_t shm_close; +static fo_truncate_t shm_truncate; + +/* File descriptor operations. 
*/ +static struct fileops shm_ops = { + .fo_read = shm_read, + .fo_write = shm_write, + .fo_ioctl = shm_ioctl, + .fo_poll = shm_poll, + .fo_kqfilter = shm_kqfilter, + .fo_stat = shm_stat, + .fo_close = shm_close, + .fo_truncate = shm_truncate, + .fo_flags = DFLAG_PASSABLE | DFLAG_TRUNCATABLE +}; + +static int +shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + return (EOPNOTSUPP); +} + +static int +shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + return (EOPNOTSUPP); +} + +static int +shm_ioctl(struct file *fp, u_long com, void *data, + struct ucred *active_cred, struct thread *td) +{ + + return (EOPNOTSUPP); +} + +static int +shm_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + + return (EOPNOTSUPP); +} + +static int +shm_kqfilter(struct file *fp, struct knote *kn) +{ + + return (EOPNOTSUPP); +} + +static int +shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, + struct thread *td) +{ + struct shmfd *shmfd; + + shmfd = fp->f_data; + + /* + * Attempt to return sanish values for fstat() on a memory file + * descriptor. 
+ */ + bzero(sb, sizeof(*sb)); + sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ + sb->st_blksize = PAGE_SIZE; + sb->st_size = shmfd->shm_size; + sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize; + sb->st_atimespec = shmfd->shm_atime; + sb->st_ctimespec = shmfd->shm_ctime; + sb->st_mtimespec = shmfd->shm_mtime; + sb->st_birthtimespec = shmfd->shm_birthtime; + sb->st_uid = shmfd->shm_uid; + sb->st_gid = shmfd->shm_gid; + + return (0); +} + +static int +shm_close(struct file *fp, struct thread *td) +{ + struct shmfd *shmfd; + + shmfd = fp->f_data; + fp->f_data = NULL; + shm_drop(shmfd); + + return (0); +} + +static void +shm_dotruncate(struct shmfd *shmfd, off_t length) +{ + vm_object_t object; + vm_page_t m; + vm_pindex_t nobjsize; + + object = shmfd->shm_object; + VM_OBJECT_LOCK(object); + if (length == shmfd->shm_size) { + VM_OBJECT_UNLOCK(object); + return; + } + nobjsize = OFF_TO_IDX(length + PAGE_MASK); + + /* Are we shrinking? If so, trim the end. */ + if (length < shmfd->shm_size) { + /* Toss in memory pages. */ + if (nobjsize < object->size) + vm_object_page_remove(object, nobjsize, object->size, + FALSE); + + /* Toss pages from swap. */ + if (object->type == OBJT_SWAP) + swap_pager_freespace(object, nobjsize, + object->size - nobjsize); + + /* + * If the last page is partially mapped, then zero out + * the garbage at the end of the page. See comments + * in vnode_pager_setsize() for more details. + * + * XXXJHB: This handles in memory pages, but what about + * a page swapped out to disk? 
+ */ + if ((length & PAGE_MASK) && + (m = vm_page_lookup(object, OFF_TO_IDX(length))) != NULL && + m->valid != 0) { + int base = (int)length & PAGE_MASK; + int size = PAGE_SIZE - base; + + pmap_zero_page_area(m, base, size); + vm_page_lock_queues(); + pmap_remove_all(m); + vm_page_set_validclean(m, base, size); + if (m->dirty != 0) + m->dirty = VM_PAGE_BITS_ALL; + vm_page_unlock_queues(); + } + } + shmfd->shm_size = length; + mtx_lock(&shm_timestamp_lock); + vfs_timestamp(&shmfd->shm_ctime); + shmfd->shm_mtime = shmfd->shm_ctime; + mtx_unlock(&shm_timestamp_lock); + object->size = nobjsize; + VM_OBJECT_UNLOCK(object); +} + +static int +shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + struct shmfd *shmfd; + + shmfd = fp->f_data; + shm_dotruncate(shmfd, length); + return (0); +} + +/* + * shmfd object management including creation and reference counting + * routines. + */ +static struct shmfd * +shm_alloc(struct ucred *ucred, mode_t mode) +{ + struct shmfd *shmfd; + + shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); + shmfd->shm_size = 0; + shmfd->shm_uid = ucred->cr_uid; + shmfd->shm_gid = ucred->cr_gid; + shmfd->shm_mode = mode; + shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, + shmfd->shm_size, VM_PROT_DEFAULT, 0); + KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); + vfs_timestamp(&shmfd->shm_birthtime); + shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = + shmfd->shm_birthtime; + refcount_init(&shmfd->shm_refs, 1); + + return (shmfd); +} + +static struct shmfd * +shm_hold(struct shmfd *shmfd) +{ + + refcount_acquire(&shmfd->shm_refs); + return (shmfd); +} + +static void +shm_drop(struct shmfd *shmfd) +{ + + if (refcount_release(&shmfd->shm_refs)) { + vm_object_deallocate(shmfd->shm_object); + free(shmfd, M_SHMFD); + } +} + +/* + * Determine if the credentials have sufficient permissions for a + * specified combination of FREAD and FWRITE. 
+ */ +static int +shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) +{ + int acc_mode; + + acc_mode = 0; + if (flags & FREAD) + acc_mode |= VREAD; + if (flags & FWRITE) + acc_mode |= VWRITE; + return (vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, + acc_mode, ucred, NULL)); +} + +/* + * Dictionary management. We maintain an in-kernel dictionary to map + * paths to shmfd objects. We use the FNV hash on the path to store + * the mappings in a hash table. + */ +static void +shm_dict_init(void *arg) +{ + + mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); + sx_init(&shm_dict_lock, "shm dictionary"); + shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); +} +SYSINIT(shm_dict_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_dict_init, NULL); + +static struct shmfd * +shm_lookup(char *path, Fnv32_t fnv) +{ + struct shm_mapping *map; + + LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { + if (map->sm_fnv != fnv) + continue; + if (strcmp(map->sm_path, path) == 0) + return (map->sm_shmfd); + } + + return (NULL); +} + +static void +shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) +{ + struct shm_mapping *map; + + map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); + map->sm_path = path; + map->sm_fnv = fnv; + map->sm_shmfd = shm_hold(shmfd); + LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); +} + +static int +shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) +{ + struct shm_mapping *map; + int error; + + LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { + if (map->sm_fnv != fnv) + continue; + if (strcmp(map->sm_path, path) == 0) { + error = shm_access(map->sm_shmfd, ucred, + FREAD | FWRITE); + if (error) + return (error); + LIST_REMOVE(map, sm_link); + shm_drop(map->sm_shmfd); + free(map->sm_path, M_SHMFD); + free(map, M_SHMFD); + return (0); + } + } + + return (ENOENT); +} + +/* System calls. 
*/ +int +shm_open(struct thread *td, struct shm_open_args *uap) +{ + struct filedesc *fdp; + struct shmfd *shmfd; + struct file *fp; + char *path; + Fnv32_t fnv; + mode_t cmode; + int fd, error; + + if ((uap->flags & O_ACCMODE) != O_RDONLY && + (uap->flags & O_ACCMODE) != O_RDWR) + return (EINVAL); + + if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC)) != 0) + return (EINVAL); + + fdp = td->td_proc->p_fd; + cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS; + + error = falloc(td, &fp, &fd); + if (error) + return (error); + + /* A SHM_ANON path pointer creates an anonymous object. */ + if (uap->path == SHM_ANON) { + /* A read-only anonymous object is pointless. */ + if ((uap->flags & O_ACCMODE) == O_RDONLY) { + fdclose(fdp, fp, fd, td); + fdrop(fp, td); + return (EINVAL); + } + shmfd = shm_alloc(td->td_ucred, cmode); + } else { + path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); + error = copyinstr(uap->path, path, MAXPATHLEN, NULL); + + /* Require paths to start with a '/' character. */ + if (error == 0 && path[0] != '/') + error = EINVAL; + if (error) { + fdclose(fdp, fp, fd, td); + fdrop(fp, td); + free(path, M_SHMFD); + return (error); + } + + fnv = fnv_32_str(path, FNV1_32_INIT); + sx_xlock(&shm_dict_lock); + shmfd = shm_lookup(path, fnv); + if (shmfd == NULL) { + /* Object does not yet exist, create it if requested. */ + if (uap->flags & O_CREAT) { + shmfd = shm_alloc(td->td_ucred, cmode); + shm_insert(path, fnv, shmfd); + } else { + free(path, M_SHMFD); + error = ENOENT; + } + } else { + /* + * Object already exists, obtain a new + * reference if requested and permitted. + */ + free(path, M_SHMFD); + if ((uap->flags & (O_CREAT | O_EXCL)) == + (O_CREAT | O_EXCL)) + error = EEXIST; + else + error = shm_access(shmfd, td->td_ucred, + FFLAGS(uap->flags & O_ACCMODE)); + + if (error == 0) { + shm_hold(shmfd); + + /* + * Truncate the file back to zero + * length if O_TRUNC was specified and + * the object was opened with + * read/write. 
+ */ + if ((uap->flags & (O_ACCMODE | O_TRUNC)) == + (O_RDWR | O_TRUNC)) + shm_dotruncate(shmfd, 0); + } + } + sx_xunlock(&shm_dict_lock); + + if (error) { + fdclose(fdp, fp, fd, td); + fdrop(fp, td); + return (error); + } + } + + FILE_LOCK(fp); + fp->f_flag = FFLAGS(uap->flags & O_ACCMODE); + fp->f_type = DTYPE_SHM; + fp->f_data = shmfd; + fp->f_ops = &shm_ops; + FILE_UNLOCK(fp); + + FILEDESC_LOCK_FAST(fdp); + if (fdp->fd_ofiles[fd] == fp) + fdp->fd_ofileflags[fd] |= UF_EXCLOSE; + FILEDESC_UNLOCK_FAST(fdp); + td->td_retval[0] = fd; + fdrop(fp, td); + + return (0); +} + +int +shm_unlink(struct thread *td, struct shm_unlink_args *uap) +{ + char *path; + Fnv32_t fnv; + int error; + + path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + error = copyinstr(uap->path, path, MAXPATHLEN, NULL); + if (error) { + free(path, M_TEMP); + return (error); + } + + fnv = fnv_32_str(path, FNV1_32_INIT); + sx_xlock(&shm_dict_lock); + error = shm_remove(path, fnv, td->td_ucred); + sx_xunlock(&shm_dict_lock); + free(path, M_TEMP); + + return (error); +} + +/* + * mmap() helper to validate mmap() requests against shm object state + * and give mmap() the vm_object to use for the mapping. + */ +int +shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff, + vm_object_t *obj) +{ + + /* + * XXXRW: This validation is probably insufficient, and subject to + * sign errors. It should be fixed. 
+ */ + if (foff >= shmfd->shm_size || foff + objsize > shmfd->shm_size) + return (EINVAL); + + mtx_lock(&shm_timestamp_lock); + vfs_timestamp(&shmfd->shm_atime); + mtx_unlock(&shm_timestamp_lock); + vm_object_reference(shmfd->shm_object); + *obj = shmfd->shm_object; + return (0); +} Index: sys/kern/vfs_syscalls.c =================================================================== RCS file: /usr/cvs/src/sys/kern/vfs_syscalls.c,v retrieving revision 1.443 diff -u -r1.443 vfs_syscalls.c --- sys/kern/vfs_syscalls.c 10 Sep 2007 00:00:16 -0000 1.443 +++ sys/kern/vfs_syscalls.c 15 Oct 2007 21:21:55 -0000 @@ -3084,68 +3084,6 @@ return (error); } -/* - * Truncate a file given a file descriptor. - */ -#ifndef _SYS_SYSPROTO_H_ -struct ftruncate_args { - int fd; - int pad; - off_t length; -}; -#endif -int -ftruncate(td, uap) - struct thread *td; - register struct ftruncate_args /* { - int fd; - int pad; - off_t length; - } */ *uap; -{ - struct mount *mp; - struct vattr vattr; - struct vnode *vp; - struct file *fp; - int vfslocked; - int error; - - AUDIT_ARG(fd, uap->fd); - if (uap->length < 0) - return(EINVAL); - if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) - return (error); - if ((fp->f_flag & FWRITE) == 0) { - fdrop(fp, td); - return (EINVAL); - } - vp = fp->f_vnode; - vfslocked = VFS_LOCK_GIANT(vp->v_mount); - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - goto drop; - VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); - AUDIT_ARG(vnode, vp, ARG_VNODE1); - if (vp->v_type == VDIR) - error = EISDIR; -#ifdef MAC - else if ((error = mac_check_vnode_write(td->td_ucred, fp->f_cred, - vp))) { - } -#endif - else if ((error = vn_writechk(vp)) == 0) { - VATTR_NULL(&vattr); - vattr.va_size = uap->length; - error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); - } - VOP_UNLOCK(vp, 0, td); - vn_finished_write(mp); -drop: - VFS_UNLOCK_GIANT(vfslocked); - fdrop(fp, td); - return (error); -} - #if defined(COMPAT_43) /* * 
Truncate a file given its path name. @@ -3174,34 +3112,6 @@ nuap.length = uap->length; return (truncate(td, &nuap)); } - -/* - * Truncate a file given a file descriptor. - */ -#ifndef _SYS_SYSPROTO_H_ -struct oftruncate_args { - int fd; - long length; -}; -#endif -int -oftruncate(td, uap) - struct thread *td; - register struct oftruncate_args /* { - int fd; - long length; - } */ *uap; -{ - struct ftruncate_args /* { - int fd; - int pad; - off_t length; - } */ nuap; - - nuap.fd = uap->fd; - nuap.length = uap->length; - return (ftruncate(td, &nuap)); -} #endif /* COMPAT_43 */ /* Versions with the pad argument */ Index: sys/kern/vfs_vnops.c =================================================================== RCS file: /usr/cvs/src/sys/kern/vfs_vnops.c,v retrieving revision 1.252 diff -u -r1.252 vfs_vnops.c --- sys/kern/vfs_vnops.c 26 Jul 2007 16:58:09 -0000 1.252 +++ sys/kern/vfs_vnops.c 15 Oct 2007 21:21:55 -0000 @@ -71,6 +71,7 @@ static fo_kqfilter_t vn_kqfilter; static fo_stat_t vn_statfile; static fo_close_t vn_closefile; +static fo_truncate_t vn_truncate; struct fileops vnops = { .fo_read = vn_read, @@ -80,7 +81,8 @@ .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, - .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE + .fo_truncate = vn_truncate, + .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE | DFLAG_TRUNCATABLE }; int @@ -871,6 +873,53 @@ } /* + * File table truncate routine. 
+ */ +static int +vn_truncate(fp, length, active_cred, td) + struct file *fp; + off_t length; + struct ucred *active_cred; + struct thread *td; +{ + struct vattr vattr; + struct mount *mp; + struct vnode *vp; + int vfslocked; + int error; + + vp = fp->f_vnode; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) { + VFS_UNLOCK_GIANT(vfslocked); + return (error); + } + VOP_LEASE(vp, td, active_cred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type == VDIR) { + error = EISDIR; + goto out; + } +#ifdef MAC + error = mac_check_vnode_write(active_cred, fp->f_cred, vp); + if (error) + goto out; +#endif + error = vn_writechk(vp); + if (error == 0) { + VATTR_NULL(&vattr); + vattr.va_size = length; + error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); + } +out: + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + VFS_UNLOCK_GIANT(vfslocked); + return (error); +} + +/* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the Index: sys/sys/fcntl.h =================================================================== RCS file: /usr/cvs/src/sys/sys/fcntl.h,v retrieving revision 1.16 diff -u -r1.16 fcntl.h --- sys/sys/fcntl.h 7 Apr 2004 04:19:49 -0000 1.16 +++ sys/sys/fcntl.h 1 May 2007 20:09:04 -0000 @@ -126,8 +126,20 @@ /* bits to save after open */ #define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT) /* bits settable by fcntl(F_SETFL, ...) */ +#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT) + +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) +/* + * Set by shm_open(3) in older libc's to get automatic MAP_ASYNC + * behavior for POSIX shared memory objects (which are otherwise + * implemented as plain files). 
+ */ +#define FPOSIXSHM O_NOFOLLOW +#undef FCNTLFLAGS #define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT) #endif +#endif /* * The O_* flags used to have only F* names, which were used in the kernel @@ -150,13 +162,6 @@ * different meaning for fcntl(2). */ #if __BSD_VISIBLE - -/* - * Set by shm_open(3) to get automatic MAP_ASYNC behavior - * for POSIX shared memory objects (which are otherwise - * implemented as plain files). - */ -#define FPOSIXSHM O_NOFOLLOW #endif /* Index: sys/sys/file.h =================================================================== RCS file: /usr/cvs/src/sys/sys/file.h,v retrieving revision 1.73 diff -u -r1.73 file.h --- sys/sys/file.h 5 Jan 2007 19:59:46 -0000 1.73 +++ sys/sys/file.h 1 May 2007 20:09:04 -0000 @@ -59,6 +59,7 @@ #define DTYPE_KQUEUE 5 /* event queue */ #define DTYPE_CRYPTO 6 /* crypto */ #define DTYPE_MQUEUE 7 /* posix message queue */ +#define DTYPE_SHM 8 /* swap-backed shared memory */ #ifdef _KERNEL @@ -77,6 +78,8 @@ typedef int fo_stat_t(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td); typedef int fo_close_t(struct file *fp, struct thread *td); +typedef int fo_truncate_t(struct file *fp, off_t length, + struct ucred *active_cred, struct thread *td); typedef int fo_flags_t; struct fileops { @@ -87,11 +90,13 @@ fo_kqfilter_t *fo_kqfilter; fo_stat_t *fo_stat; fo_close_t *fo_close; + fo_truncate_t *fo_truncate; fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ +#define DFLAG_TRUNCATABLE 0x04 /* truncate supported */ /* * Kernel descriptor table. 
@@ -229,6 +234,7 @@ static __inline fo_kqfilter_t fo_kqfilter; static __inline fo_stat_t fo_stat; static __inline fo_close_t fo_close; +static __inline fo_truncate_t fo_truncate; static __inline int fo_read(fp, uio, active_cred, flags, td) @@ -306,6 +312,17 @@ return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } +static __inline int +fo_truncate(fp, length, active_cred, td) + struct file *fp; + off_t length; + struct ucred *active_cred; + struct thread *td; +{ + + return ((*fp->f_ops->fo_truncate)(fp, length, active_cred, td)); +} + #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ Index: sys/sys/mman.h =================================================================== RCS file: /usr/cvs/src/sys/sys/mman.h,v retrieving revision 1.40 diff -u -r1.40 mman.h --- sys/sys/mman.h 2 Apr 2005 12:33:31 -0000 1.40 +++ sys/sys/mman.h 1 May 2007 20:09:04 -0000 @@ -139,6 +139,11 @@ #define MINCORE_MODIFIED 0x4 /* Page has been modified by us */ #define MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */ #define MINCORE_MODIFIED_OTHER 0x10 /* Page has been modified */ + +/* + * Anonymous object constant for shm_open(). 
+ */ +#define SHM_ANON ((char *)1) #endif /* __BSD_VISIBLE */ /* @@ -168,7 +173,15 @@ #define _SIZE_T_DECLARED #endif -#ifndef _KERNEL +#ifdef _KERNEL +#include <vm/vm.h> + +struct shmfd; /* forward declaration; only a pointer to it is needed here */ + +int shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff, + vm_object_t *obj); /* called by vm_mmap_shm() in vm_mmap.c */ + +#else /* !_KERNEL */ __BEGIN_DECLS /* Index: sys/sys/syscallsubr.h =================================================================== RCS file: /usr/cvs/src/sys/sys/syscallsubr.h,v retrieving revision 1.46 diff -u -r1.46 syscallsubr.h --- sys/sys/syscallsubr.h 7 Jun 2007 19:45:19 -0000 1.46 +++ sys/sys/syscallsubr.h 15 Oct 2007 21:22:44 -0000 @@ -82,6 +82,7 @@ int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf); int kern_fstat(struct thread *td, int fd, struct stat *sbp); int kern_fstatfs(struct thread *td, int fd, struct statfs *buf); +int kern_ftruncate(struct thread *td, int fd, off_t length); int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg); int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, Index: sys/vm/vm_mmap.c =================================================================== RCS file: /usr/cvs/src/sys/vm/vm_mmap.c,v retrieving revision 1.213 diff -u -r1.213 vm_mmap.c --- sys/vm/vm_mmap.c 20 Aug 2007 12:05:45 -0000 1.213 +++ sys/vm/vm_mmap.c 15 Oct 2007 21:22:46 -0000 @@ -118,6 +118,8 @@ int *, struct vnode *, vm_ooffset_t, vm_object_t *); static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, struct cdev *, vm_ooffset_t, vm_object_t *); +static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, + int *, struct shmfd *, vm_ooffset_t, vm_object_t *); /* * MPSAFE @@ -300,16 +302,29 @@ pos = 0; } else { /* - * Mapping file, get fp for validation. Obtain vnode and make - * sure it is of appropriate type. - * don't let the descriptor disappear on us if we block + * Mapping file, get fp for validation and + * don't let the descriptor disappear on us if we block.
*/ if ((error = fget(td, uap->fd, &fp)) != 0) goto done; + if (fp->f_type == DTYPE_SHM) { + handle = fp->f_data; + handle_type = OBJT_SWAP; + maxprot = VM_PROT_NONE; + + /* FREAD should always be set. */ + if (fp->f_flag & FREAD) + maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; + if (fp->f_flag & FWRITE) + maxprot |= VM_PROT_WRITE; + goto map; + } if (fp->f_type != DTYPE_VNODE) { error = ENODEV; goto done; } +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support @@ -320,6 +335,7 @@ */ if (fp->f_flag & FPOSIXSHM) flags |= MAP_NOSYNC; +#endif vp = fp->f_vnode; /* * Ensure that file and memory protections are @@ -360,6 +376,7 @@ handle = (void *)vp; handle_type = OBJT_VNODE; } +map: /* * Do not allow more then a certain number of vm_map_entry structures @@ -1291,6 +1308,38 @@ } /* + * vm_mmap_shm() + * + * MPSAFE + * + * Helper function for vm_mmap. Perform sanity check specific for mmap + * operations on shm file descriptors. + */ +int +vm_mmap_shm(struct thread *td, vm_size_t objsize, + vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, + struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp) +{ + int error, flags; + + flags = *flagsp; + + if ((*maxprotp & VM_PROT_WRITE) == 0 && + (prot & PROT_WRITE) != 0) + return (EACCES); +#ifdef MAC_XXX + error = mac_check_shm_mmap(td->td_ucred, shmfd, prot); + if (error != 0) + return (error); +#endif + error = shm_mmap(shmfd, objsize, foff, objp); + if (error) + return (error); + *flagsp = flags; + return (0); +} + +/* * vm_mmap() * * MPSAFE @@ -1354,6 +1403,10 @@ error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, handle, foff, &object); break; + case OBJT_SWAP: + error = vm_mmap_shm(td, size, prot, &maxprot, &flags, + handle, foff, &object); + break; case OBJT_DEFAULT: if (handle == NULL) { error = 0;