Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map (revision 226848) +++ lib/libc/sys/Symbol.map (working copy) @@ -378,6 +378,10 @@ setloginclass; }; +FBSD_1.3 { + posix_fadvise; +}; + FBSDprivate_1.0 { ___acl_aclcheck_fd; __sys___acl_aclcheck_fd; Index: lib/libc/sys/posix_fadvise.2 =================================================================== --- lib/libc/sys/posix_fadvise.2 (revision 0) +++ lib/libc/sys/posix_fadvise.2 (revision 0) @@ -0,0 +1,139 @@ +.\" Copyright (c) 1991, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)madvise.2 8.1 (Berkeley) 6/9/93 +.\" $FreeBSD$ +.\" +.Dd October 26, 2011 +.Dt POSIX_FADVISE 2 +.Os +.Sh NAME +.Nm posix_fadvise +.Nd give advice about use of file data +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In fcntl.h +.Ft int +.Fn posix_fadvise "int fd" "off_t offset" "off_t len" "int advice" +.Sh DESCRIPTION +The +.Fn posix_fadvise +system call +allows a process to describe to the system its data access behavior for an +open file descriptor +.Fa fd . +The advice covers the data starting at offset +.Fa offset +and continuing for +.Fa len +bytes. +If +.Fa len +is zero, +all data from +.Fa offset +to the end of the file is covered. +.Pp +The behavior is specified by the +.Fa advice +parameter and may be one of: +.Bl -tag -width POSIX_FADV_SEQUENTIAL +.It Dv POSIX_FADV_NORMAL +Tells the system to revert to the default data access behavior. +.It Dv POSIX_FADV_RANDOM +Is a hint that file data will be accessed randomly, +and prefetching is likely not advantageous. +.It Dv POSIX_FADV_SEQUENTIAL +Tells the system that file data will be accessed sequentially. +This currently does nothing as the default behavior uses heuristics to +detect sequential behavior. +.It Dv POSIX_FADV_WILLNEED +Tells the system that the specified data will be accessed in the near future. +The system may initiate an asynchronous read of the data if it is not already +present in memory.
+.It Dv POSIX_FADV_DONTNEED +Tells the system that the specified data will not be accessed in the near +future. +The system may decrease the in-memory priority of clean data within the +specified range and future access to this data may require a read operation. +.It Dv POSIX_FADV_NOREUSE +Tells the system that the specified data will only be accessed once and +then not reused. +Accesses to data within the specified range are treated as if the file +descriptor has the +.Dv O_DIRECT +flag enabled. +.El +.Pp +.Sh RETURN VALUES +.Rv -std posix_fadvise +.Sh ERRORS +The +.Fn posix_fadvise +system call will fail if: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa fd +argument is not a valid file descriptor. +.It Bq Er EINVAL +The +.Fa advice +argument is not valid. +.It Bq Er EINVAL +The +.Fa offset +or +.Fa len +arguments are negative, +or +.Fa offset ++ +.Fa len +is greater than the maximum file size. +.It Bq Er ENODEV +The +.Fa fd +argument does not refer to a regular file. +.It Bq Er ESPIPE +The +.Fa fd +argument is associated with a pipe or FIFO. +.El +.Sh SEE ALSO +.Xr madvise 2 +.Sh STANDARDS +The +.Fn posix_fadvise +interface conforms to +.St -p1003.1-2001 . +.Sh HISTORY +The +.Fn posix_fadvise +system call first appeared in +.Fx 10.0 . 
Property changes on: lib/libc/sys/posix_fadvise.2 ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:keywords + FreeBSD=%H Added: svn:eol-style + native Index: lib/libc/sys/Makefile.inc =================================================================== --- lib/libc/sys/Makefile.inc (revision 226848) +++ lib/libc/sys/Makefile.inc (working copy) @@ -96,7 +96,8 @@ mq_setattr.2 \ msgctl.2 msgget.2 msgrcv.2 msgsnd.2 \ msync.2 munmap.2 nanosleep.2 nfssvc.2 ntp_adjtime.2 open.2 \ - pathconf.2 pdfork.2 pipe.2 poll.2 posix_fallocate.2 posix_openpt.2 profil.2 \ + pathconf.2 pdfork.2 pipe.2 poll.2 posix_fadvise.2 posix_fallocate.2 \ + posix_openpt.2 profil.2 \ pselect.2 ptrace.2 quotactl.2 \ read.2 readlink.2 reboot.2 recv.2 rename.2 revoke.2 rfork.2 rmdir.2 \ rtprio.2 Index: lib/libc/sys/madvise.2 =================================================================== --- lib/libc/sys/madvise.2 (revision 226848) +++ lib/libc/sys/madvise.2 (working copy) @@ -169,7 +169,8 @@ .Xr mincore 2 , .Xr mprotect 2 , .Xr msync 2 , -.Xr munmap 2 +.Xr munmap 2 , +.Xr posix_fadvise 2 .Sh STANDARDS The .Fn posix_madvise Index: sys/kern/vfs_vnops.c =================================================================== --- sys/kern/vfs_vnops.c (revision 226848) +++ sys/kern/vfs_vnops.c (working copy) @@ -518,7 +518,7 @@ struct vnode *vp; int error, ioflag; struct mtx *mtxp; - int vfslocked; + int advice, vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); @@ -529,27 +529,48 @@ ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; + advice = POSIX_FADV_NORMAL; vfslocked = VFS_LOCK_GIANT(vp->v_mount); /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. 
*/ - if ((flags & FOF_OFFSET) == 0) { + if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) { mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); - while(fp->f_vnread_flags & FOFFSET_LOCKED) { - fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags, mtxp, PUSER -1, - "vnread offlock", 0); + if ((flags & FOF_OFFSET) == 0) { + while (fp->f_vnread_flags & FOFFSET_LOCKED) { + fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vnread offlock", 0); + } + fp->f_vnread_flags |= FOFFSET_LOCKED; + uio->uio_offset = fp->f_offset; } - fp->f_vnread_flags |= FOFFSET_LOCKED; + if (fp->f_advice != NULL && + uio->uio_offset >= fp->f_advice->fa_start && + uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) + advice = fp->f_advice->fa_advice; mtx_unlock(mtxp); - vn_lock(vp, LK_SHARED | LK_RETRY); - uio->uio_offset = fp->f_offset; - } else - vn_lock(vp, LK_SHARED | LK_RETRY); + } + vn_lock(vp, LK_SHARED | LK_RETRY); - ioflag |= sequential_heuristic(uio, fp); + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_SEQUENTIAL: + ioflag |= sequential_heuristic(uio, fp); + break; + case POSIX_FADV_RANDOM: + /* Do no read-ahead for random I/O. */ + break; + case POSIX_FADV_NOREUSE: + /* + * Request the underlying FS to discard the buffers + * and pages after the I/O is complete. 
+ */ + ioflag |= IO_DIRECT; + break; + } #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); @@ -584,7 +605,8 @@ struct vnode *vp; struct mount *mp; int error, ioflag, lock_flags; - int vfslocked; + struct mtx *mtxp; + int advice, vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); @@ -618,7 +640,33 @@ vn_lock(vp, lock_flags | LK_RETRY); if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; - ioflag |= sequential_heuristic(uio, fp); + advice = POSIX_FADV_NORMAL; + if (fp->f_advice != NULL) { + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + if (fp->f_advice != NULL && + uio->uio_offset >= fp->f_advice->fa_start && + uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) + advice = fp->f_advice->fa_advice; + mtx_unlock(mtxp); + } + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_SEQUENTIAL: + ioflag |= sequential_heuristic(uio, fp); + break; + case POSIX_FADV_RANDOM: + /* XXX: Is this correct? */ + break; + case POSIX_FADV_NOREUSE: + /* + * Request the underlying FS to discard the buffers + * and pages after the I/O is complete. + */ + ioflag |= IO_DIRECT; + break; + } + #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) Index: sys/kern/vfs_syscalls.c =================================================================== --- sys/kern/vfs_syscalls.c (revision 226848) +++ sys/kern/vfs_syscalls.c (working copy) @@ -86,6 +86,8 @@ #include #include +static MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information"); + SDT_PROVIDER_DEFINE(vfs); SDT_PROBE_DEFINE(vfs, , stat, mode, mode); SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *"); @@ -4845,3 +4847,135 @@ return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len)); } + +/* + * Unlike madvise(2), we do not make a best effort to remember every + * possible caching hint.
Instead, we remember the last setting with + * the exception that we will allow POSIX_FADV_NORMAL to adjust the + * region of any current setting. + */ +int +sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) +{ + struct fadvise_info *fa, *new; + struct file *fp; + struct vnode *vp; + off_t end; + int error; + + if (uap->offset < 0 || uap->len < 0 || + uap->offset > OFF_MAX - uap->len) + return (EINVAL); + switch (uap->advice) { + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_NOREUSE: + new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK); + break; + case POSIX_FADV_NORMAL: + case POSIX_FADV_WILLNEED: + case POSIX_FADV_DONTNEED: + new = NULL; + break; + default: + return (EINVAL); + } + /* XXX: CAP_POSIX_FADVISE? */ + error = fget(td, uap->fd, 0, &fp); + if (error != 0) + goto out; + + switch (fp->f_type) { + case DTYPE_VNODE: + break; + case DTYPE_PIPE: + case DTYPE_FIFO: + error = ESPIPE; + goto out; + default: + error = ENODEV; + goto out; + } + vp = fp->f_vnode; + if (vp->v_type != VREG) { + error = ENODEV; + goto out; + } + if (uap->len == 0) + end = OFF_MAX; + else + end = uap->offset + uap->len - 1; + switch (uap->advice) { + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_NOREUSE: + /* + * Try to merge any existing non-standard region with + * this new region if possible, otherwise create a new + * non-standard region for this request.
+ */ + mtx_pool_lock(mtxpool_sleep, fp); + fa = fp->f_advice; + if (fa != NULL && fa->fa_advice == uap->advice && + ((fa->fa_start <= end && fa->fa_end >= uap->offset) || + (end != OFF_MAX && fa->fa_start == end + 1) || + (fa->fa_end != OFF_MAX && fa->fa_end + 1 == uap->offset))) { + if (uap->offset < fa->fa_start) + fa->fa_start = uap->offset; + if (end > fa->fa_end) + fa->fa_end = end; + } else { + new->fa_advice = uap->advice; + new->fa_start = uap->offset; + new->fa_end = end; + fp->f_advice = new; + new = fa; + } + mtx_pool_unlock(mtxpool_sleep, fp); + break; + case POSIX_FADV_NORMAL: + /* + * If the "normal" region overlaps with an existing + * non-standard region, trim or remove the + * non-standard region. + */ + mtx_pool_lock(mtxpool_sleep, fp); + fa = fp->f_advice; + if (fa != NULL) { + if (uap->offset <= fa->fa_start && + end >= fa->fa_end) { + new = fa; + fp->f_advice = NULL; + } else if (uap->offset <= fa->fa_start && + end >= fa->fa_start) + fa->fa_start = end + 1; + else if (uap->offset <= fa->fa_end && + end >= fa->fa_end) + fa->fa_end = uap->offset - 1; + else if (uap->offset >= fa->fa_start && + end <= fa->fa_end) { + /* + * If the "normal" region is a middle + * portion of the existing + * non-standard region, just remove + * the whole thing rather than picking + * one side or the other to + * preserve.
+ */ + new = fa; + fp->f_advice = NULL; + } + } + mtx_pool_unlock(mtxpool_sleep, fp); + break; + case POSIX_FADV_WILLNEED: + case POSIX_FADV_DONTNEED: + error = VOP_ADVISE(vp, uap->offset, end, uap->advice); + break; + } +out: + if (fp != NULL) + fdrop(fp, td); + free(new, M_FADVISE); + return (error); +} Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c (revision 226848) +++ sys/kern/vfs_default.c (working copy) @@ -96,6 +96,7 @@ .vop_access = vop_stdaccess, .vop_accessx = vop_stdaccessx, + .vop_advise = vop_stdadvise, .vop_advlock = vop_stdadvlock, .vop_advlockasync = vop_stdadvlockasync, .vop_advlockpurge = vop_stdadvlockpurge, @@ -984,6 +985,58 @@ return (error); } +int +vop_stdadvise(struct vop_advise_args *ap) +{ + struct vnode *vp; + off_t start, end; + int error, vfslocked; + + vp = ap->a_vp; + switch (ap->a_advice) { + case POSIX_FADV_WILLNEED: + /* + * Do nothing for now. Filesystems should provide a + * custom method which starts an asynchronous read of + * the requested region. + */ + error = 0; + break; + case POSIX_FADV_DONTNEED: + /* + * Flush any open FS buffers and then remove pages + * from the backing VM object. Using vinvalbuf() here + * is a bit heavy-handed as it flushes all buffers for + * the given vnode, not just the buffers covering the + * requested range.
+ */ + error = 0; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_iflag & VI_DOOMED) { + VOP_UNLOCK(vp, 0); + VFS_UNLOCK_GIANT(vfslocked); + break; + } + vinvalbuf(vp, V_CLEANONLY, 0, 0); + if (vp->v_object != NULL) { + start = trunc_page(ap->a_start); + end = round_page(ap->a_end); + VM_OBJECT_LOCK(vp->v_object); + vm_object_page_cache(vp->v_object, OFF_TO_IDX(start), + OFF_TO_IDX(end)); + VM_OBJECT_UNLOCK(vp->v_object); + } + VOP_UNLOCK(vp, 0); + VFS_UNLOCK_GIANT(vfslocked); + break; + default: + error = EINVAL; + break; + } + return (error); +} + /* * vfs default ops * used to fill the vfs function table to get reasonable default return values. Index: sys/kern/init_sysent.c =================================================================== --- sys/kern/init_sysent.c (revision 226848) +++ sys/kern/init_sysent.c (working copy) @@ -565,5 +565,5 @@ { AS(rctl_add_rule_args), (sy_call_t *)sys_rctl_add_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 528 = rctl_add_rule */ { AS(rctl_remove_rule_args), (sy_call_t *)sys_rctl_remove_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 529 = rctl_remove_rule */ { AS(posix_fallocate_args), (sy_call_t *)sys_posix_fallocate, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 530 = posix_fallocate */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 531 = posix_fadvise */ + { AS(posix_fadvise_args), (sy_call_t *)sys_posix_fadvise, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 531 = posix_fadvise */ }; Index: sys/kern/kern_descrip.c =================================================================== --- sys/kern/kern_descrip.c (revision 226848) +++ sys/kern/kern_descrip.c (working copy) @@ -2575,12 +2575,6 @@ panic("fdrop: count %d", fp->f_count); if (fp->f_ops != &badfileops) error = fo_close(fp, td); - /* - * The f_cdevpriv cannot be assigned non-NULL value while we - * are destroying the file.
- */ - if (fp->f_cdevpriv != NULL) - devfs_fpdrop(fp); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); uma_zfree(file_zone, fp); Index: sys/kern/syscalls.c =================================================================== --- sys/kern/syscalls.c (revision 226848) +++ sys/kern/syscalls.c (working copy) @@ -538,5 +538,5 @@ "rctl_add_rule", /* 528 = rctl_add_rule */ "rctl_remove_rule", /* 529 = rctl_remove_rule */ "posix_fallocate", /* 530 = posix_fallocate */ - "#531", /* 531 = posix_fadvise */ + "posix_fadvise", /* 531 = posix_fadvise */ }; Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src (revision 226848) +++ sys/kern/vnode_if.src (working copy) @@ -628,3 +628,12 @@ INOUT off_t *offset; INOUT off_t *len; }; + +%% advise vp U U U + +vop_advise { + IN struct vnode *vp; + IN off_t start; + IN off_t end; + IN int advice; +}; Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master (revision 226848) +++ sys/kern/syscalls.master (working copy) @@ -947,6 +947,7 @@ size_t outbuflen); } 530 AUE_NULL STD { int posix_fallocate(int fd, \ off_t offset, off_t len); } -531 AUE_NULL UNIMPL posix_fadvise +531 AUE_NULL STD { int posix_fadvise(int fd, off_t offset, \ + off_t len, int advice); } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/systrace_args.c =================================================================== --- sys/kern/systrace_args.c (revision 226848) +++ sys/kern/systrace_args.c (working copy) @@ -3234,6 +3234,16 @@ *n_args = 3; break; } + /* posix_fadvise */ + case 531: { + struct posix_fadvise_args *p = params; + iarg[0] = p->fd; /* int */ + iarg[1] = p->offset; /* off_t */ + iarg[2] = p->len; /* off_t */ + iarg[3] = p->advice; /* int */ + *n_args = 4; + break; + } default: *n_args = 0; break; @@ -8603,6 +8613,25 @@ break; 
}; break; + /* posix_fadvise */ + case 531: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "off_t"; + break; + case 2: + p = "off_t"; + break; + case 3: + p = "int"; + break; + default: + break; + }; + break; default: break; }; Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c (revision 226848) +++ sys/kern/vfs_subr.c (working copy) @@ -1191,7 +1191,7 @@ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); - if (error == 0) + if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { @@ -1220,7 +1220,8 @@ /* * Destroy the copy in the VM cache, too. */ - if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL)) == 0) { + if (bo->bo_object != NULL && + (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) { VM_OBJECT_LOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); @@ -1229,7 +1230,7 @@ #ifdef INVARIANTS BO_LOCK(bo); - if ((flags & (V_ALT | V_NORMAL)) == 0 && + if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); Index: sys/fs/devfs/devfs_vnops.c =================================================================== --- sys/fs/devfs/devfs_vnops.c (revision 226848) +++ sys/fs/devfs/devfs_vnops.c (working copy) @@ -604,6 +604,13 @@ td->td_fpop = fp; error = vnops.fo_close(fp, td); td->td_fpop = fpop; + + /* + * The f_cdevpriv cannot be assigned non-NULL value while we + * are destroying the file. 
+ */ + if (fp->f_cdevpriv != NULL) + devfs_fpdrop(fp); return (error); } Index: sys/vm/vm_object.c =================================================================== --- sys/vm/vm_object.c (revision 226848) +++ sys/vm/vm_object.c (working copy) @@ -1863,6 +1863,60 @@ } /* + * vm_object_page_cache: + * + * For the given object, attempt to move the specified clean + * pages to the cache queue. If a page is wired for any reason, + * then it will not be changed. Pages are specified by the given + * range ["start", "end"). As a special case, if "end" is zero, + * then the range extends from "start" to the end of the object. + * Any mappings to the specified pages are removed before the + * pages are moved to the cache queue. + * + * This operation should only be performed on objects that + * contain managed pages. + * + * The object must be locked. + */ +void +vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end) +{ + struct mtx *mtx, *new_mtx; + vm_page_t p, next; + + VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); + KASSERT((object->type != OBJT_DEVICE && object->type != OBJT_SG && + object->type != OBJT_PHYS), + ("vm_object_page_cache: illegal object %p", object)); + if (object->resident_page_count == 0) + return; + p = vm_page_find_least(object, start); + + /* + * Here, the variable "p" is either (1) the page with the least pindex + * greater than or equal to the parameter "start" or (2) NULL. + */ + mtx = NULL; + for (; p != NULL && (p->pindex < end || end == 0); p = next) { + next = TAILQ_NEXT(p, listq); + + /* + * Avoid releasing and reacquiring the same page lock. + */ + new_mtx = vm_page_lockptr(p); + if (mtx != new_mtx) { + if (mtx != NULL) + mtx_unlock(mtx); + mtx = new_mtx; + mtx_lock(mtx); + } + vm_page_try_to_cache(p); + } + if (mtx != NULL) + mtx_unlock(mtx); +} + +/* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. 
* Index: sys/vm/vm_object.h =================================================================== --- sys/vm/vm_object.h (revision 226848) +++ sys/vm/vm_object.h (working copy) @@ -223,6 +223,8 @@ void vm_object_terminate (vm_object_t); void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); +void vm_object_page_cache(vm_object_t object, vm_pindex_t start, + vm_pindex_t end); void vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags); void vm_object_page_remove(vm_object_t object, vm_pindex_t start, Index: sys/compat/freebsd32/syscalls.master =================================================================== --- sys/compat/freebsd32/syscalls.master (revision 226848) +++ sys/compat/freebsd32/syscalls.master (working copy) @@ -991,4 +991,7 @@ 530 AUE_NULL STD { int freebsd32_posix_fallocate(int fd,\ uint32_t offset1, uint32_t offset2,\ uint32_t len1, uint32_t len2); } -531 AUE_NULL UNIMPL posix_fadvise +531 AUE_NULL STD { int freebsd32_posix_fadvise(int fd, \ + uint32_t offset1, uint32_t offset2,\ + uint32_t len1, uint32_t len2, \ + int advice); } Index: sys/compat/freebsd32/freebsd32_proto.h =================================================================== --- sys/compat/freebsd32/freebsd32_proto.h (revision 226848) +++ sys/compat/freebsd32/freebsd32_proto.h (working copy) @@ -580,6 +580,14 @@ char len1_l_[PADL_(uint32_t)]; uint32_t len1; char len1_r_[PADR_(uint32_t)]; char len2_l_[PADL_(uint32_t)]; uint32_t len2; char len2_r_[PADR_(uint32_t)]; }; +struct freebsd32_posix_fadvise_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char offset1_l_[PADL_(uint32_t)]; uint32_t offset1; char offset1_r_[PADR_(uint32_t)]; + char offset2_l_[PADL_(uint32_t)]; uint32_t offset2; char offset2_r_[PADR_(uint32_t)]; + char len1_l_[PADL_(uint32_t)]; uint32_t len1; char len1_r_[PADR_(uint32_t)]; + char len2_l_[PADL_(uint32_t)]; uint32_t len2; char len2_r_[PADR_(uint32_t)]; + char advice_l_[PADL_(int)]; int 
advice; char advice_r_[PADR_(int)]; +}; #if !defined(PAD64_REQUIRED) && defined(__powerpc__) #define PAD64_REQUIRED #endif @@ -690,6 +698,7 @@ int freebsd32_shmctl(struct thread *, struct freebsd32_shmctl_args *); int freebsd32_pselect(struct thread *, struct freebsd32_pselect_args *); int freebsd32_posix_fallocate(struct thread *, struct freebsd32_posix_fallocate_args *); +int freebsd32_posix_fadvise(struct thread *, struct freebsd32_posix_fadvise_args *); #ifdef COMPAT_43 @@ -1065,6 +1074,7 @@ #define FREEBSD32_SYS_AUE_freebsd32_shmctl AUE_SHMCTL #define FREEBSD32_SYS_AUE_freebsd32_pselect AUE_SELECT #define FREEBSD32_SYS_AUE_freebsd32_posix_fallocate AUE_NULL +#define FREEBSD32_SYS_AUE_freebsd32_posix_fadvise AUE_NULL #undef PAD_ #undef PADL_ Index: sys/compat/freebsd32/freebsd32_systrace_args.c =================================================================== --- sys/compat/freebsd32/freebsd32_systrace_args.c (revision 226848) +++ sys/compat/freebsd32/freebsd32_systrace_args.c (working copy) @@ -3034,6 +3034,18 @@ *n_args = 5; break; } + /* freebsd32_posix_fadvise */ + case 531: { + struct freebsd32_posix_fadvise_args *p = params; + iarg[0] = p->fd; /* int */ + uarg[1] = p->offset1; /* uint32_t */ + uarg[2] = p->offset2; /* uint32_t */ + uarg[3] = p->len1; /* uint32_t */ + uarg[4] = p->len2; /* uint32_t */ + iarg[5] = p->advice; /* int */ + *n_args = 6; + break; + } default: *n_args = 0; break; @@ -8093,6 +8105,31 @@ break; }; break; + /* freebsd32_posix_fadvise */ + case 531: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "uint32_t"; + break; + case 2: + p = "uint32_t"; + break; + case 3: + p = "uint32_t"; + break; + case 4: + p = "uint32_t"; + break; + case 5: + p = "int"; + break; + default: + break; + }; + break; default: break; }; Index: sys/compat/freebsd32/freebsd32_syscall.h =================================================================== --- sys/compat/freebsd32/freebsd32_syscall.h (revision 226848) +++ 
sys/compat/freebsd32/freebsd32_syscall.h (working copy) @@ -424,4 +424,5 @@ #define FREEBSD32_SYS_rctl_add_rule 528 #define FREEBSD32_SYS_rctl_remove_rule 529 #define FREEBSD32_SYS_freebsd32_posix_fallocate 530 +#define FREEBSD32_SYS_freebsd32_posix_fadvise 531 #define FREEBSD32_SYS_MAXSYSCALL 532 Index: sys/compat/freebsd32/freebsd32_sysent.c =================================================================== --- sys/compat/freebsd32/freebsd32_sysent.c (revision 226848) +++ sys/compat/freebsd32/freebsd32_sysent.c (working copy) @@ -591,5 +591,5 @@ { AS(rctl_add_rule_args), (sy_call_t *)sys_rctl_add_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 528 = rctl_add_rule */ { AS(rctl_remove_rule_args), (sy_call_t *)sys_rctl_remove_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 529 = rctl_remove_rule */ { AS(freebsd32_posix_fallocate_args), (sy_call_t *)freebsd32_posix_fallocate, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 530 = freebsd32_posix_fallocate */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 531 = posix_fadvise */ + { AS(freebsd32_posix_fadvise_args), (sy_call_t *)freebsd32_posix_fadvise, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 531 = freebsd32_posix_fadvise */ }; Index: sys/compat/freebsd32/freebsd32_misc.c =================================================================== --- sys/compat/freebsd32/freebsd32_misc.c (revision 226848) +++ sys/compat/freebsd32/freebsd32_misc.c (working copy) @@ -2835,3 +2835,16 @@ ap.len = PAIR32TO64(off_t, uap->len); return (sys_posix_fallocate(td, &ap)); } + +int +freebsd32_posix_fadvise(struct thread *td, + struct freebsd32_posix_fadvise_args *uap) +{ + struct posix_fadvise_args ap; + + ap.fd = uap->fd; + ap.offset = PAIR32TO64(off_t, uap->offset); + ap.len = PAIR32TO64(off_t, uap->len); + ap.advice = uap->advice; + return (sys_posix_fadvise(td, &ap)); +} Index: sys/compat/freebsd32/freebsd32_syscalls.c =================================================================== --- 
sys/compat/freebsd32/freebsd32_syscalls.c (revision 226848) +++ sys/compat/freebsd32/freebsd32_syscalls.c (working copy) @@ -554,5 +554,5 @@ "rctl_add_rule", /* 528 = rctl_add_rule */ "rctl_remove_rule", /* 529 = rctl_remove_rule */ "freebsd32_posix_fallocate", /* 530 = freebsd32_posix_fallocate */ - "#531", /* 531 = posix_fadvise */ + "freebsd32_posix_fadvise", /* 531 = freebsd32_posix_fadvise */ }; Index: sys/sys/fcntl.h =================================================================== --- sys/sys/fcntl.h (revision 226848) +++ sys/sys/fcntl.h (working copy) @@ -277,9 +277,17 @@ #define LOCK_UN 0x08 /* unlock file */ #endif +#if __POSIX_VISIBLE >= 200112 /* - * XXX missing posix_fadvise() and POSIX_FADV_* macros. + * Advice to posix_fadvise */ +#define POSIX_FADV_NORMAL 0 /* no special treatment */ +#define POSIX_FADV_RANDOM 1 /* expect random page references */ +#define POSIX_FADV_SEQUENTIAL 2 /* expect sequential page references */ +#define POSIX_FADV_WILLNEED 3 /* will need these pages */ +#define POSIX_FADV_DONTNEED 4 /* dont need these pages */ +#define POSIX_FADV_NOREUSE 5 /* access data only once */ +#endif #ifndef _KERNEL __BEGIN_DECLS @@ -290,6 +298,7 @@ int openat(int, const char *, int, ...); #endif #if __BSD_VISIBLE || __POSIX_VISIBLE >= 200112 +int posix_fadvise(int, off_t, off_t, int); int posix_fallocate(int, off_t, off_t); #endif #if __BSD_VISIBLE Index: sys/sys/syscall.h =================================================================== --- sys/sys/syscall.h (revision 226848) +++ sys/sys/syscall.h (working copy) @@ -446,4 +446,5 @@ #define SYS_rctl_add_rule 528 #define SYS_rctl_remove_rule 529 #define SYS_posix_fallocate 530 +#define SYS_posix_fadvise 531 #define SYS_MAXSYSCALL 532 Index: sys/sys/sysproto.h =================================================================== --- sys/sys/sysproto.h (revision 226848) +++ sys/sys/sysproto.h (working copy) @@ -1733,6 +1733,12 @@ char offset_l_[PADL_(off_t)]; off_t offset; char 
offset_r_[PADR_(off_t)]; char len_l_[PADL_(off_t)]; off_t len; char len_r_[PADR_(off_t)]; }; +struct posix_fadvise_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)]; + char len_l_[PADL_(off_t)]; off_t len; char len_r_[PADR_(off_t)]; + char advice_l_[PADL_(int)]; int advice; char advice_r_[PADR_(int)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_sys_exit(struct thread *, struct sys_exit_args *); int sys_fork(struct thread *, struct fork_args *); @@ -2109,6 +2115,7 @@ int sys_rctl_add_rule(struct thread *, struct rctl_add_rule_args *); int sys_rctl_remove_rule(struct thread *, struct rctl_remove_rule_args *); int sys_posix_fallocate(struct thread *, struct posix_fallocate_args *); +int sys_posix_fadvise(struct thread *, struct posix_fadvise_args *); #ifdef COMPAT_43 @@ -2799,6 +2806,7 @@ #define SYS_AUE_rctl_add_rule AUE_NULL #define SYS_AUE_rctl_remove_rule AUE_NULL #define SYS_AUE_posix_fallocate AUE_NULL +#define SYS_AUE_posix_fadvise AUE_NULL #undef PAD_ #undef PADL_ Index: sys/sys/syscall.mk =================================================================== --- sys/sys/syscall.mk (revision 226848) +++ sys/sys/syscall.mk (working copy) @@ -394,4 +394,5 @@ rctl_get_limits.o \ rctl_add_rule.o \ rctl_remove_rule.o \ - posix_fallocate.o + posix_fallocate.o \ + posix_fadvise.o Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h (revision 226848) +++ sys/sys/vnode.h (working copy) @@ -384,6 +384,7 @@ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ +#define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 
0x0002 /* vn_start_write: don't sleep for suspend */ @@ -685,6 +686,7 @@ int vop_nopoll(struct vop_poll_args *); int vop_stdaccess(struct vop_access_args *ap); int vop_stdaccessx(struct vop_accessx_args *ap); +int vop_stdadvise(struct vop_advise_args *ap); int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); Index: sys/sys/file.h =================================================================== --- sys/sys/file.h (revision 226848) +++ sys/sys/file.h (working copy) @@ -122,6 +122,12 @@ * none not locked */ +struct fadvise_info { + int fa_advice; /* (f) FADV_* type. */ + off_t fa_start; /* (f) Region start. */ + off_t fa_end; /* (f) Region end. */ +}; + struct file { void *f_data; /* file descriptor specific data */ struct fileops *f_ops; /* File operations */ @@ -136,7 +142,11 @@ */ int f_seqcount; /* Count of sequential accesses. */ off_t f_nextoff; /* next expected read/write offset. */ - struct cdev_privdata *f_cdevpriv; /* (d) Private data for the cdev. */ + union { + struct cdev_privdata *fvn_cdevpriv; + /* (d) Private data for the cdev. */ + struct fadvise_info *fvn_advice; + } f_vnun; /* * DFLAG_SEEKABLE specific fields */ @@ -147,6 +157,9 @@ void *f_label; /* Place-holder for MAC label. */ }; +#define f_cdevpriv f_vnun.fvn_cdevpriv +#define f_advice f_vnun.fvn_advice + #define FOFFSET_LOCKED 0x1 #define FOFFSET_LOCK_WAITING 0x2