diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
index 76237fb..0d0ef86 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
@@ -91,7 +91,6 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
-#include
 
 static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c
index a54598c..1c3953d 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c
@@ -90,7 +90,6 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
-#include
 
 #define MAX_SCHEDULE_TIMEOUT 300
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.c b/sys/dev/cxgb/ulp/tom/cxgb_vm.c
deleted file mode 100644
index e7a3893..0000000
--- a/sys/dev/cxgb/ulp/tom/cxgb_vm.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/**************************************************************************
-
-Copyright (c) 2007-2008, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
-
-#include
-__FBSDID("$FreeBSD$");
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-/*
- * This routine takes a user's map, array of pages, number of pages, and flags
- * and then does the following:
- *  - validate that the user has access to those pages (flags indicates read
- *    or write) - if not fail
- *  - validate that count is enough to hold range number of pages - if not fail
- *  - fault in any non-resident pages
- *  - if the user is doing a read force a write fault for any COWed pages
- *  - if the user is doing a read mark all pages as dirty
- *  - hold all pages
- */
-int
-vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp,
-    int count, vm_prot_t prot)
-{
-	vm_offset_t end, va;
-	int faults, rv;
-	pmap_t pmap;
-	vm_page_t m, *pages;
-
-	pmap = vm_map_pmap(map);
-	pages = mp;
-	addr &= ~PAGE_MASK;
-	/*
-	 * Check that virtual address range is legal
-	 * This check is somewhat bogus as on some architectures kernel
-	 * and user do not share VA - however, it appears that all FreeBSD
-	 * architectures define it
-	 */
-	end = addr + (count * PAGE_SIZE);
-	if (end > VM_MAXUSER_ADDRESS) {
-		log(LOG_WARNING, "bad address passed to vm_fault_hold_user_pages");
-		return (EFAULT);
-	}
-
-	/*
-	 * First optimistically assume that all pages are resident
-	 * (and R/W if for write) if so just mark pages as held (and
-	 * dirty if for write) and return
-	 */
-	vm_page_lock_queues();
-	for (pages = mp, faults = 0, va = addr; va < end;
-	    va += PAGE_SIZE, pages++) {
-		/*
-		 * page queue mutex is recursable so this is OK
-		 * it would be really nice if we had an unlocked
-		 * version of this so we were only acquiring the
-		 * pmap lock 1 time as opposed to potentially
-		 * many dozens of times
-		 */
-		*pages = m = pmap_extract_and_hold(pmap, va, prot);
-		if (m == NULL) {
-			faults++;
-			continue;
-		}
-		/*
-		 * Preemptively mark dirty - the pages
-		 * will never have the modified bit set if
-		 * they are only changed via DMA
-		 */
-		if (prot & VM_PROT_WRITE)
-			vm_page_dirty(m);
-
-	}
-	vm_page_unlock_queues();
-
-	if (faults == 0)
-		return (0);
-
-	/*
-	 * Pages either have insufficient permissions or are not present
-	 * trigger a fault where neccessary
-	 *
-	 */
-	rv = 0;
-	for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) {
-		/*
-		 * Account for a very narrow race where the page may be
-		 * taken away from us before it is held
-		 */
-		while (*pages == NULL) {
-			rv = vm_fault(map, va, prot, (prot & VM_PROT_WRITE) ?
-			    VM_FAULT_DIRTY : VM_FAULT_NORMAL);
-			if (rv)
-				goto error;
-			*pages = pmap_extract_and_hold(pmap, va, prot);
-		}
-	}
-	return (0);
-error:
-	log(LOG_WARNING,
-	    "vm_fault bad return rv=%d va=0x%zx\n", rv, va);
-	vm_page_lock_queues();
-	for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++)
-		if (*pages) {
-			vm_page_unhold(*pages);
-			*pages = NULL;
-		}
-	vm_page_unlock_queues();
-	return (EFAULT);
-}
-
-void
-vm_fault_unhold_pages(vm_page_t *mp, int count)
-{
-
-	KASSERT(count >= 0, ("negative count %d", count));
-	vm_page_lock_queues();
-	while (count--) {
-		vm_page_unhold(*mp);
-		mp++;
-	}
-	vm_page_unlock_queues();
-}
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.h b/sys/dev/cxgb/ulp/tom/cxgb_vm.h
deleted file mode 100644
index 7532e20..0000000
--- a/sys/dev/cxgb/ulp/tom/cxgb_vm.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/**************************************************************************
-
-Copyright (c) 2007-2008, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
- 2. Neither the name of the Chelsio Corporation nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-$FreeBSD$
-
-***************************************************************************/
-#ifndef CXGB_VM_H_
-#define CXGB_VM_H_
-
-int vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr,
-    vm_page_t *mp, int count, vm_prot_t prot);
-void vm_fault_unhold_pages(vm_page_t *mp, int count);
-
-#endif
diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
index ce1afd2..331712f 100644
--- a/sys/kern/kern_subr.c
+++ b/sys/kern/kern_subr.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #ifdef ZERO_COPY_SOCKETS
 #include
 #include
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 2f085d9..4b19eb2 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -62,6 +62,9 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 
+#include
+#include
+
 #include
 
 static fo_rdwr_t vn_read;
@@ -363,14 +366,43 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
 	int *aresid;
 	struct thread *td;
 {
-	struct uio auio;
-	struct iovec aiov;
+	struct uio auio, auio_clone;
+	struct iovec aiov, aiov_clone;
 	struct mount *mp;
 	struct ucred *cred;
-	int error;
+	vm_page_t *m_hold;
+	int wired_pages, error;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	aiov.iov_base = base;
+	aiov.iov_len = len;
+	auio.uio_resid = len;
+	auio.uio_offset = offset;
+	auio.uio_segflg = segflg;
+	auio.uio_rw = rw;
+	auio.uio_td = td;
+	error = 0;
+
+	m_hold = NULL;
+	if (segflg == UIO_USERSPACE) {
+		m_hold = malloc(sizeof(vm_page_t) * (btoc(len) + 1), M_IOV,
+		    M_WAITOK);
+		aiov_clone = aiov;
+		auio_clone = auio;
+		auio_clone.uio_iov = &aiov_clone;
+		error = vm_wireuio(&auio, m_hold,
+		    round_page((vm_offset_t)base + len) -
+		    trunc_page((vm_offset_t)base),
+		    &wired_pages);
+		if (error) {
+			free(m_hold, M_IOV);
+			return (error);
+		}
+	}
+
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		mp = NULL;
 		if (rw == UIO_WRITE) {
@@ -384,16 +416,6 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
 	}
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
-	auio.uio_iov = &aiov;
-	auio.uio_iovcnt = 1;
-	aiov.iov_base = base;
-	aiov.iov_len = len;
-	auio.uio_resid = len;
-	auio.uio_offset = offset;
-	auio.uio_segflg = segflg;
-	auio.uio_rw = rw;
-	auio.uio_td = td;
-	error = 0;
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
 		if (rw == UIO_READ)
@@ -424,6 +446,10 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0);
 	}
+	if (segflg == UIO_USERSPACE) {
+		vm_unwireuio(&auio_clone, m_hold, wired_pages);
+		free(m_hold, M_IOV);
+	}
 	return (error);
 }
 
@@ -485,31 +511,65 @@ vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
 	return (error);
 }
 
-/*
- * File table vnode read routine.
- */
+static int uio_hold_pages = 12;
+
 static int
-vn_read(fp, uio, active_cred, flags, td)
-	struct file *fp;
-	struct uio *uio;
-	struct ucred *active_cred;
-	struct thread *td;
-	int flags;
+do_vn_rw_chunked(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, int ioflag, struct thread *td,
+    int (*vn_chunk_func)(struct file *, struct uio *, struct ucred *,
+    int, int, struct thread *))
+{
+	struct uio *uio_clone;
+	vm_page_t *m_hold;
+	int error, wire_bytes, io_chunk, total_cnt, cnt;
+	int first_chunk, wired_pages;
+
+	if (uio->uio_segflg != UIO_USERSPACE || fp->f_vnode->v_type != VREG)
+		return (vn_chunk_func(fp, uio, active_cred, flags, ioflag, td));
+
+	first_chunk = 1;
+	m_hold = malloc(sizeof(vm_page_t) * uio_hold_pages, M_IOV, M_WAITOK);
+	while (uio->uio_resid > 0) {
+		io_chunk = min(uio_hold_pages * PAGE_SIZE, uio->uio_resid); /* XXXKIB */
+		wire_bytes = round_page(io_chunk);
+		error = vm_wireuio(uio, m_hold, wire_bytes, &wired_pages);
+		if (error != 0) {
+			if (!first_chunk)
+				error = 0;
+			break;
+		}
+		uio_clone = cloneuio(uio);
+		total_cnt = uio->uio_resid;
+		uio->uio_resid = io_chunk;
+		error = vn_chunk_func(fp, uio, active_cred, flags, ioflag, td);
+		vm_unwireuio(uio_clone, m_hold, wired_pages);
+		free(uio_clone, M_IOV);
+		cnt = io_chunk - uio->uio_resid;
+		uio->uio_resid = total_cnt - cnt;
+		if (error != 0) {
+			if (!first_chunk)
+				error = 0;
+			break;
+		}
+		if (cnt == 0)
+			break;
+		first_chunk = 0;
+	}
+	free(m_hold, M_IOV);
+	return (error);
+}
+
+static inline int
+vn_read_wired_chunk(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, int ioflag, struct thread *td)
 {
 	struct vnode *vp;
-	int error, ioflag;
 	struct mtx *mtxp;
-	int vfslocked;
+	int error, vfslocked;
 
-	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
-	    uio->uio_td, td));
 	mtxp = NULL;
 	vp = fp->f_vnode;
-	ioflag = 0;
-	if (fp->f_flag & FNONBLOCK)
-		ioflag |= IO_NDELAY;
-	if (fp->f_flag & O_DIRECT)
-		ioflag |= IO_DIRECT;
+
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
@@ -552,38 +612,38 @@ vn_read(fp, uio, active_cred, flags, td)
 }
 
 /*
- * File table vnode write routine.
+ * File table vnode read routine.
 */
 static int
-vn_write(fp, uio, active_cred, flags, td)
-	struct file *fp;
-	struct uio *uio;
-	struct ucred *active_cred;
-	struct thread *td;
-	int flags;
+vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+    struct thread *td)
 {
-	struct vnode *vp;
-	struct mount *mp;
-	int error, ioflag;
-	int vfslocked;
+	int ioflag;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
-	vp = fp->f_vnode;
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	if (vp->v_type == VREG)
-		bwillwrite();
-	ioflag = IO_UNIT;
-	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
-		ioflag |= IO_APPEND;
+	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
-	if ((fp->f_flag & O_FSYNC) ||
-	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
-		ioflag |= IO_SYNC;
+	return (do_vn_rw_chunked(fp, uio, active_cred, flags, ioflag, td,
+	    vn_read_wired_chunk));
+}
+
+static inline int
+vn_write_wired_chunk(struct file *fp, struct uio *uio,
+    struct ucred *active_cred, int flags, int ioflag, struct thread *td)
+{
+	struct mount *mp;
+	struct vnode *vp;
+	int error, vfslocked;
 
+	mp = NULL;
+	vp = fp->f_vnode;
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	if (vp->v_type == VREG)
+		bwillwrite();
 	if (vp->v_type != VCHR &&
 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto unlock;
@@ -608,6 +668,33 @@ unlock:
 }
 
 /*
+ * File table vnode write routine.
+ */
+static int
+vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+    struct thread *td)
+{
+	struct vnode *vp;
+	int ioflag;
+
+	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
+	    uio->uio_td, td));
+	vp = fp->f_vnode;
+	ioflag = IO_UNIT;
+	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
+		ioflag |= IO_APPEND;
+	if (fp->f_flag & FNONBLOCK)
+		ioflag |= IO_NDELAY;
+	if (fp->f_flag & O_DIRECT)
+		ioflag |= IO_DIRECT;
+	if ((fp->f_flag & O_FSYNC) ||
+	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
+		ioflag |= IO_SYNC;
+	return (do_vn_rw_chunked(fp, uio, active_cred, flags, ioflag, td,
+	    vn_write_wired_chunk));
+}
+
+/*
  * File table truncate routine.
  */
 static int
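For illustration only, and not part of the patch: the uio_resid bookkeeping that do_vn_rw_chunked() performs on each pass, reduced to plain integers. The request size, the page size, and the assumption that every pass moves a full chunk are invented for the example.

/* Minimal sketch of the per-chunk resid accounting used above. */
#include <stdio.h>

#define PAGE_SIZE	4096
#define UIO_HOLD_PAGES	12	/* mirrors the uio_hold_pages default */

int
main(void)
{
	int resid = 100000;	/* bytes remaining in the whole request */
	int io_chunk, total_cnt, cnt;

	while (resid > 0) {
		/* Clamp one pass to at most UIO_HOLD_PAGES pages. */
		io_chunk = UIO_HOLD_PAGES * PAGE_SIZE;
		if (io_chunk > resid)
			io_chunk = resid;

		total_cnt = resid;	/* remember total progress ... */
		resid = io_chunk;	/* ... while the callee sees only the chunk */

		resid = 0;		/* pretend the transfer moved it all */
		cnt = io_chunk - resid;	/* bytes moved this pass */

		resid = total_cnt - cnt;	/* fold the chunk back into the total */
		printf("moved %6d bytes, %6d left\n", cnt, resid);
		if (cnt == 0)
			break;
	}
	return (0);
}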
diff --git a/sys/modules/cxgb/tom/Makefile b/sys/modules/cxgb/tom/Makefile
index 2b8750a..d2566fd 100644
--- a/sys/modules/cxgb/tom/Makefile
+++ b/sys/modules/cxgb/tom/Makefile
@@ -5,7 +5,7 @@ CXGB = ${.CURDIR}/../../../dev/cxgb
 KMOD=	tom
 SRCS=	cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c
-SRCS+=	cxgb_ddp.c cxgb_vm.c cxgb_l2t.c cxgb_tcp_offload.c
+SRCS+=	cxgb_ddp.c cxgb_l2t.c cxgb_tcp_offload.c
 SRCS+=	opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h opt_mac.h
 SRCS+=	opt_tcpdebug.h opt_ddb.h opt_sched.h opt_global.h opt_ktr.h
 SRCS+=	device_if.h bus_if.h pci_if.h
diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h
index 475a20e..f2b4d16 100644
--- a/sys/vm/vm_extern.h
+++ b/sys/vm/vm_extern.h
@@ -34,11 +34,13 @@
 #define _VM_EXTERN_H_
 
 struct buf;
+struct iovec;
 struct proc;
 struct vmspace;
 struct vmtotal;
 struct mount;
 struct vnode;
+struct uio;
 
 #ifdef _KERNEL
 
@@ -56,6 +58,9 @@ void swapout_procs(int);
 int useracc(void *, int, int);
 int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
 void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t);
+int vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr,
+    vm_page_t *mp, int count, vm_prot_t prot);
+void vm_fault_unhold_pages(vm_page_t *mp, int count);
 void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
 int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
 int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
@@ -84,5 +89,9 @@ int vm_thread_new(struct thread *td, int pages);
 int vm_thread_new_altkstack(struct thread *td, int pages);
 void vm_thread_swapin(struct thread *td);
 void vm_thread_swapout(struct thread *td);
+int vm_wireuio(struct uio *uiop, struct vm_page *m_hold[], int wire_bytes,
+    int *wired_pages);
+void vm_unwireuio(struct uio *, struct vm_page *m_hold[], int wired_pages);
+
 #endif /* _KERNEL */
 #endif /* !_VM_EXTERN_H_ */
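A sketch, for illustration only, of the calling pattern the new vm_wireuio()/vm_unwireuio() prototypes are designed for, modeled on the vn_rdwr() change earlier in this patch. The function name wireuio_example() is hypothetical, and td, base and len are assumed to describe a user-space buffer owned by the calling thread; the usual sys/ and vm/ headers are assumed to be included.

/* Sketch only: wiring a single user buffer around a copy, as vn_rdwr() now does. */
static int
wireuio_example(struct thread *td, void *base, size_t len)
{
	struct uio auio, auio_clone;
	struct iovec aiov, aiov_clone;
	vm_page_t *m_hold;
	int error, wired_pages;

	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;

	/* One page pointer per page the buffer can touch, plus one for misalignment. */
	m_hold = malloc(sizeof(vm_page_t) * (btoc(len) + 1), M_IOV, M_WAITOK);

	/* Keep an unadvanced copy; uiomove() consumes the original uio/iovec. */
	aiov_clone = aiov;
	auio_clone = auio;
	auio_clone.uio_iov = &aiov_clone;

	error = vm_wireuio(&auio, m_hold,
	    round_page((vm_offset_t)base + len) - trunc_page((vm_offset_t)base),
	    &wired_pages);
	if (error == 0) {
		/* ... uiomove() against auio cannot fault on these pages ... */
		vm_unwireuio(&auio_clone, m_hold, wired_pages);
	}
	free(m_hold, M_IOV);
	return (error);
}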
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 48ed991..1b0dbcf 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -1,4 +1,31 @@
+
 /*-
+
+Copyright (c) 2007-2008, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 1994 John S. Dyson
@@ -1362,3 +1389,115 @@ vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
 	/* return number of pages */
 	return i;
 }
+
+/*
+ * This routine takes a user's map, array of pages, number of pages, and flags
+ * and then does the following:
+ *  - validate that the user has access to those pages (flags indicates read
+ *    or write) - if not fail
+ *  - validate that count is enough to hold range number of pages - if not fail
+ *  - fault in any non-resident pages
+ *  - if the user is doing a read force a write fault for any COWed pages
+ *  - if the user is doing a read mark all pages as dirty
+ *  - hold all pages
+ */
+int
+vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp,
+    int count, vm_prot_t prot)
+{
+	vm_offset_t end, va;
+	int faults, rv;
+	pmap_t pmap;
+	vm_page_t m, *pages;
+
+	pmap = vm_map_pmap(map);
+	pages = mp;
+	addr &= ~PAGE_MASK;
+
+	/*
+	 * Check that virtual address range is legal.
+	 * This check is somewhat bogus as on some architectures kernel
+	 * and user do not share VA - however, it appears that all FreeBSD
+	 * architectures define it
+	 */
+	end = addr + (count * PAGE_SIZE);
+	if (end > VM_MAXUSER_ADDRESS)
+		return (EFAULT);
+
+	/*
+	 * First optimistically assume that all pages are resident
+	 * (and R/W if for write) if so just mark pages as held (and
+	 * dirty if for write) and return.
+	 */
+	vm_page_lock_queues();
+	for (pages = mp, faults = 0, va = addr; va < end;
+	    va += PAGE_SIZE, pages++) {
+		/*
+		 * Page queue mutex is recursable so this is OK.
+		 * It would be really nice if we had an unlocked
+		 * version of this so we were only acquiring the
+		 * pmap lock 1 time as opposed to potentially
+		 * many dozens of times.
+		 */
+		*pages = m = pmap_extract_and_hold(pmap, va, prot);
+		if (m == NULL) {
+			faults++;
+			continue;
+		}
+
+		/*
+		 * Preemptively mark dirty - the pages will never have
+		 * the modified bit set if they are only changed via
+		 * DMA.
+		 */
+		if (prot & VM_PROT_WRITE)
+			vm_page_dirty(m);
+	}
+	vm_page_unlock_queues();
+
+	if (faults == 0)
+		return (0);
+
+	/*
+	 * Pages either have insufficient permissions or are not present;
+	 * trigger a fault where necessary.
+	 */
+	rv = 0;
+	for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) {
+		/*
+		 * Account for a very narrow race where the page may be
+		 * taken away from us before it is held.
+		 */
+		while (*pages == NULL) {
+			rv = vm_fault(map, va, prot, (prot & VM_PROT_WRITE) ?
+			    VM_FAULT_DIRTY : VM_FAULT_NORMAL);
+			if (rv)
+				goto error;
+			*pages = pmap_extract_and_hold(pmap, va, prot);
+		}
+	}
+	return (0);
+
+error:
+	vm_page_lock_queues();
+	for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++)
+		if (*pages) {
+			vm_page_unhold(*pages);
+			*pages = NULL;
+		}
+	vm_page_unlock_queues();
+	return (EFAULT);
+}
+
+void
+vm_fault_unhold_pages(vm_page_t *mp, int count)
+{
+
+	KASSERT(count >= 0, ("negative count %d", count));
+	vm_page_lock_queues();
+	while (count--) {
+		vm_page_unhold(*mp);
+		mp++;
+	}
+	vm_page_unlock_queues();
+}
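A sketch, for illustration only, of how a driver might hold a user buffer for DMA with the relocated helpers, roughly the pattern the cxgb TOE/DDP code used with its private copy. The function name, uaddr and npages are placeholders for a user address and the number of pages backing the buffer; M_DEVBUF is used as a generic malloc type here.

/* Sketch only: hold a user buffer, run DMA against it, then release it. */
static int
hold_user_buffer_example(vm_offset_t uaddr, int npages)
{
	vm_map_t map = &curproc->p_vmspace->vm_map;
	vm_page_t *pages;
	int error;

	pages = malloc(npages * sizeof(vm_page_t), M_DEVBUF, M_WAITOK);
	error = vm_fault_hold_user_pages(map, uaddr, pages, npages,
	    VM_PROT_READ | VM_PROT_WRITE);
	if (error == 0) {
		/* ... program the device to DMA into the held pages ... */
		vm_fault_unhold_pages(pages, npages);
	}
	free(pages, M_DEVBUF);
	return (error);
}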
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 2c5821c..ca73db3 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -1374,6 +1374,7 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 	new_entry->protection = prot;
 	new_entry->max_protection = max;
 	new_entry->wired_count = 0;
+	new_entry->pin_count = 0;
 
 	/*
 	 * Insert the new entry into the list
@@ -2796,7 +2797,8 @@ reclip_start:
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
 		    (vm_map_pmap(map) != kernel_pmap &&
-		    vm_map_entry_system_wired_count(entry) != 0)) {
+		    vm_map_entry_system_wired_count(entry) != 0) ||
+		    (entry->pin_count != 0)) {
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			last_timestamp = map->timestamp;
@@ -2816,6 +2818,7 @@ reclip_start:
 			holder_entry->max_protection = VM_PROT_NONE;
 			holder_entry->inheritance = VM_INHERIT_NONE;
 			holder_entry->wired_count = 0;
+			holder_entry->pin_count = 0;
 			vm_map_entry_link(map, entry->prev, holder_entry);
 		}
 		(void) vm_map_unlock_and_wait(map, 0);
@@ -3865,6 +3868,166 @@ vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
 	vm_map_unlock_read(map);
 }
 
+static boolean_t
+vm_map_unpin_entries(vm_map_t map, struct uio *uiop, int pinned_entries)
+{
+	vm_offset_t start;
+	struct iovec *iov;
+	vm_map_entry_t entry;
+	int i, acc, wire;
+	boolean_t need_wakeup;
+
+	iov = uiop->uio_iov;
+	need_wakeup = FALSE;
+
+	for (i = 0, acc = 0; acc < pinned_entries; iov++, i++) {
+		KASSERT(i < uiop->uio_iovcnt, ("wireio: iovcnt overflow %d %d %d",
+		    i, uiop->uio_iovcnt, pinned_entries));
+		wire = round_page(iov->iov_len);
+		if (acc + wire > pinned_entries)
+			wire = pinned_entries - acc;
+		start = trunc_page((vm_offset_t)iov->iov_base);
+		for (;;) {
+			if (!vm_map_lookup_entry(map, start, &entry)) {
+#ifdef INVARIANTS
+				panic("vm_unwireuio: hole");
+#endif
+			}
+			KASSERT(entry->pin_count > 0, ("pin_count %p", entry));
+			if (--entry->pin_count == 0 &&
+			    (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP)) {
+				entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
+				need_wakeup = TRUE;
+			}
+			if (entry->end >= start + wire) {
+				acc += wire;
+				break;
+			} else {
+				acc += entry->end - start;
+				wire -= entry->end - start;
+				start = entry->end;
+			}
+		}
+	}
+	return (need_wakeup);
+}
+
+/*
+ * vm_wireuio
+ *
+ * Given userspace struct uio, we set up vm state such that after the
+ * successful return there will be no page faults during uiomove with
+ * this uio until vm_unwireuio is called.  At most wire_bytes bytes of
+ * the user address space are held.
+ *
+ * Function performs this by first pinning all map entries that will
+ * be referenced.  This guarantees that our ranges of user address
+ * space cannot be remapped during the operation.  Then, all accessed
+ * pages are faulted in and held.
+ */
+int
+vm_wireuio(struct uio *uiop, struct vm_page *m_hold[], int wire_bytes,
+    int *wired_pages)
+{
+	vm_map_t map;
+	vm_offset_t start, start1;
+	struct iovec *iov;
+	vm_map_entry_t entry;
+	struct vm_page **m_hold1;
+	int i, acc, wire, wire_pages, pinned_entries, rv, prot;
+	int error;
+	boolean_t need_wakeup;
+
+	KASSERT(round_page(wire_bytes) == wire_bytes,
+	    ("wireuio: wire_bytes is not page-size aligned"));
+	KASSERT(uiop->uio_segflg == UIO_USERSPACE,
+	    ("wireuio: !UIO_USERSPACE"));
+
+	error = 0;
+	prot = uiop->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
+	m_hold1 = m_hold;
+	pinned_entries = 0;
+	*wired_pages = 0;
+	map = &uiop->uio_td->td_proc->p_vmspace->vm_map;
+	iov = uiop->uio_iov;
+
+	/*
+	 * Do the pass over iov.
+	 */
+	for (i = 0, acc = 0; acc < wire_bytes; iov++, i++) {
+		wire = round_page(iov->iov_len);
+		if (acc + wire > wire_bytes)
+			wire = wire_bytes - acc;
+		acc += wire;
+		wire_pages = btoc(wire);
+		start1 = start = trunc_page((vm_offset_t)iov->iov_base);
+		if (start < vm_map_min(map) || start + wire > vm_map_max(map) ||
+		    start > start + wire) {
+			error = EINVAL;
+			goto fault;
+		}
+		vm_map_lock(map);
+
+		/*
+		 * Pin each entry referenced by addresses in iov.
+		 */
+		for (;;) {
+			if (!vm_map_lookup_entry(map, start1, &entry) ||
+			    (entry->eflags & MAP_ENTRY_IS_HOLDER))
+				goto fault;
+			entry->pin_count++;
+			if (entry->end >= start1 + wire) {
+				pinned_entries += wire;
+				break;
+			} else {
+				pinned_entries += entry->end - start1;
+				wire -= entry->end - start1;
+				start1 = entry->end;
+			}
+		}
+		vm_map_unlock(map);
+
+		/*
+		 * If entries are successfully pinned, the
+		 * corresponding pages are faulted in and held.
+		 */
+		rv = vm_fault_hold_user_pages(map, start, m_hold1, wire_pages,
+		    prot);
+		if (rv != KERN_SUCCESS) {
+			error = EFAULT;
+			goto fault;
+		}
+		*wired_pages += wire_pages;
+		m_hold1 += wire_pages;
+	}
+	return (0);
+fault:
+	vm_fault_unhold_pages(m_hold, *wired_pages);
+	vm_map_lock(map);
+	need_wakeup = vm_map_unpin_entries(map, uiop, pinned_entries);
+	vm_map_unlock(map);
+	if (need_wakeup)
+		vm_map_wakeup(map);
+	return (error);
+}
+
+void
+vm_unwireuio(struct uio *uiop, struct vm_page *m_hold[], int wired_pages)
+{
+	vm_map_t map;
+	boolean_t need_wakeup;
+
+	map = &uiop->uio_td->td_proc->p_vmspace->vm_map;
+
+	vm_fault_unhold_pages(m_hold, wired_pages);
+
+	vm_map_lock(map);
+	need_wakeup = vm_map_unpin_entries(map, uiop, ctob(wired_pages));
+	vm_map_unlock(map);
+	if (need_wakeup)
+		vm_map_wakeup(map);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index f2c4fd3..9310718 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -114,6 +114,7 @@ struct vm_map_entry {
 	vm_inherit_t inheritance;	/* inheritance */
 	int wired_count;		/* can be paged if = 0 */
 	vm_pindex_t lastr;		/* last read */
+	unsigned pin_count;		/* non-exclusive pin count */
 };
 
 #define MAP_ENTRY_NOSYNC	0x0001
@@ -383,5 +384,6 @@ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
 int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags);
 int vmspace_swap_count (struct vmspace *vmspace);
+
 #endif /* _KERNEL */
 #endif /* _VM_MAP_ */
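A sketch, for illustration only, of why the vn_rdwr() change allocates btoc(len) + 1 page pointers and derives the wire_bytes argument from page-rounded bounds rather than from len: an unaligned buffer can straddle one more page than its length alone suggests. The address and length below are arbitrary example values, assuming 4 KB pages.

	vm_offset_t base = 0x10000800;		/* not page aligned */
	size_t len = 2 * PAGE_SIZE;		/* 8192 bytes */
	size_t wire_bytes;

	wire_bytes = round_page(base + len) - trunc_page(base);
	/* wire_bytes == 3 * PAGE_SIZE: the buffer touches btoc(len) + 1 pages. */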
diff --git a/tools/regression/file/uio/uio.c b/tools/regression/file/uio/uio.c
new file mode 100644
index 0000000..d857605
--- /dev/null
+++ b/tools/regression/file/uio/uio.c
@@ -0,0 +1,116 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+int chunk_cnt = 1024;
+int chunk_size = 1024;
+
+int
+main(int argc, char *argv[])
+{
+	struct iovec *wiov, *riov;
+	char **wdata, **rdata;
+	int fd, i;
+	ssize_t io_error;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: uio file [chunk count [chunk size]]\n");
+		return (2);
+	}
+	fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+	if (fd == -1) {
+		fprintf(stderr, "Failed to create %s: %s\n",
+		    argv[1], strerror(errno));
+		return (1);
+	}
+
+	if (argc > 2)
+		chunk_cnt = atoi(argv[2]);
+	if (argc > 3)
+		chunk_size = atoi(argv[3]);
+
+	wiov = calloc(chunk_cnt, sizeof(*wiov));
+	wdata = calloc(chunk_cnt, sizeof(*wdata));
+
+	riov = calloc(chunk_cnt, sizeof(*riov));
+	rdata = calloc(chunk_cnt, sizeof(*rdata));
+
+	for (i = 0; i < chunk_cnt; i++) {
+		rdata[i] = malloc(chunk_size);
+		riov[i].iov_base = rdata[i];
+		riov[i].iov_len = chunk_size;
+
+		wdata[i] = malloc(chunk_size);
+		memset(wdata[i], i, chunk_size);
+		wiov[i].iov_base = wdata[i];
+		wiov[i].iov_len = chunk_size;
+	}
+
+	io_error = writev(fd, wiov, chunk_cnt);
+	if (io_error == -1) {
+		fprintf(stderr, "write failed: %s\n", strerror(errno));
+		return (1);
+	} else if (io_error != chunk_cnt * chunk_size) {
+		fprintf(stderr, "truncated write: %zd %d\n",
+		    io_error, chunk_cnt * chunk_size);
+		return (1);
+	}
+
+	if (lseek(fd, 0, SEEK_SET) == -1) {
+		fprintf(stderr, "lseek failed: %s\n", strerror(errno));
+		return (1);
+	}
+
+	io_error = readv(fd, riov, chunk_cnt);
+	if (io_error == -1) {
+		fprintf(stderr, "read failed: %s\n", strerror(errno));
+		return (1);
+	} else if (io_error != chunk_cnt * chunk_size) {
+		fprintf(stderr, "truncated read: %zd %d\n",
+		    io_error, chunk_cnt * chunk_size);
+		return (1);
+	}
+
+	for (i = 0; i < chunk_cnt; i++) {
+		if (memcmp(rdata[i], wdata[i], chunk_size) != 0) {
+			fprintf(stderr, "chunk %d differs\n", i);
+			return (1);
+		}
+	}
+
+	return (0);
+}
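The regression test writes chunk_cnt iovec segments of chunk_size bytes each with writev(2), reads the file back with readv(2) and compares the two buffer sets, which exercises the wired, chunked path added to vn_read()/vn_write(). A typical invocation (the file name and sizes below are arbitrary):

	cc -o uio uio.c
	./uio /tmp/uio.scratch 2048 1500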