Index: kern/uipc_mbuf.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.167
diff -u -p -r1.167 uipc_mbuf.c
--- kern/uipc_mbuf.c	21 Sep 2006 09:55:43 -0000	1.167
+++ kern/uipc_mbuf.c	6 Oct 2006 16:30:11 -0000
@@ -93,61 +93,61 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, m_defrag
  * chain.
  */
 struct mbuf *
-m_getm(struct mbuf *m, int len, int how, short type)
+m_getm2(struct mbuf *m, int len, int how, short type, int flags)
 {
-	struct mbuf *mb, *top, *cur, *mtail;
-	int num, rem;
-	int i;
+	struct mbuf *mb, *nm = NULL, *mtail = NULL;
 
-	KASSERT(len >= 0, ("m_getm(): len is < 0"));
+	KASSERT(len >= 0, ("%s: len is < 0", __func__));
 
-	/* If m != NULL, we will append to the end of that chain. */
-	if (m != NULL)
-		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
-	else
-		mtail = NULL;
+	/* Validate flags. */
+	flags &= (M_PKTHDR | M_EOR);
 
-	/*
-	 * Calculate how many mbufs+clusters ("packets") we need and how much
-	 * leftover there is after that and allocate the first mbuf+cluster
-	 * if required.
-	 */
-	num = len / MCLBYTES;
-	rem = len % MCLBYTES;
-	top = cur = NULL;
-	if (num > 0) {
-		if ((top = cur = m_getcl(how, type, 0)) == NULL)
-			goto failed;
-		top->m_len = 0;
-	}
-	num--;
-
-	for (i = 0; i < num; i++) {
-		mb = m_getcl(how, type, 0);
-		if (mb == NULL)
-			goto failed;
-		mb->m_len = 0;
-		cur = (cur->m_next = mb);
-	}
-	if (rem > 0) {
-		mb = (rem >= MINCLSIZE) ?
-		    m_getcl(how, type, 0) : m_get(how, type);
-		if (mb == NULL)
-			goto failed;
-		mb->m_len = 0;
-		if (cur == NULL)
-			top = mb;
+	/* Packet header mbuf must be first in chain. */
+	if ((flags & M_PKTHDR) && m != NULL)
+		flags &= ~M_PKTHDR;
+
+	/* Loop and append maximum sized mbufs to the chain tail. */
+	while (len > 0) {
+		if (len > MCLBYTES)
+			mb = m_getjcl(how, type, (flags & M_PKTHDR),
+			    MJUMPAGESIZE);
+		else if (len >= MINCLSIZE)
+			mb = m_getcl(how, type, (flags & M_PKTHDR));
+		else if (flags & M_PKTHDR)
+			mb = m_gethdr(how, type);
 		else
-			cur->m_next = mb;
-	}
+			mb = m_get(how, type);
 
-	if (mtail != NULL)
-		mtail->m_next = top;
-	return top;
-failed:
-	if (top != NULL)
-		m_freem(top);
-	return NULL;
+		/* Fail the whole operation if one mbuf can't be allocated. */
+		if (mb == NULL) {
+			if (nm != NULL)
+				m_freem(nm);
+			return (NULL);
+		}
+
+		/* Bookkeeping. */
+		len -= (mb->m_flags & M_EXT) ? mb->m_ext.ext_size :
+		    ((mb->m_flags & M_PKTHDR) ? MHLEN : MLEN);
+		if (mtail != NULL)
+			mtail->m_next = mb;
+		else
+			nm = mb;
+		mtail = mb;
+		flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
+	}
+	if (flags & M_EOR)
+		mtail->m_flags |= M_EOR;	/* Only valid on the last mbuf. */
+
+	/* If mbuf was supplied, append new chain to the end of it. */
+	if (m != NULL) {
+		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
+			;
+		mtail->m_next = nm;
+		mtail->m_flags &= ~M_EOR;
+	} else
+		m = nm;
+
+	return (m);
 }
 
 /*
@@ -1609,55 +1609,58 @@ nospace:
 #endif
 
+/*
+ * Copy the contents of uio into a properly sized mbuf chain.
+ */
 struct mbuf *
-m_uiotombuf(struct uio *uio, int how, int len, int align)
+m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
 {
-	struct mbuf *m_new = NULL, *m_final = NULL;
-	int progress = 0, error = 0, length, total;
+	struct mbuf *m, *mb;
+	int error, length, total;
+	int progress = 0;
 
+	/*
+	 * len can be zero or an arbitrarily large value bounded by
+	 * the total data supplied by the uio.
+	 */
 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
+
+	/*
+	 * The smallest unit returned by m_getm2() is a single mbuf
+	 * with pkthdr.  We can't align past it.  Round align itself
+	 * down to the next lower multiple of the word size.
+	 */
+	if (align)
+		align &= ~(sizeof(long) - 1);
 	if (align >= MHLEN)
-		goto nospace;
-	if (total + align > MHLEN)
-		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
-	else
-		m_final = m_gethdr(how, MT_DATA);
-	if (m_final == NULL)
-		goto nospace;
-	m_final->m_data += align;
-	m_new = m_final;
-	while (progress < total) {
-		length = total - progress;
-		if (length > MCLBYTES)
-			length = MCLBYTES;
-		if (m_new == NULL) {
-			if (length > MLEN)
-				m_new = m_getcl(how, MT_DATA, 0);
-			else
-				m_new = m_get(how, MT_DATA);
-			if (m_new == NULL)
-				goto nospace;
+		return (NULL);
+
+	/* Give us all or nothing. */
+	m = m_getm2(NULL, total + align, how, MT_DATA, flags);
+	if (m == NULL)
+		return (NULL);
+	m->m_data += align;
+
+	/* Fill all mbufs with uio data and update header information. */
+	for (mb = m; mb != NULL; mb = mb->m_next) {
+		length = min(M_TRAILINGSPACE(mb), total - progress);
+
+		error = uiomove(mtod(mb, void *), length, uio);
+		if (error) {
+			m_freem(m);
+			return (NULL);
 		}
-		error = uiomove(mtod(m_new, void *), length, uio);
-		if (error)
-			goto nospace;
+
+		mb->m_len = length;
 		progress += length;
-		m_new->m_len = length;
-		if (m_new != m_final)
-			m_cat(m_final, m_new);
-		m_new = NULL;
+		if (flags & M_PKTHDR)
+			m->m_pkthdr.len += length;
 	}
-	m_fixhdr(m_final);
-	return (m_final);
-nospace:
-	if (m_new)
-		m_free(m_new);
-	if (m_final)
-		m_freem(m_final);
-	return (NULL);
+	KASSERT(progress == total, ("%s: progress != total", __func__));
+
+	return (m);
 }
 
 /*
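
As an illustration of the new allocator interface, here is a minimal sketch of a
character-device write path using the five-argument m_uiotombuf(), in the style of
the tapwrite()/tunwrite() changes further down.  The device name, function name and
final consumer are made up; only m_uiotombuf() and m_freem() are real.  With
M_PKTHDR the returned chain starts with a packet header mbuf, so the caller no
longer has to construct one by hand:

	/* Sketch only: exdev_write() and its consumer are hypothetical. */
	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/conf.h>
	#include <sys/mbuf.h>
	#include <sys/uio.h>

	static int
	exdev_write(struct cdev *dev, struct uio *uio, int flag)
	{
		struct mbuf *m;

		/* Copy the whole uio into a chain headed by a pkthdr mbuf. */
		m = m_uiotombuf(uio, M_DONTWAIT, 0, 0, M_PKTHDR);
		if (m == NULL)
			return (ENOBUFS);
		/* ... hand the chain to the real consumer here ... */
		m_freem(m);	/* placeholder for that consumer */
		return (0);
	}
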
Index: kern/uipc_sockbuf.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/uipc_sockbuf.c,v
retrieving revision 1.165
diff -u -p -r1.165 uipc_sockbuf.c
--- kern/uipc_sockbuf.c	6 Sep 2006 21:59:36 -0000	1.165
+++ kern/uipc_sockbuf.c	6 Oct 2006 16:30:11 -0000
@@ -527,6 +568,9 @@ void
 sbappendstream(struct sockbuf *sb, struct mbuf *m)
 {
+	/* Get rid of packet headers. */
+	m_demote(m, 0);
+
 	SOCKBUF_LOCK(sb);
 	sbappendstream_locked(sb, m);
 	SOCKBUF_UNLOCK(sb);
@@ -790,6 +834,84 @@ sbcompress(struct sockbuf *sb, struct mb
 	SBLASTMBUFCHK(sb);
 }
 
+#if 1
+/*
+ * Pull mbufs from the socket buffer.  If len cuts into an mbuf, make a
+ * copy of the partial one.  This function only works on stream sockets.
+ */
+struct mbuf *
+sbpull_locked(struct sockbuf *sb, int len, int how)
+{
+	struct mbuf *m, *n, *top, *tail;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	if (len < sb->sb_mb->m_len) {
+		top = tail = NULL;
+		m = sb->sb_mb;
+	} else {
+		for (top = tail = m = sb->sb_mb;
+		    m != NULL && len >= m->m_len;
+		    m = m->m_next) {
+			tail = m;
+			len -= m->m_len;
+			sbfree(sb, m);		/* Updates sockbuf counters. */
+		}
+		if (m == NULL) {
+			sb->sb_mb = NULL;
+			sb->sb_mbtail = NULL;
+			sb->sb_lastrecord = NULL;
+		} else {
+			/* Cut fell on an mbuf boundary; move the head. */
+			sb->sb_mb = m;
+			sb->sb_lastrecord = m;
+		}
+		tail->m_next = NULL;
+	}
+	if (len > 0 && m != NULL) {
+		KASSERT(len < m->m_len, ("%s: inconsistent", __func__));
+		sb->sb_mb = m;
+		sb->sb_lastrecord = m;
+		n = m_copym(m, 0, len, how);
+		if (n == NULL)
+			goto out;
+		if (tail != NULL)
+			tail->m_next = n;
+		else
+			top = n;
+		m->m_len -= len;
+		m->m_data += len;
+		sb->sb_cc -= len;
+	}
+out:
+	SBLASTRECORDCHK(sb);
+	SBLASTMBUFCHK(sb);
+
+	return (top);
+}
+
+/*
+ * Prepend mbuf chain to socket buffer.  Works only on stream sockets.
+ */
+void
+sbprepend_locked(struct sockbuf *sb, struct mbuf *m)
+{
+	struct mbuf *n;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	for (n = m; ; ) {
+		sballoc(sb, n);		/* Updates sockbuf counters. */
+		if (n->m_next != NULL)
+			n = n->m_next;
+		else
+			break;
+	}
+	n->m_next = sb->sb_mb;
+	sb->sb_mb = m;
+	sb->sb_lastrecord = m;
+
+	SBLASTRECORDCHK(sb);
+	SBLASTMBUFCHK(sb);
+}
+#endif
+
 /*
  * Free all mbufs in a sockbuf.  Check that all resources are reclaimed.
  */
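
The two helpers are intended to be used as a pair: pull a run of mbufs out under
the sockbuf lock, process them unlocked, and hand them back in front if processing
fails.  A sketch of that pattern follows; ex_drain() and consume() are illustrative
names, with consume() standing in for uiomove() or any consumer that may sleep:

	/* Sketch only: the pull/put-back pattern the helpers enable. */
	static int
	ex_drain(struct sockbuf *sb, int len)
	{
		struct mbuf *m;
		int error;

		SOCKBUF_LOCK(sb);
		m = sbpull_locked(sb, len, M_DONTWAIT);	/* detach len bytes */
		SOCKBUF_UNLOCK(sb);
		if (m == NULL)
			return (ENOBUFS);

		error = consume(m);		/* may sleep; lock not held */
		if (error) {
			SOCKBUF_LOCK(sb);
			sbprepend_locked(sb, m);	/* hand the data back */
			SOCKBUF_UNLOCK(sb);
			return (error);
		}
		m_freem(m);
		return (0);
	}
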
Index: kern/uipc_socket.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.283
diff -u -p -r1.283 uipc_socket.c
--- kern/uipc_socket.c	22 Sep 2006 15:34:16 -0000	1.283
+++ kern/uipc_socket.c	6 Oct 2006 16:30:11 -0000
@@ -811,9 +811,11 @@ struct so_zerocopy_stats so_zerocp_stats
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
-#endif /*ZERO_COPY_SOCKETS*/
 
 /*
+ * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
+ * sosend_dgram() and sosend_generic() use m_uiotombuf().
+ *
  * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
  * all of the data referenced by the uio.  If desired, it uses zero-copy.
  * *space will be updated to reflect data copied in.
@@ -863,9 +865,9 @@ sosend_copyin(struct uio *uio, struct mb
 		}
 	}
 	if (so_zero_copy_send &&
-	    resid>=PAGE_SIZE &&
-	    *space>=PAGE_SIZE &&
-	    uio->uio_iov->iov_len>=PAGE_SIZE) {
+	    resid >= PAGE_SIZE &&
+	    *space >= PAGE_SIZE &&
+	    uio->uio_iov->iov_len >= PAGE_SIZE) {
 		so_zerocp_stats.size_ok++;
 		so_zerocp_stats.align_ok++;
 		cow_send = socow_setup(m, uio);
@@ -937,6 +939,7 @@ out:
 	*retmp = top;
 	return (error);
 }
+#endif /*ZERO_COPY_SOCKETS*/
 
 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
 
@@ -952,7 +955,9 @@ sosend_dgram(so, addr, uio, top, control
 {
 	long space, resid;
 	int clen = 0, error, dontroute;
+#ifdef ZERO_COPY_SOCKETS
 	int atomic = sosendallatonce(so) || top;
+#endif
 
 	KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
@@ -1038,9 +1043,19 @@ sosend_dgram(so, addr, uio, top, control
 		if (flags & MSG_EOR)
 			top->m_flags |= M_EOR;
 	} else {
+#ifdef ZERO_COPY_SOCKETS
 		error = sosend_copyin(uio, &top, atomic, &space, flags);
 		if (error)
 			goto out;
+#else
+		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
+		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
+		if (top == NULL) {
+			error = EFAULT;	/* only possible error */
+			goto out;
+		}
+		space -= resid - uio->uio_resid;
+#endif
 		resid = uio->uio_resid;
 	}
 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
@@ -1200,12 +1215,25 @@ restart:
 			if (flags & MSG_EOR)
 				top->m_flags |= M_EOR;
 		} else {
+#ifdef ZERO_COPY_SOCKETS
 			error = sosend_copyin(uio, &top, atomic,
 			    &space, flags);
 			if (error != 0) {
 				SOCKBUF_LOCK(&so->so_snd);
 				goto release;
 			}
+#else
+			top = m_uiotombuf(uio, M_WAITOK, space,
+			    (atomic ? max_hdr : 0),
+			    (atomic ? M_PKTHDR : 0) |
+			    ((flags & MSG_EOR) ? M_EOR : 0));
+			if (top == NULL) {
+				SOCKBUF_LOCK(&so->so_snd);
+				error = EFAULT;	/* only possible error */
+				goto release;
+			}
+			space -= resid - uio->uio_resid;
+#endif
 			resid = uio->uio_resid;
 		}
 		if (dontroute) {
@@ -1375,6 +1403,209 @@ sockbuf_pushsync(struct sockbuf *sb, str
 }
 
+int
+soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
+    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+	struct mbuf *m, *pull = NULL;
+	struct sockbuf *sb;
+	int drop = 0, error = 0, flags, oresid;
+
+	/* We only do stream sockets. */
+	if (so->so_type != SOCK_STREAM)
+		return (EINVAL);
+	if (psa != NULL)
+		*psa = NULL;
+	if (controlp != NULL)
+		return (EINVAL);
+	if (flagsp != NULL)
+		flags = *flagsp &~ MSG_EOR;
+	else
+		flags = 0;
+	if (flags & MSG_OOB)
+		return (soreceive_rcvoob(so, uio, flags));
+
+	sb = &so->so_rcv;
+
+	SOCKBUF_LOCK(sb);
+	error = sblock(sb, SBLOCKWAIT(flags));
+	if (error)
+		goto out;
+
+	/*
+	 * 1. If no messages are available at the socket, the receive call
+	 *    waits for a message to arrive, unless the socket is
+	 *    non-blocking, in which case it returns EAGAIN.
+	 * 2. The receive calls normally return any data available, up to
+	 *    the requested amount, rather than waiting for receipt of the
+	 *    full amount requested; this behavior is affected by the
+	 *    socket-level options SO_RCVLOWAT and SO_RCVTIMEO.
+	 */
+
+	/* Easy one, no space to copyout anything. */
+	if (uio->uio_resid <= 0) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/* We will never ever get anything until we are connected. */
+	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
+	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+		if (sb->sb_cc > 0)
+			goto deliver;
+		else {
+			error = ENOTCONN;
+			goto out;
+		}
+	}
+
+	/* Socket buffer is empty and we shall not wait and block. */
+	if (sb->sb_cc == 0 &&
+	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
+		error = EAGAIN;
+		goto out;
+	}
+
+restart:
+	/* Abort if socket has reported problems. */
+	if (so->so_error) {
+		if (sb->sb_cc > 0)
+			goto deliver;
+		error = so->so_error;
+		if ((flags & MSG_PEEK) == 0)
+			so->so_error = 0;
+		goto out;
+	}
+
+	/* Door is closed.  Deliver what is left, if any. */
+	if (sb->sb_state & SBS_CANTRCVMORE) {
+		if (sb->sb_cc > 0)
+			goto deliver;
+		else
+			goto out;
+	}
+
+	/* Socket buffer got some data that we shall deliver now. */
+	if (sb->sb_cc > 0 && (flags & MSG_WAITALL) == 0 &&
+	    ((so->so_state & SS_NBIO) ||
+	    (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
+	    sb->sb_cc >= sb->sb_lowat ||
+	    sb->sb_cc >= uio->uio_resid ||
+	    sb->sb_cc >= sb->sb_hiwat)) {
+		goto deliver;
+	}
+
+	/* On MSG_WAITALL we must wait until all data or error arrives. */
+	if ((flags & MSG_WAITALL) &&
+	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
+		goto deliver;
+
+	/*
+	 * Deliver what we've already got or wait and block
+	 * until (more) data comes in.
+	 */
+	error = sbwait(sb);	/* Drops the lock while sleeping. */
+	if (error)
+		goto out;
+	goto restart;
+
+deliver:
+	KASSERT(sb->sb_cc > 0, ("%s: sb_cc < 1", __func__));
+	if (mp0 != NULL) {
+		/* Just provide the mbuf chain. */
+		drop = min(uio->uio_resid, sb->sb_cc);
+
+		if (flags & MSG_PEEK) {
+			*mp0 = m_copym(sb->sb_mb, 0, drop, M_DONTWAIT);
+			if (*mp0 == NULL) {
+				error = ENOMEM;
+				goto out;
+			}
+		} else {
+			/*
+			 * Instead of doing a copy, sbpull pulls the mbufs
+			 * out of the socket buffer.
+			 */
+			*mp0 = sbpull_locked(sb, drop, M_DONTWAIT);
+		}
+		drop = 0;
+	} else {
+		/*
+		 * Fill uio until full or the current end of the socket
+		 * buffer is reached, by pulling mbufs from the socket
+		 * buffer and processing them.  If copyout fails, put
+		 * them back.
+		 */
+#if 0	/* sb unlock+lock per mbuf */
+		for (m = sb->sb_mb;
+		    m != NULL && uio->uio_resid > 0;
+		    m = m->m_next) {
+			oresid = uio->uio_resid;
+
+			SOCKBUF_UNLOCK(sb);
+			error = uiomove(mtod(m, char *), m->m_len, uio);
+			SOCKBUF_LOCK(sb);
+			if (error)
+				goto out;
+
+			drop += oresid - uio->uio_resid;
+		}
+		pull = NULL;
+#else	/* sb unlock+lock per read */
+		oresid = 0;
+		drop = min(uio->uio_resid, sb->sb_cc);
+		if (flags & MSG_PEEK)
+			pull = m_copym(sb->sb_mb, 0, drop, M_DONTWAIT);
+		else
+			pull = sbpull_locked(sb, drop, M_DONTWAIT);
+
+		/* Must unlock socket buffer as uiomove may sleep. */
+		SOCKBUF_UNLOCK(sb);
+		for (m = pull;
+		    m != NULL && uio->uio_resid > 0;
+		    m = m->m_next) {
+			error = uiomove(mtod(m, char *), m->m_len, uio);
+			if (error) {
+				SOCKBUF_LOCK(sb);
+				if (flags & MSG_PEEK)
+					m_freem(pull);
+				else
+					sbprepend_locked(sb, pull);
+				goto out;
+			}
+		}
+		m_freem(pull);
+		drop = 0;
+		SOCKBUF_LOCK(sb);
+#endif
+	}
+	/*
+	 * Don't drop the delivered data if we are only peeking.
+	 * XXX: Unused with sbpull.
+	 */
+	if (drop > 0 && (flags & MSG_PEEK) == 0)
+		sbdrop_locked(sb, drop);
+
+	/* Notify protocol that we drained some data. */
+	if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
+	    !(flags & MSG_SOCALLBCK)) {
+		sbunlock(sb);
+		SOCKBUF_UNLOCK(sb);
+		(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
+		return (error);
+	}
+	/*
+	 * For MSG_WAITALL we may have to loop again and wait for
+	 * more data to come in.
+	 */
+	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
+		goto restart;
+out:
+	sbunlock(sb);
+	SOCKBUF_UNLOCK(sb);
+	return (error);
+}
+
 /*
  * Implement receive operations on a socket.  We depend on the way that
  * records are added to the sockbuf by sbappend.  In particular, each record
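
The numbered semantics in the comment block above are exactly what applications
observe.  A small userland sketch, assuming s is a connected SOCK_STREAM
descriptor (the function name is illustrative):

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <errno.h>
	#include <stdio.h>

	/* Sketch: exercises the two documented receive behaviors. */
	static ssize_t
	read_some(int s, void *buf, size_t len, int waitall)
	{
		ssize_t n;

		/* MSG_WAITALL blocks until len bytes, EOF, or an error. */
		n = recv(s, buf, len, waitall ? MSG_WAITALL : MSG_DONTWAIT);
		if (n == -1 && errno == EAGAIN)
			fprintf(stderr, "empty buffer on non-blocking read\n");
		return (n);
	}
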
Index: kern/uipc_syscalls.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.237
diff -u -p -r1.237 uipc_syscalls.c
--- kern/uipc_syscalls.c	9 Aug 2006 17:43:26 -0000	1.237
+++ kern/uipc_syscalls.c	6 Oct 2006 16:30:13 -0000
@@ -4,6 +4,7 @@
  *
  * sendfile(2) and related extensions:
  * Copyright (c) 1998, David Greenman.  All rights reserved.
+ * Copyright (c) 2006, Internet Business Solutions AG.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -108,7 +109,8 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsu
  * open file flags.
  */
 static int
-getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
+getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp,
+    int dtype)
 {
 	struct file *fp;
 	int error;
@@ -121,9 +123,9 @@ getsock(struct filedesc *fdp, int fd, st
 	fp = fget_locked(fdp, fd);
 	if (fp == NULL)
 		error = EBADF;
-	else if (fp->f_type != DTYPE_SOCKET) {
+	else if (fp->f_type != dtype) {
 		fp = NULL;
-		error = ENOTSOCK;
+		error = (dtype == DTYPE_SOCKET) ? ENOTSOCK : EBADF;
 	} else {
 		fhold(fp);
 		if (fflagp != NULL)
@@ -225,7 +227,7 @@ kern_bind(td, fd, sa)
 	int error;
 
 	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
+	error = getsock(td->td_proc->p_fd, fd, &fp, NULL, DTYPE_SOCKET);
 	if (error)
 		goto done2;
 	so = fp->f_data;
@@ -263,7 +265,7 @@ listen(td, uap)
 	int error;
 
 	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
+	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL, DTYPE_SOCKET);
 	if (error == 0) {
 		so = fp->f_data;
 #ifdef MAC
@@ -361,7 +363,7 @@ kern_accept(struct thread *td, int s, st
 
 	fdp = td->td_proc->p_fd;
 	NET_LOCK_GIANT();
-	error = getsock(fdp, s, &headfp, &fflag);
+	error = getsock(fdp, s, &headfp, &fflag, DTYPE_SOCKET);
 	if (error)
 		goto done2;
 	head = headfp->f_data;
@@ -563,7 +565,7 @@ kern_connect(td, fd, sa)
 	int interrupted = 0;
 
 	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
+	error = getsock(td->td_proc->p_fd, fd, &fp, NULL, DTYPE_SOCKET);
 	if (error)
 		goto done2;
 	so = fp->f_data;
@@ -781,7 +783,7 @@ kern_sendit(td, s, mp, flags, control, s
 #endif
 
 	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
+	error = getsock(td->td_proc->p_fd, s, &fp, NULL, DTYPE_SOCKET);
 	if (error)
 		goto bad2;
 	so = (struct socket *)fp->f_data;
@@ -991,7 +993,7 @@ kern_recvit(td, s, mp, fromseg, controlp
 		*controlp = 0;
 
 	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
+	error = getsock(td->td_proc->p_fd, s, &fp, NULL, DTYPE_SOCKET);
 	if (error) {
 		NET_UNLOCK_GIANT();
 		return (error);
@@ -1326,7 +1328,7 @@ shutdown(td, uap)
 	int error;
 
 	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
+	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL, DTYPE_SOCKET);
 	if (error == 0) {
 		so = fp->f_data;
 		error = soshutdown(so, uap->how);
@@ -1393,7 +1395,7 @@ kern_setsockopt(td, s, level, name, val,
 	}
 
 	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
+	error = getsock(td->td_proc->p_fd, s, &fp, NULL, DTYPE_SOCKET);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sosetopt(so, &sopt);
@@ -1476,7 +1478,7 @@ kern_getsockopt(td, s, level, name, val,
 	}
 
 	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
+	error = getsock(td->td_proc->p_fd, s, &fp, NULL, DTYPE_SOCKET);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sogetopt(so, &sopt);
@@ -1541,7 +1543,7 @@ kern_getsockname(struct thread *td, int
 		return (EINVAL);
 
 	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
+	error = getsock(td->td_proc->p_fd, fd, &fp, NULL, DTYPE_SOCKET);
 	if (error)
 		goto done;
 	so = fp->f_data;
@@ -1645,7 +1647,7 @@ kern_getpeername(struct thread *td, int
 		return (EINVAL);
 
 	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
+	error = getsock(td->td_proc->p_fd, fd, &fp, NULL, DTYPE_SOCKET);
 	if (error)
 		goto done2;
 	so = fp->f_data;
@@ -1881,19 +1883,20 @@ kern_sendfile(struct thread *td, struct
 	struct vnode *vp;
 	struct vm_object *obj = NULL;
 	struct socket *so = NULL;
-	struct mbuf *m, *m_header = NULL;
+	struct mbuf *m = NULL;
 	struct sf_buf *sf;
 	struct vm_page *pg;
-	off_t off, xfsize, hdtr_size, sbytes = 0;
-	int error, headersize = 0, headersent = 0;
+	off_t off, xfsize, hdtr_size = 0, sbytes = 0, rem = 0;
+	int error, headersize = 0, headersent = 0, mnw = 0;
 	int vfslocked;
 
 	NET_LOCK_GIANT();
 
-	hdtr_size = 0;
-
 	/*
-	 * The descriptor must be a regular file and have a backing VM object.
+	 * The file descriptor must be a regular file and have a
+	 * backing VM object.
+	 * The file offset must not be negative.  If it points beyond
+	 * the EOF we send only the header/trailer and no payload data.
 	 */
 	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
 		goto done;
@@ -1921,7 +1924,17 @@ kern_sendfile(struct thread *td, struct
 		error = EINVAL;
 		goto done;
 	}
-	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp, NULL)) != 0)
+	if (uap->offset < 0) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * The socket must be a stream socket and connected.
+	 * Remember whether it is a blocking or non-blocking socket.
+	 */
+	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp,
+	    NULL, DTYPE_SOCKET)) != 0)
 		goto done;
 	so = sock_fp->f_data;
 	if (so->so_type != SOCK_STREAM) {
@@ -1932,10 +1945,8 @@ kern_sendfile(struct thread *td, struct
 		error = ENOTCONN;
 		goto done;
 	}
-	if (uap->offset < 0) {
-		error = EINVAL;
-		goto done;
-	}
+	if (uap->flags & SF_MNOWAIT)
+		mnw = 1;
 
 #ifdef MAC
 	SOCK_LOCK(so);
@@ -1945,290 +1956,315 @@ kern_sendfile(struct thread *td, struct
 		goto done;
 #endif
 
-	/*
-	 * If specified, get the pointer to the sf_hdtr struct for
-	 * any headers/trailers.
-	 */
+	/* If headers are specified copy them into mbufs. */
 	if (hdr_uio != NULL) {
 		hdr_uio->uio_td = td;
 		hdr_uio->uio_rw = UIO_WRITE;
 		if (hdr_uio->uio_resid > 0) {
-			m_header = m_uiotombuf(hdr_uio, M_DONTWAIT, 0, 0);
-			if (m_header == NULL)
+			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
+			    0, 0, 0);
+			if (m == NULL) {
+				error = mnw ? EAGAIN : ENOBUFS;
 				goto done;
-			headersize = m_header->m_pkthdr.len;
+			}
+			/* XXX: This should not be a header mbuf. */
+			/* m_demote(m, 0); */
+			headersize = m_length(m, NULL);
 			if (compat)
 				sbytes += headersize;
 		}
 	}
 
-	/*
-	 * Protect against multiple writers to the socket.
-	 */
+	/* Protect against multiple writers to the socket. */
 	SOCKBUF_LOCK(&so->so_snd);
 	(void) sblock(&so->so_snd, M_WAITOK);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	/*
-	 * Loop through the pages in the file, starting with the requested
+	 * Loop through the pages of the file, starting with the requested
 	 * offset.  Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
+	 * This is done in two loops.  The inner loop turns as many pages
+	 * as it can, up to the available socket buffer space, into mbufs
+	 * without blocking, so they can be bulk-delivered into the socket
+	 * send buffer.  The outer loop checks the state and available
+	 * space of the socket and takes care of the overall progress.
 	 */
-	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
-		vm_pindex_t pindex;
-		vm_offset_t pgoff;
+	for (off = uap->offset; ; ) {
+		int loopbytes = 0;
+		int space = 0;
+		int done = 0;
 
-		pindex = OFF_TO_IDX(off);
-		VM_OBJECT_LOCK(obj);
-retry_lookup:
 		/*
-		 * Calculate the amount to transfer. Not to exceed a page,
-		 * the EOF, or the passed in nbytes.
-		 */
-		xfsize = obj->un_pager.vnp.vnp_size - off;
-		VM_OBJECT_UNLOCK(obj);
-		if (xfsize > PAGE_SIZE)
-			xfsize = PAGE_SIZE;
-		pgoff = (vm_offset_t)(off & PAGE_MASK);
-		if (PAGE_SIZE - pgoff < xfsize)
-			xfsize = PAGE_SIZE - pgoff;
-		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
-			xfsize = uap->nbytes - sbytes;
-		if (xfsize <= 0) {
-			if (m_header != NULL) {
-				m = m_header;
-				m_header = NULL;
-				SOCKBUF_LOCK(&so->so_snd);
-				goto retry_space;
-			} else
-				break;
-		}
-		/*
-		 * Optimize the non-blocking case by looking at the socket space
-		 * before going to the extra work of constituting the sf_buf.
+		 * Check the socket state for ongoing connection,
+		 * no errors and space in socket buffer.
+		 * If space is low allow for the remainder of the
+		 * file to be processed if it fits the socket buffer.
+		 * Otherwise block in waiting for sufficient space
+		 * to proceed, or if the socket is nonblocking, return
+		 * to userland with EAGAIN while reporting how far
+		 * we've come.
+		 * We wait until the socket buffer has significant free
+		 * space to do bulk sends.  This makes good use of file
+		 * system read ahead and allows packet segmentation
+		 * offloading hardware to take over lots of work.  If
+		 * we were not careful here we would send off only one
+		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
-		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
-			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
-				error = EPIPE;
-			else
-				error = EAGAIN;
-			sbunlock(&so->so_snd);
+		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+retry_space:
+		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+			error = EPIPE;
+			SOCKBUF_UNLOCK(&so->so_snd);
+			goto done;
+		} else if (so->so_error) {
+			error = so->so_error;
+			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
-		SOCKBUF_UNLOCK(&so->so_snd);
-		VM_OBJECT_LOCK(obj);
-		/*
-		 * Attempt to look up the page.
-		 *
-		 * Allocate if not found
-		 *
-		 * Wait and loop if busy.
-		 */
-		pg = vm_page_lookup(obj, pindex);
-
-		if (pg == NULL) {
-			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NOBUSY |
-			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
-			if (pg == NULL) {
-				VM_OBJECT_UNLOCK(obj);
-				VM_WAIT;
-				VM_OBJECT_LOCK(obj);
-				goto retry_lookup;
+		space = sbspace(&so->so_snd);
+		if (space < rem &&
+		    (space <= 0 ||
+#if 0
+		    space < so->so_snd.sb_lowat ||
+		    space < PAGE_SIZE ||
+		    space < (so->so_snd.sb_hiwat / 2))) {
+#else
+		    space < so->so_snd.sb_lowat)) {
+#endif
+			if (so->so_state & SS_NBIO) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				error = EAGAIN;
+				goto done;
 			}
-		} else if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
-			goto retry_lookup;
-		else {
 			/*
-			 * Wire the page so it does not get ripped out from
-			 * under us.
+			 * sbwait drops the lock while sleeping.
+			 * When we loop back to retry_space the
+			 * state may have changed and we retest
+			 * for it.
 			 */
-			vm_page_lock_queues();
-			vm_page_wire(pg);
-			vm_page_unlock_queues();
+			error = sbwait(&so->so_snd);
+			/*
+			 * An error from sbwait usually indicates that we've
+			 * been interrupted by a signal.  If we've sent
+			 * anything then return bytes sent, otherwise return
+			 * the error.
+			 */
+			if (error) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				goto done;
+			}
+			goto retry_space;
 		}
+		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
-		 * If page is not valid for what we need, initiate I/O
+		 * Loop and construct maximum sized mbuf chain to be bulk
+		 * dumped into socket buffer.
 		 */
+		while (space > loopbytes) {
+			vm_pindex_t pindex;
+			vm_offset_t pgoff;
+			struct mbuf *m0;
 
-		if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) {
-			VM_OBJECT_UNLOCK(obj);
-		} else if (uap->flags & SF_NODISKIO) {
-			error = EBUSY;
-		} else {
-			int bsize, resid;
-
+			VM_OBJECT_LOCK(obj);
 			/*
-			 * Ensure that our page is still around when the I/O
-			 * completes.
+			 * Calculate the amount to transfer.
+			 * Not to exceed a page, the EOF,
+			 * or the passed in nbytes.
 			 */
+			pgoff = (vm_offset_t)(off & PAGE_MASK);
+			xfsize = omin(PAGE_SIZE - pgoff,
+			    obj->un_pager.vnp.vnp_size - off -
+			    sbytes - loopbytes);
+			if (uap->nbytes)
+				rem = (uap->nbytes - sbytes - loopbytes);
+			else
+				rem = obj->un_pager.vnp.vnp_size - off -
+				    sbytes - loopbytes;
+			xfsize = omin(rem, xfsize);
+			if (xfsize <= 0) {
+				VM_OBJECT_UNLOCK(obj);
+				done = 1;	/* all data sent */
+				break;
+			}
 			/*
-			 * Get the page from backing store.
+			 * Don't overflow the send buffer.
+			 * Stop here and send out what we've
+			 * already got.
 			 */
-			bsize = vp->v_mount->mnt_stat.f_iosize;
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-			vn_lock(vp, LK_SHARED | LK_RETRY, td);
+			if (space < loopbytes + xfsize) {
+				VM_OBJECT_UNLOCK(obj);
+				break;
+			}
+retry_lookup:
 			/*
-			 * XXXMAC: Because we don't have fp->f_cred here,
-			 * we pass in NOCRED.  This is probably wrong, but
-			 * is consistent with our original implementation.
+			 * Attempt to look up the page.
+			 * Allocate if not found or
+			 * wait and loop if busy.
 			 */
-			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
-			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
-			    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
-			    td->td_ucred, NOCRED, &resid, td);
-			VOP_UNLOCK(vp, 0, td);
-			VFS_UNLOCK_GIANT(vfslocked);
-			VM_OBJECT_LOCK(obj);
-			vm_page_io_finish(pg);
-			if (!error)
+			pindex = OFF_TO_IDX(off);
+			pg = vm_page_lookup(obj, pindex);
+			if (pg == NULL) {
+				pg = vm_page_alloc(obj, pindex,
+				    VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL |
+				    VM_ALLOC_WIRED);
+				if (pg == NULL) {
+					VM_OBJECT_UNLOCK(obj);
+					VM_WAIT;
+					VM_OBJECT_LOCK(obj);
+					goto retry_lookup;
+				}
+			} else if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
+				goto retry_lookup;
+			else {
+				/*
+				 * Wire the page so it does not get
+				 * ripped out from under us.
+				 */
+				vm_page_lock_queues();
+				vm_page_wire(pg);
+				vm_page_unlock_queues();
+			}
+
+			/*
+			 * Check if page is valid for what we need,
+			 * otherwise initiate I/O.
+			 * If we already turned some pages into mbufs,
+			 * send them off before we come here again and
+			 * block.
+			 */
+			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
 				VM_OBJECT_UNLOCK(obj);
-			mbstat.sf_iocnt++;
-		}
-
-		if (error) {
-			vm_page_lock_queues();
-			vm_page_unwire(pg, 0);
+			else if (m != NULL)
+				error = EAGAIN;	/* send what we already got */
+			else if (uap->flags & SF_NODISKIO)
+				error = EBUSY;
+			else {
+				int bsize, resid;
+
+				/*
+				 * Ensure that our page is still around
+				 * when the I/O completes.
+				 */
+				vm_page_io_start(pg);
+				VM_OBJECT_UNLOCK(obj);
+
+				/*
+				 * Get the page from backing store.
+				 */
+				bsize = vp->v_mount->mnt_stat.f_iosize;
+				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+				vn_lock(vp, LK_SHARED | LK_RETRY, td);
+
+				/*
+				 * XXXMAC: Because we don't have fp->f_cred
+				 * here, we pass in NOCRED.  This is probably
+				 * wrong, but is consistent with our original
+				 * implementation.
+				 */
+				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
+				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
+				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
+				    td->td_ucred, NOCRED, &resid, td);
+				VOP_UNLOCK(vp, 0, td);
+				VFS_UNLOCK_GIANT(vfslocked);
+				VM_OBJECT_LOCK(obj);
+				vm_page_io_finish(pg);
+				if (!error)
+					VM_OBJECT_UNLOCK(obj);
+				mbstat.sf_iocnt++;
+			}
+			if (error) {
+				vm_page_lock_queues();
+				vm_page_unwire(pg, 0);
+				/*
+				 * See if anyone else might know about
+				 * this page.  If not and it is not valid,
+				 * then free it.
+ */ + if (pg->wire_count == 0 && pg->valid == 0 && + pg->busy == 0 && !(pg->flags & PG_BUSY) && + pg->hold_count == 0) { + vm_page_free(pg); + } + vm_page_unlock_queues(); + VM_OBJECT_UNLOCK(obj); + if (error == EAGAIN) + error = 0; /* not a real error */ + break; + } + /* - * See if anyone else might know about this page. - * If not and it is not valid, then free it. + * Get a sendfile buf. We usually wait as long + * as necessary, but this wait can be interrupted. */ - if (pg->wire_count == 0 && pg->valid == 0 && - pg->busy == 0 && !(pg->flags & PG_BUSY) && - pg->hold_count == 0) { - vm_page_free(pg); + if ((sf = sf_buf_alloc(pg, + (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) { + mbstat.sf_allocfail++; + vm_page_lock_queues(); + vm_page_unwire(pg, 0); + /* + * XXX: Not same check as above!? + */ + if (pg->wire_count == 0 && pg->object == NULL) + vm_page_free(pg); + vm_page_unlock_queues(); + error = (mnw ? EAGAIN : EINTR); + break; } - vm_page_unlock_queues(); - VM_OBJECT_UNLOCK(obj); - SOCKBUF_LOCK(&so->so_snd); - sbunlock(&so->so_snd); - SOCKBUF_UNLOCK(&so->so_snd); - goto done; - } - /* - * Get a sendfile buf. We usually wait as long as necessary, - * but this wait can be interrupted. - */ - if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) { - mbstat.sf_allocfail++; - vm_page_lock_queues(); - vm_page_unwire(pg, 0); - if (pg->wire_count == 0 && pg->object == NULL) - vm_page_free(pg); - vm_page_unlock_queues(); - SOCKBUF_LOCK(&so->so_snd); - sbunlock(&so->so_snd); - SOCKBUF_UNLOCK(&so->so_snd); - error = EINTR; - goto done; - } + /* + * Get an mbuf and set it up as having + * external storage. + */ + m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); + if (m0 == NULL) { + error = (mnw ? EAGAIN : ENOBUFS); + sf_buf_mext((void *)sf_buf_kva(sf), sf); + break; + } + MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, + sf, M_RDONLY, EXT_SFBUF); + m0->m_data = (char *)sf_buf_kva(sf) + pgoff; + m0->m_len = xfsize; + + /* Append to mbuf chain. */ + if (m != NULL) + m_cat(m, m0); + else + m = m0; - /* - * Get an mbuf header and set it up as having external storage. - */ - if (m_header) - MGET(m, M_TRYWAIT, MT_DATA); - else - MGETHDR(m, M_TRYWAIT, MT_DATA); - if (m == NULL) { - error = ENOBUFS; - sf_buf_mext((void *)sf_buf_kva(sf), sf); - SOCKBUF_LOCK(&so->so_snd); - sbunlock(&so->so_snd); - SOCKBUF_UNLOCK(&so->so_snd); - goto done; - } - /* - * Setup external storage for mbuf. - */ - MEXTADD(m, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, sf, M_RDONLY, - EXT_SFBUF); - m->m_data = (char *)sf_buf_kva(sf) + pgoff; - m->m_pkthdr.len = m->m_len = xfsize; - - if (m_header) { - m_cat(m_header, m); - m = m_header; - m_header = NULL; - m_fixhdr(m); + /* Keep track of bits processed. */ + loopbytes += xfsize; + off += xfsize; } - /* - * Add the buffer to the socket buffer chain. - */ - SOCKBUF_LOCK(&so->so_snd); -retry_space: - /* - * Make sure that the socket is still able to take more data. - * CANTSENDMORE being true usually means that the connection - * was closed. so_error is true when an error was sensed after - * a previous send. - * The state is checked after the page mapping and buffer - * allocation above since those operations may block and make - * any socket checks stale. From this point forward, nothing - * blocks before the pru_send (or more accurately, any blocking - * results in a loop back to here to re-check). - */ - SOCKBUF_LOCK_ASSERT(&so->so_snd); - if ((so->so_snd.sb_state & SBS_CANTSENDMORE) || so->so_error) { + /* Add the buffer chain to the socket buffer. 
+		if (m != NULL) {
+			SOCKBUF_LOCK(&so->so_snd);
 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 				error = EPIPE;
-			} else {
-				error = so->so_error;
-				so->so_error = 0;
-			}
-			m_freem(m);
-			sbunlock(&so->so_snd);
-			SOCKBUF_UNLOCK(&so->so_snd);
-			goto done;
-		}
-		/*
-		 * Wait for socket space to become available.  We do this just
-		 * after checking the connection state above in order to avoid
-		 * a race condition with sbwait().
-		 */
-		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
-			if (so->so_state & SS_NBIO) {
-				m_freem(m);
-				sbunlock(&so->so_snd);
 				SOCKBUF_UNLOCK(&so->so_snd);
-				error = EAGAIN;
 				goto done;
 			}
-			error = sbwait(&so->so_snd);
-			/*
-			 * An error from sbwait usually indicates that we've
-			 * been interrupted by a signal.  If we've sent anything
-			 * then return bytes sent, otherwise return the error.
-			 */
-			if (error) {
-				m_freem(m);
-				sbunlock(&so->so_snd);
-				SOCKBUF_UNLOCK(&so->so_snd);
-				goto done;
+			SOCKBUF_UNLOCK(&so->so_snd);
+			error = (*so->so_proto->pr_usrreqs->pru_send)
+			    (so, 0, m, NULL, NULL, td);
+			if (!error) {
+				sbytes += loopbytes;
+				headersent = 1;
 			}
-			goto retry_space;
+			m = NULL;	/* pru_send always consumes */
 		}
-		SOCKBUF_UNLOCK(&so->so_snd);
-		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
-		if (error) {
-			SOCKBUF_LOCK(&so->so_snd);
-			sbunlock(&so->so_snd);
-			SOCKBUF_UNLOCK(&so->so_snd);
+
+		/* Quit outer loop on error or when we're done. */
+		if (error || done)
 			goto done;
-		}
-		headersent = 1;
 	}
-	SOCKBUF_LOCK(&so->so_snd);
-	sbunlock(&so->so_snd);
-	SOCKBUF_UNLOCK(&so->so_snd);
 
 	/*
 	 * Send trailers.  Wimp out and use writev(2).
@@ -2244,6 +2280,10 @@ retry_space:
 	}
 
 done:
+	SOCKBUF_LOCK(&so->so_snd);
+	sbunlock(&so->so_snd);
+	SOCKBUF_UNLOCK(&so->so_snd);
+
 	if (headersent) {
 		if (!compat)
 			hdtr_size += headersize;
@@ -2251,6 +2291,7 @@ done:
 		if (compat)
 			sbytes -= headersize;
 	}
+
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
@@ -2272,8 +2313,8 @@ done:
 	}
 	if (so)
 		fdrop(sock_fp, td);
-	if (m_header)
-		m_freem(m_header);
+	if (m)
+		m_freem(m);
 
 	NET_UNLOCK_GIANT();
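
For completeness, a userland sketch of how the new SF_MNOWAIT flag would pair with
a non-blocking socket.  With the flag set, mbuf and sf_buf allocation in the kernel
fails with EAGAIN instead of sleeping, and the sbytes argument still reports the
partial progress; push_chunk() is an illustrative name:

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <sys/uio.h>
	#include <errno.h>

	/* Sketch: push one chunk of a file without sleeping in the kernel. */
	static int
	push_chunk(int fd, int s, off_t off, size_t len, off_t *sent)
	{
		if (sendfile(fd, s, off, len, NULL, sent, SF_MNOWAIT) == 0)
			return (0);
		if (errno == EAGAIN)
			return (1);	/* *sent went out; poll(2), then retry */
		return (-1);
	}
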
Index: net/if_tap.c
===================================================================
RCS file: /home/ncvs/src/sys/net/if_tap.c,v
retrieving revision 1.63
diff -u -p -r1.63 if_tap.c
--- net/if_tap.c	27 Sep 2006 19:57:01 -0000	1.63
+++ net/if_tap.c	6 Oct 2006 16:30:22 -0000
@@ -827,7 +827,8 @@ tapwrite(struct cdev *dev, struct uio *u
 		return (EIO);
 	}
 
-	if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, ETHER_ALIGN)) == NULL) {
+	if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, ETHER_ALIGN,
+	    M_PKTHDR)) == NULL) {
 		ifp->if_ierrors ++;
 		return (error);
 	}
Index: net/if_tun.c
===================================================================
RCS file: /home/ncvs/src/sys/net/if_tun.c,v
retrieving revision 1.158
diff -u -p -r1.158 if_tun.c
--- net/if_tun.c	8 Aug 2006 19:22:25 -0000	1.158
+++ net/if_tun.c	6 Oct 2006 16:30:22 -0000
@@ -789,7 +789,7 @@ tunwrite(struct cdev *dev, struct uio *u
 		return (EIO);
 	}
 
-	if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0)) == NULL) {
+	if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0, M_PKTHDR)) == NULL) {
 		ifp->if_ierrors++;
 		return (error);
 	}
Index: net/ppp_tty.c
===================================================================
RCS file: /home/ncvs/src/sys/net/ppp_tty.c,v
retrieving revision 1.69
diff -u -p -r1.69 ppp_tty.c
--- net/ppp_tty.c	16 Oct 2005 20:44:18 -0000	1.69
+++ net/ppp_tty.c	6 Oct 2006 16:30:22 -0000
@@ -384,7 +384,7 @@ pppwrite(tp, uio, flag)
 	return (EMSGSIZE);
     s = spltty();
-    if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0)) == NULL) {
+    if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0, M_PKTHDR)) == NULL) {
 	splx(s);
 	return (ENOBUFS);
     }
Index: netgraph/ng_device.c
===================================================================
RCS file: /home/ncvs/src/sys/netgraph/ng_device.c,v
retrieving revision 1.21
diff -u -p -r1.21 ng_device.c
--- netgraph/ng_device.c	4 May 2005 18:55:02 -0000	1.21
+++ netgraph/ng_device.c	6 Oct 2006 16:30:22 -0000
@@ -466,7 +466,7 @@ ngdwrite(struct cdev *dev, struct uio *u
 	if (uio->uio_resid < 0 || uio->uio_resid > IP_MAXPACKET)
 		return (EIO);
 
-	if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0)) == NULL)
+	if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0, M_PKTHDR)) == NULL)
 		return (ENOBUFS);
 
 	NG_SEND_DATA_ONLY(error, priv->hook, m);
Index: netinet/tcp_usrreq.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.141
diff -u -p -r1.141 tcp_usrreq.c
--- netinet/tcp_usrreq.c	17 Sep 2006 13:39:35 -0000	1.141
+++ netinet/tcp_usrreq.c	6 Oct 2006 16:30:23 -0000
@@ -1058,6 +1058,7 @@ struct pr_usrreqs tcp_usrreqs = {
 	.pru_send =		tcp_usr_send,
 	.pru_shutdown =		tcp_usr_shutdown,
 	.pru_sockaddr =		tcp_sockaddr,
+	.pru_soreceive =	soreceive_stream,
 	.pru_sosetlabel =	in_pcbsosetlabel,
 	.pru_close =		tcp_usr_close,
 };
Index: sys/libkern.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/libkern.h,v
retrieving revision 1.54
diff -u -p -r1.54 libkern.h
--- sys/libkern.h	12 Aug 2006 15:28:39 -0000	1.54
+++ sys/libkern.h	6 Oct 2006 16:30:27 -0000
@@ -58,6 +58,8 @@ static __inline quad_t qmax(quad_t a, qu
 static __inline quad_t qmin(quad_t a, quad_t b) { return (a < b ? a : b); }
 static __inline u_long ulmax(u_long a, u_long b) { return (a > b ? a : b); }
 static __inline u_long ulmin(u_long a, u_long b) { return (a < b ? a : b); }
+static __inline off_t omax(off_t a, off_t b) { return (a > b ? a : b); }
+static __inline off_t omin(off_t a, off_t b) { return (a < b ? a : b); }
 
 static __inline int abs(int a) { return (a < 0 ? -a : a); }
 static __inline long labs(long a) { return (a < 0 ? -a : a); }
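
The off_t variants exist because libkern's min()/max() operate on u_int, so passing
a large file offset through them would silently truncate before the comparison.
A short sketch with illustrative values:

	off_t resid = (off_t)5 << 30;	/* 5 GB of file left to send */
	u_int space = 65536;		/* free socket buffer space */

	/* min() would truncate resid to u_int; omin() keeps all 64 bits. */
	off_t xfer = omin(resid, (off_t)space);		/* 65536, as intended */
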
Index: sys/mbuf.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/mbuf.h,v
retrieving revision 1.197
diff -u -p -r1.197 mbuf.h
--- sys/mbuf.h	22 Sep 2006 19:50:04 -0000	1.197
+++ sys/mbuf.h	6 Oct 2006 16:30:27 -0000
@@ -393,6 +393,7 @@ m_getcl(int how, short type, int flags)
 /*
  * m_getjcl() returns an mbuf with a cluster of the specified size attached.
  * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
+ * XXX: Too big for inlining!
  */
 static __inline	/* XXX: This is rather large, should be real function maybe. */
 struct mbuf *
@@ -465,6 +466,7 @@ m_clget(struct mbuf *m, int how)
  * specified, it gets the cluster attached to it and the return value
  * can be safely ignored.
  * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
+ * XXX: Too big for inlining!
  */
 static __inline
 void *
@@ -517,6 +519,8 @@ m_chtype(struct mbuf *m, short new_type)
 #define	MCLGET(m, how)		m_clget((m), (how))
 #define	MEXTADD(m, buf, size, free, args, flags, type)			\
     m_extadd((m), (caddr_t)(buf), (size), (free), (args), (flags), (type))
+#define	m_getm(m, len, how, type)					\
+    m_getm2((m), (len), (how), (type), M_PKTHDR)
 
 /*
  * Evaluate TRUE if it's safe to write to the mbuf m's data region (this
@@ -657,7 +661,7 @@ int		 m_dup_pkthdr(struct mbuf *, struct
 u_int		 m_fixhdr(struct mbuf *);
 struct mbuf	*m_fragment(struct mbuf *, int, int);
 void		 m_freem(struct mbuf *);
-struct mbuf	*m_getm(struct mbuf *, int, int, short);
+struct mbuf	*m_getm2(struct mbuf *, int, int, short, int);
 struct mbuf	*m_getptr(struct mbuf *, int, int *);
 u_int		 m_length(struct mbuf *, struct mbuf **);
 void		 m_move_pkthdr(struct mbuf *, struct mbuf *);
@@ -667,7 +671,7 @@ struct mbuf	*m_pulldown(struct mbuf *, i
 struct mbuf	*m_pullup(struct mbuf *, int);
 int		 m_sanity(struct mbuf *, int);
 struct mbuf	*m_split(struct mbuf *, int, int);
-struct mbuf	*m_uiotombuf(struct uio *, int, int, int);
+struct mbuf	*m_uiotombuf(struct uio *, int, int, int, int);
 struct mbuf	*m_unshare(struct mbuf *, int how);
Index: sys/socket.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/socket.h,v
retrieving revision 1.90
diff -u -p -r1.90 socket.h
--- sys/socket.h	26 Jul 2006 03:15:15 -0000	1.90
+++ sys/socket.h	6 Oct 2006 16:30:27 -0000
@@ -546,7 +546,8 @@ struct sf_hdtr {
 /*
  * Sendfile-specific flag(s)
  */
-#define	SF_NODISKIO	0x00000001
+#define	SF_NODISKIO	0x00000001
+#define	SF_MNOWAIT	0x00000002
 #endif
 
 #ifndef _KERNEL
Index: sys/socketvar.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/socketvar.h,v
retrieving revision 1.154
diff -u -p -r1.154 socketvar.h
--- sys/socketvar.h	1 Aug 2006 10:30:26 -0000	1.154
+++ sys/socketvar.h	6 Oct 2006 16:30:27 -0000
@@ -484,6 +496,9 @@ void	sbdroprecord(struct sockbuf *sb);
 void	sbdroprecord_locked(struct sockbuf *sb);
 void	sbflush(struct sockbuf *sb);
 void	sbflush_locked(struct sockbuf *sb);
+void	sbprepend_locked(struct sockbuf *sb, struct mbuf *m);
+struct	mbuf *
+	sbpull_locked(struct sockbuf *sb, int len, int how);
 void	sbrelease(struct sockbuf *sb, struct socket *so);
 void	sbrelease_locked(struct sockbuf *sb, struct socket *so);
 int	sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
@@ -538,6 +553,9 @@ int	soreceive(struct socket *so, struct 
 int	soreceive_generic(struct socket *so, struct sockaddr **paddr,
 	    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
 	    int *flagsp);
+int	soreceive_stream(struct socket *so, struct sockaddr **paddr,
+	    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
+	    int *flagsp);
 int	soreserve(struct socket *so, u_long sndcc, u_long rcvcc);
 void	sorflush(struct socket *so);
 int	sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
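
Finally, the tcp_usrreq.c hunk above shows the only wiring a protocol needs in
order to opt in to the optimized receive path: filling in the pru_soreceive method.
A hypothetical stream protocol would do the same; "foo" and foo_usr_send are made-up
names:

	/* Sketch: opting a stream protocol into soreceive_stream(). */
	struct pr_usrreqs foo_usrreqs = {
		.pru_send =		foo_usr_send,
		.pru_soreceive =	soreceive_stream, /* not soreceive_generic */
		/* ... remaining methods as before ... */
	};
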