Index: contrib/libpcap/pcap-bpf.c
===================================================================
RCS file: /home/ncvs/src/contrib/libpcap/pcap-bpf.c,v
retrieving revision 1.3.2.1
diff -u -r1.3.2.1 pcap-bpf.c
--- contrib/libpcap/pcap-bpf.c	19 Oct 2007 03:03:56 -0000	1.3.2.1
+++ contrib/libpcap/pcap-bpf.c	5 Nov 2007 18:40:34 -0000
@@ -30,6 +30,8 @@
 #endif
 #include <sys/param.h>			/* optionally get BSD define */
+#include <sys/mman.h>
+#include <sys/select.h>
 #include <sys/time.h>
 #include <sys/timeb.h>
 #include <sys/socket.h>
@@ -139,6 +141,118 @@
 	return (0);
 }
 
+#ifdef BIOCGETBUFMODE
+/*
+ * Selection routine for zero-copy BPF: identify the next completed buffer,
+ * if any.  Try shared memory first, and if that doesn't work, make a system
+ * call, which may dislodge a buffer.
+ *
+ * Return (1) if a buffer is found, (0) if a retry is required, and (-1) if
+ * there is an unrecoverable error.
+ *
+ * XXXRW: Check to make sure the version comparison we're doing here is
+ * really the right thing -- maybe use serial number arithmetic?
+ */
+static int
+pcap_next_zbuf(pcap_t *p, u_int *cc)
+{
+	struct bpf_zbuf_header *bzh;
+	struct bpf_zbuf bz;
+	struct timeval tv;
+	fd_set r_set;
+	int r;
+
+	FD_ZERO(&r_set);
+	FD_SET(p->fd, &r_set);
+	p->bzh = NULL;
+	p->buffer = NULL;
+	if (p->to_ms != 0) {
+		tv.tv_sec = p->to_ms / 1000;
+		tv.tv_usec = (p->to_ms * 1000) % 1000000;
+	}
+	r = select(p->fd + 1, &r_set, NULL, NULL,
+	    p->to_ms != 0 ? &tv : NULL);
+	if (r < 0 && errno == EINTR)
+		return (0);
+	else if (r < 0) {
+		(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+		    "select: %s", strerror(errno));
+		return (-1);
+	}
+	/*
+	 * Handle timeouts here.
+	 */
+	if (r == 0) {
+		if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+			(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+			    "BIOCROTZBUF: %s", strerror(errno));
+			return (-1);
+		}
+		/*
+		 * select(2) woke us up due to a timeout, and there was no
+		 * data to be processed in the store buffer.  Tell pcap to
+		 * wait again.
+		 */
+		if (bz.bz_bufa == NULL)
+			return (0);
+	}
+	/* XXXCSJP should we check FD_ISSET()? */
+	/*
+	 * If we have made it this far, chances are select(2) returned
+	 * because there is data ready to be processed in the hold buffer.
+	 * Compare the user generation numbers against the kernel's.  If
+	 * there are any differences, process the packet data.
+	 */
+	bzh = (struct bpf_zbuf_header *)p->zbuf1;
+	if (bzh->bzh_kernel_gen > bzh->bzh_user_gen) {
+		p->bzh = bzh;
+		p->buffer = (u_char *)p->zbuf1;
+		p->buffer += sizeof(*bzh);
+		*cc = bzh->bzh_kernel_len;
+		return (1);
+	}
+	bzh = (struct bpf_zbuf_header *)p->zbuf2;
+	if (bzh->bzh_kernel_gen > bzh->bzh_user_gen) {
+		p->bzh = bzh;
+		p->buffer = (u_char *)p->zbuf2;
+		p->buffer += sizeof(*bzh);
+		*cc = bzh->bzh_kernel_len;
+		return (1);
+	}
+	/*
+	 * If the generation numbers were the same for both buffers, then it
+	 * is possible that we woke up because of BIOCIMMEDIATE.  In that
+	 * case, manually rotate the buffers.
+	 */
+	if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+		(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+		    "BIOCROTZBUF: %s", strerror(errno));
+		return (-1);
+	}
+	/*
+	 * It's possible that we were unable to rotate the buffer because the
+	 * user generation numbers have not been modified, in which case
+	 * retry.
+	 */
+	if (bz.bz_bufa == NULL)
+		return (0);
+	p->bzh = (struct bpf_zbuf_header *)bz.bz_bufa;
+	p->buffer = (u_char *)bz.bz_bufa;
+	p->buffer += sizeof(*bzh);
+	*cc = bz.bz_buflen;
+	return (1);
+}
+
+static int
+pcap_ack_zbuf(pcap_t *p)
+{
+
+	p->bzh->bzh_user_gen++;
+	p->bzh = NULL;
+	p->buffer = NULL;
+	return (0);
+}
+#endif
+
 static int
 pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
 {
@@ -147,6 +261,9 @@
 	register u_char *bp, *ep;
 	u_char *datap;
 	struct bpf_insn *fcode;
+#ifdef BIOCSETBUFMODE
+	int i;
+#endif
 #ifdef PCAP_FDDIPAD
 	register int pad;
 #endif
@@ -167,7 +284,19 @@
 	}
 	cc = p->cc;
 	if (p->cc == 0) {
-		cc = read(p->fd, (char *)p->buffer, p->bufsize);
+#ifdef BIOCSETBUFMODE
+		if (p->zbuf1 != NULL) {
+			if (p->buffer != NULL)
+				pcap_ack_zbuf(p);
+			i = pcap_next_zbuf(p, &cc);
+			if (i == 0)
+				goto again;
+			if (i < 0)
+				return (-1);
+		} else
+#endif
+		cc = read(p->fd, (char *)p->buffer, p->bufsize);
+
 		if (cc < 0) {
 			/* Don't choke when we get ptraced */
 			switch (errno) {
@@ -609,6 +738,10 @@
 	struct bpf_insn total_insn;
 	struct bpf_program total_prog;
 	struct utsname osinfo;
+#ifdef BIOCSETBUFMODE
+	struct bpf_zbuf bz;
+	u_int bufmode, zbufmax;
+#endif
 
 #ifdef HAVE_DAG_API
 	if (strstr(device, "dag")) {
@@ -647,6 +780,73 @@
 	}
 
 	/*
+	 * XXXRW: Depending on the availability of zero-copy BPF, we take one
+	 * of two strategies here: if it is available and usable, we go ahead
+	 * and set it up; otherwise we play the song-and-dance to try to
+	 * probe an acceptable read buffer size.  Zero-copy BPF requires that
+	 * buffers be mapped into memory before selecting the interface to
+	 * attach to, so we do that here also.
+	 */
+#ifdef BIOCSETBUFMODE
+	if (getenv("BPF_ZERO_COPY")) {
+		bufmode = BPF_BUFMODE_ZBUF;
+		if (ioctl(fd, BIOCSETBUFMODE, (caddr_t)&bufmode) < 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETBUFMODE: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
+
+		if (ioctl(fd, BIOCGETZMAX, (caddr_t)&zbufmax) < 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCGETZMAX: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
+
+		/*
+		 * XXXRW: This logic should be revisited.
+		 */
+		p->zbufsize = 32768;
+		if (p->zbufsize % getpagesize() != 0)
+			p->zbufsize = getpagesize();
+		if (p->zbufsize > zbufmax)
+			p->zbufsize = zbufmax;
+
+		p->zbuf1 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE,
+		    MAP_ANON, -1, 0);
+		p->zbuf2 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE,
+		    MAP_ANON, -1, 0);
+		if (p->zbuf1 == MAP_FAILED || p->zbuf2 == MAP_FAILED) {
+			if (p->zbuf1 != MAP_FAILED)
+				munmap(p->zbuf1, p->zbufsize);
+			if (p->zbuf2 != MAP_FAILED)
+				munmap(p->zbuf2, p->zbufsize);
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "mmap: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
+
+		bzero(&bz, sizeof(bz));
+		bz.bz_bufa = p->zbuf1;
+		bz.bz_bufb = p->zbuf2;
+		bz.bz_buflen = p->zbufsize;
+
+		if (ioctl(fd, BIOCSETZBUF, (caddr_t)&bz) < 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETZBUF: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
+
+		(void)strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name));
+		if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) < 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETIF: %s: %s",
+			    device, pcap_strerror(errno));
+			goto bad;
+		}
+
+		v = p->zbufsize - sizeof(struct bpf_zbuf_header);
+	} else {
+#endif
+
 	/*
 	 * Try finding a good size for the buffer; 32768 may be too
 	 * big, so keep cutting it in half until we find a size
 	 * that works, or run out of sizes to try.  If the default
@@ -681,6 +881,9 @@
 			    "BIOCSBLEN: %s: No buffer size worked", device);
 			goto bad;
 		}
+#ifdef BIOCSETBUFMODE
+	}
+#endif
 
 	/* Get the data link layer type. */
 	if (ioctl(fd, BIOCGDLT, (caddr_t)&v) < 0) {
@@ -855,7 +1058,8 @@
 	}
 #endif
 	/* set timeout */
-	if (to_ms != 0) {
+	p->to_ms = to_ms;
+	if (to_ms != 0 && getenv("BPF_ZERO_COPY") == NULL) {
 		/*
 		 * XXX - is this seconds/nanoseconds in AIX?
 		 * (Treating it as such doesn't fix the timeout
@@ -870,6 +1074,9 @@
 			goto bad;
 		}
 	}
+#ifdef BIOCSETBUFMODE
+	p->timeout = to_ms;
+#endif
 
 #ifdef _AIX
 #ifdef BIOCIMMEDIATE
@@ -942,16 +1149,22 @@
 		goto bad;
 	}
 	p->bufsize = v;
-	p->buffer = (u_char *)malloc(p->bufsize);
-	if (p->buffer == NULL) {
-		snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s",
-		    pcap_strerror(errno));
-		goto bad;
-	}
+#ifdef BIOCSETBUFMODE
+	if (p->zbuf1 == NULL) {
+#endif
+	p->buffer = (u_char *)malloc(p->bufsize);
+	if (p->buffer == NULL) {
+		snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s",
+		    pcap_strerror(errno));
+		goto bad;
+	}
 #ifdef _AIX
-	/* For some strange reason this seems to prevent the EFAULT
-	 * problems we have experienced from AIX BPF. */
-	memset(p->buffer, 0x0, p->bufsize);
+	/* For some strange reason this seems to prevent the EFAULT
+	 * problems we have experienced from AIX BPF. */
+	memset(p->buffer, 0x0, p->bufsize);
+#endif
+#ifdef BIOCSETBUFMODE
+	}
 #endif
 
 	/*
@@ -1036,7 +1249,24 @@
 	return (p);
  bad:
+	(void)close(fd);
+#ifdef BIOCSETBUFMODE
+	if (p->zbuf1 != NULL)
+		munmap(p->zbuf1, p->zbufsize);
+	if (p->zbuf2 != NULL)
+		munmap(p->zbuf2, p->zbufsize);
+	/*
+	 * If we are using zerocopy, the packet buffer will be referencing
+	 * an address in one of the shared pages, if any, in which case we
+	 * do not free it.
+	 */
+	if (getenv("BPF_ZERO_COPY") == NULL && p->buffer != NULL)
+		free(p->buffer);
+#else
+	if (p->buffer != NULL)
+		free(p->buffer);
+#endif
 	if (p->dlt_list != NULL)
 		free(p->dlt_list);
 	free(p);
Index: contrib/libpcap/pcap-int.h
===================================================================
RCS file: /home/ncvs/src/contrib/libpcap/pcap-int.h,v
retrieving revision 1.12.2.1
diff -u -r1.12.2.1 pcap-int.h
--- contrib/libpcap/pcap-int.h	19 Oct 2007 03:03:56 -0000	1.12.2.1
+++ contrib/libpcap/pcap-int.h	5 Nov 2007 18:40:34 -0000
@@ -167,12 +167,35 @@
 	struct pcap_md md;
 
 	/*
-	 * Read buffer.
+	 * Read buffer -- for the file descriptor read buffer model.
 	 */
 	int bufsize;
 	u_char *buffer;
 	u_char *bp;
 	int cc;
+	int to_ms;
+
+	/*
+	 * XXXRW: Exactly how to handle ifdefs, etc, is not something I've
+	 * worked out yet.  Presumably we need to add a configure check for
+	 * zero-copy BPF.
+	 *
+	 * Zero-copy read buffer -- for zero-copy BPF.  'buffer' above will
+	 * alternate between these two actual mmap'd buffers as required.
+	 * As there is a header at the front of each mmap'd buffer, only
+	 * part of the buffer is exposed to libpcap as a whole via bufsize;
+	 * zbufsize is the true size.
+	 */
+	u_char *zbuf1, *zbuf2;
+	u_int zbufsize;
+	u_int timeout;
+
+	/*
+	 * If there's currently a buffer being actively processed, then it is
+	 * referenced here; 'buffer' is also pointed at it, but offset by the
+	 * size of the header.
+	 */
+	struct bpf_zbuf_header *bzh;
 
 	/*
 	 * Place holder for pcap_next().
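The handshake that these new pcap_t fields support is small enough to
summarize inline.  A minimal sketch of the user-side readiness test, assuming
only the bpf_zbuf_header layout introduced by this patch (the helper name is
hypothetical):

#include <net/bpf.h>

/*
 * Hypothetical helper: non-zero if the zero-copy buffer whose shared header
 * is 'bzh' has been completed by the kernel but not yet acknowledged by user
 * space.  The kernel bumps bzh_kernel_gen when it hands a buffer over; user
 * space bumps bzh_user_gen to hand it back, which is exactly what
 * pcap_ack_zbuf() above does.
 */
static int
zbuf_check(struct bpf_zbuf_header *bzh)
{

	return (bzh->bzh_kernel_gen > bzh->bzh_user_gen);
}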
Index: lib/libpcap/Makefile
===================================================================
RCS file: /home/ncvs/src/lib/libpcap/Makefile,v
retrieving revision 1.39.2.1
diff -u -r1.39.2.1 Makefile
--- lib/libpcap/Makefile	19 Oct 2007 03:04:02 -0000	1.39.2.1
+++ lib/libpcap/Makefile	5 Nov 2007 18:40:34 -0000
@@ -16,6 +16,7 @@
 
 YFLAGS+=-p pcapyy
 LFLAGS+=-Ppcapyy
+CFLAGS+=-I../../sys
 CFLAGS+=-DHAVE_CONFIG_H -Dyylval=pcapyylval -I${.CURDIR} -I.
 CFLAGS+=-D_U_="__attribute__((unused))"
 CFLAGS+=-DHAVE_SNPRINTF -DHAVE_VSNPRINTF
Index: share/man/man4/bpf.4
===================================================================
RCS file: /home/ncvs/src/share/man/man4/bpf.4,v
retrieving revision 1.48
diff -u -r1.48 bpf.4
--- share/man/man4/bpf.4	26 Feb 2007 22:24:14 -0000	1.48
+++ share/man/man4/bpf.4	5 Nov 2007 18:40:34 -0000
@@ -1,3 +1,30 @@
+.\" Copyright (c) 2007 Seccuris Inc.
+.\" All rights reserved.
+.\"
+.\" This software was developed by Robert N. M. Watson under contract to
+.\" Seccuris Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
 .\" Copyright (c) 1990 The Regents of the University of California.
 .\" All rights reserved.
 .\"
@@ -61,18 +88,53 @@
 all file descriptors listening on that interface apply their filter.
 Each descriptor that accepts the packet receives its own copy.
 .Pp
-Reads from these files return the next group of packets
-that have matched the filter.
-To improve performance, the buffer passed to read must be
-the same size as the buffers used internally by
-.Nm .
+.Nm
+devices operate in one of two buffering modes: buffered
+.Xr read 2 ,
+in which packet data is copied from the kernel explicitly using the
+.Xr read 2
+system call, and zero-copy buffer mode, in which the user process provides
+two memory regions that
+.Nm
+will write to directly as packets are accepted.
+The buffering mode may be set with the
+.Dv BIOCSETBUFMODE
+ioctl (see below), and defaults to buffered
+.Xr read 2
+mode
+.Dv ( BPF_BUFMODE_BUFFER ) .
+In both modes, buffers return the next group of packets that have matched
+the filter.
+Note that an individual packet larger than the buffer size is necessarily
+truncated.
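+.Pp
+For example, a process may request zero-copy buffering on an open
+.Nm
+descriptor
+.Va fd
+before registering buffers (illustrative fragment; error handling
+abbreviated):
+.Bd -literal
+	u_int bufmode = BPF_BUFMODE_ZBUF;
+
+	if (ioctl(fd, BIOCSETBUFMODE, &bufmode) < 0)
+		err(1, "BIOCSETBUFMODE");
+.Ed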
+.Pp
+In the case of buffered
+.Xr read 2 ,
+the user process declares a fixed buffer size that will be used both for
+sizing internal buffers and for all
+.Xr read 2
+operations on the file.
 This size is returned by the
 .Dv BIOCGBLEN
 ioctl (see below), and can be set with
 .Dv BIOCSBLEN .
-Note that an individual packet larger than this size is necessarily
-truncated.
+.Pp
+In zero-copy buffering, the user process registers two memory buffers with
+.Nm
+via the
+.Dv BIOCSETZBUF
+ioctl (see below).
+The user process may monitor for completion (filling) of a buffer, at which
+point the memory contents of the buffer will be stable until the buffer is
+returned for further kernel use using the
+.Dv BIOCACKZBUF
+ioctl.
+Buffers must be of a fixed and equal size, page-aligned, and an integer
+multiple of the page size.
+The maximum zero-copy buffer size is returned by the
+.Dv BIOCGETZMAX
+ioctl (see below).
 .Pp
 The packet filter will support any link level protocol that has fixed length
 headers.
@@ -127,7 +189,7 @@
 The (third) argument to
 .Xr ioctl 2
 should be a pointer to the type indicated.
-.Bl -tag -width BIOCGRTIMEOUT
+.Bl -tag -width BIOCGETBUFMODE
 .It Dv BIOCGBLEN
 .Pq Li u_int
 Returns the required buffer length for reads on
@@ -349,6 +411,87 @@
 This prevents the execution of
 ioctl commands which could change the underlying operating parameters of
 the device.
+.It Dv BIOCGETBUFMODE
+.It Dv BIOCSETBUFMODE
+.Pq Li u_int
+Get or set the current
+.Nm
+buffering mode; possible values are
+.Dv BPF_BUFMODE_BUFFER ,
+buffered
+.Xr read 2
+mode, and
+.Dv BPF_BUFMODE_ZBUF ,
+zero-copy buffer mode.
+.It Dv BIOCACKZBUF
+.Pq Li struct bpf_zbuf
+Return a completed zero-copy buffer to the kernel for reuse.
+The following structure is used as an argument to this and other zero-copy
+buffer ioctls:
+.Bd -literal
+struct bpf_zbuf {
+	void *bz_bufa;
+	void *bz_bufb;
+	size_t bz_buflen;
+};
+.Ed
+.Pp
+Only the
+.Vt bz_bufa
+field will be used with this ioctl.
+.It Dv BIOCGETZBUF
+.It Dv BIOCSETZBUF
+.Pq Li struct bpf_zbuf
+Get or set the current zero-copy buffer locations; buffer locations may be
+set only once zero-copy buffer mode has been selected, and prior to attaching
+the
+.Nm
+device to an interface.
+Buffers must be of identical size, page-aligned, and an integer multiple of
+pages in size.
+The three fields
+.Vt bz_bufa ,
+.Vt bz_bufb ,
+and
+.Vt bz_buflen
+must be filled out.
+.It Dv BIOCGETZMAX
+.Pq Li size_t
+Get the largest individual zero-copy buffer size allowed.
+As two buffers are used in zero-copy buffer mode, the limit (in practice) is
+twice the returned size.
+As zero-copy buffers consume kernel address space, conservative selection of
+buffer size is advised, especially when there are multiple
+.Nm
+descriptors in use on 32-bit systems.
+.It Dv BIOCGETZNEXT
+.It Dv BIOCROTZBUF
+.Pq Li struct bpf_zbuf
+Get the buffer pointer and length of the next zero-copy buffer ready for
+userspace use, or
+.Dv NULL
+if there is no pending buffer.
+.Pp
+.Dv BIOCGETZNEXT
+queries for the next completely filled buffer ready for immediate use,
+returning
+.Dv NULL
+if there are only empty or partially filled buffers available.
+.Pp
+.Dv BIOCROTZBUF
+queries for a filled buffer, but in the event there is only a partially
+filled buffer, will make that buffer available for userspace to use
+immediately.
+This allows consumers of zero-copy buffering to implement timeouts and
+retrieve partially filled buffers.
+.Dv BIOCROTZBUF
+will return
+.Dv NULL
+only if no data is present in either of the zero-copy buffers.
+.Pp
+Only the
+.Vt bz_bufa
+and
+.Vt bz_buflen
+fields will be used with this ioctl.
 .El
 .Sh BPF HEADER
 The following structure is prepended to each packet returned by
Index: sys/conf/files
===================================================================
RCS file: /home/ncvs/src/sys/conf/files,v
retrieving revision 1.1243
diff -u -r1.1243 files
--- sys/conf/files	23 Sep 2007 07:34:22 -0000	1.1243
+++ sys/conf/files	5 Nov 2007 18:40:34 -0000
@@ -1595,8 +1595,10 @@
 libkern/strtouq.c	standard
 libkern/strvalid.c	standard
 net/bpf.c		standard
+net/bpf_buffer.c	optional bpf
 net/bpf_jitter.c	optional bpf_jitter
 net/bpf_filter.c	optional bpf | netgraph_bpf
+net/bpf_zerocopy.c	optional bpf_zerocopy
 net/bridgestp.c		optional bridge | if_bridge
 net/bsd_comp.c		optional ppp_bsdcomp
 net/ieee8023ad_lacp.c	optional lagg
Index: sys/conf/options
===================================================================
RCS file: /home/ncvs/src/sys/conf/options,v
retrieving revision 1.608
diff -u -r1.608 options
--- sys/conf/options	23 Sep 2007 07:34:23 -0000	1.608
+++ sys/conf/options	5 Nov 2007 18:40:34 -0000
@@ -478,6 +478,7 @@
 
 # DRM options
 DRM_DEBUG	opt_drm.h
+BPF_ZEROCOPY	opt_bpf.h
 ZERO_COPY_SOCKETS	opt_zero.h
 TI_PRIVATE_JUMBOS	opt_ti.h
 TI_JUMBO_HDRSPLIT	opt_ti.h
Index: sys/net/bpf.c
===================================================================
RCS file: /home/ncvs/src/sys/net/bpf.c,v
retrieving revision 1.181.2.1
diff -u -r1.181.2.1 bpf.c
--- sys/net/bpf.c	20 Oct 2007 15:09:24 -0000	1.181.2.1
+++ sys/net/bpf.c	6 Nov 2007 20:33:15 -0000
@@ -65,9 +65,13 @@
 #include <net/if.h>
 #include <net/bpf.h>
+#include <net/bpf_buffer.h>
 #ifdef BPF_JITTER
 #include <net/bpf_jitter.h>
 #endif
+#ifdef BPF_ZEROCOPY
+#include <net/bpf_zerocopy.h>
+#endif
 #include <net/bpfdesc.h>
 #include <netinet/in.h>
@@ -79,7 +83,7 @@
 #include <security/mac/mac_framework.h>
 
-static MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
+MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 
@@ -97,19 +101,17 @@
 static struct mtx	bpf_mtx;		/* bpf global lock */
 static int		bpf_bpfd_cnt;
 
-static void	bpf_allocbufs(struct bpf_d *);
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void	bpf_detachd(struct bpf_d *);
 static void	bpf_freed(struct bpf_d *);
-static void	bpf_mcopy(const void *, void *, size_t);
 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
 		    struct sockaddr *, int *, struct bpf_insn *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
 static void	bpf_timed_out(void *);
 static __inline void
 		bpf_wakeup(struct bpf_d *);
-static void	catchpacket(struct bpf_d *, u_char *, u_int,
-		    u_int, void (*)(const void *, void *, size_t),
+static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
+		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
 		    struct timeval *);
 static void	reset_d(struct bpf_d *);
 static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
@@ -124,10 +126,13 @@
 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
 static int bpf_bufsize = 4096;
 SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
-    &bpf_bufsize, 0, "Default bpf buffer size");
+    &bpf_bufsize, 0, "");
 static int bpf_maxbufsize = BPF_MAXBUFSIZE;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
-    &bpf_maxbufsize, 0, "Maximum bpf buffer size");
+    &bpf_maxbufsize, 0, "");
+static int bpf_timestamp = 1;
+SYSCTL_INT(_net_bpf, OID_AUTO, timestamp, CTLFLAG_RW,
+    &bpf_timestamp, 0, "timestamp each frame");
 static int bpf_maxinsns = BPF_MAXINSNS;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
     &bpf_maxinsns, 0, "Maximum bpf program instructions");
@@ -157,6 +162,216 @@
 static struct filterops bpfread_filtops =
 	{ 1, NULL, filt_bpfdetach, filt_bpfread };
 
+/*
+ * Wrapper functions for various buffering methods.  If the set of buffer
+ * modes expands, we will probably want to introduce a switch data structure
+ * similar to protosw, etc.
+ */
+static void
+bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+    u_int len)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		d->bd_zcopy++;
+		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
+#endif
+
+	default:
+		panic("bpf_buf_append_bytes");
+	}
+}
+
+static void
+bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+    u_int len)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		d->bd_zcopy++;
+		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
+#endif
+
+	default:
+		panic("bpf_buf_append_mbuf");
+	}
+}
+
+/*
+ * If the buffer mechanism has a way to decide that a held buffer can be made
+ * free, then it is exposed via the bpf_buffree() interface.  (1) is returned
+ * if the buffer can be discarded, (0) is returned if it cannot.
+ */
+static int
+bpf_buffree(struct bpf_d *d)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	switch (d->bd_bufmode) {
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_buffree(d));
+#endif
+	}
+	return (0);
+}
+
+void
+bpf_bufheld(struct bpf_d *d)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	switch (d->bd_bufmode) {
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		bpf_zerocopy_bufheld(d);
+		break;
+#endif
+	}
+}
+
+static void
+bpf_free(struct bpf_d *d)
+{
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_free(d));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_free(d));
+#endif
+
+	default:
+		panic("bpf_buf_free");
+	}
+}
+
+static int
+bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
+{
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_uiomove(d, buf, len, uio));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_uiomove(d, buf, len, uio));
+#endif
+
+	default:
+		panic("bpf_buf_uiomove");
+	}
+}
+
+static int
+bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
+		return (EOPNOTSUPP);
+	return (bpf_buffer_ioctl_sblen(d, i));
+}
+
+static int
+bpf_ioctl_ackzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_ackzbuf(td, d, bz));
+#else
+	panic("bpf_ioctl_ackzbuf");
+#endif
+}
+
+static int
+bpf_ioctl_getzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_getzbuf(td, d, bz));
+#else
+	panic("bpf_ioctl_getzbuf");
+#endif
+}
+
+static int
+bpf_ioctl_getznext(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_getznext(td, d, bz));
+#else
+	panic("bpf_ioctl_getznext");
+#endif
+}
+
+static int
+bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
+#else
+	panic("bpf_ioctl_getzmax");
+#endif
+}
+
+static int
+bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
+#else
+	panic("bpf_ioctl_rotzbuf");
+#endif
+}
+
+static int
+bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
+#else
+	panic("bpf_ioctl_setzbuf");
+#endif
+}
+
+/*
+ * General BPF functions.
+ */
 static int
 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
     struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter)
@@ -411,7 +626,14 @@
 	    "bpf%d", dev2unit(dev));
 	MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
 	dev->si_drv1 = d;
-	d->bd_bufsize = bpf_bufsize;
+
+	/*
+	 * XXXRW: For historical reasons, perform a one-time initialization
+	 * call to the buffer routines, even though we're not yet committed
+	 * to a particular buffer method.
+	 */
+	bpf_buffer_init(d);
+	d->bd_bufmode = BPF_BUFMODE_DEFAULT;
 	d->bd_sig = SIGIO;
 	d->bd_direction = BPF_D_INOUT;
 	d->bd_pid = td->td_proc->p_pid;
@@ -458,18 +680,6 @@
 	return (0);
 }
 
-
-/*
- * Rotate the packet buffers in descriptor d.  Move the store buffer
- * into the hold slot, and the free buffer into the store slot.
- * Zero the length of the new store buffer.
- */
-#define ROTATE_BUFFERS(d) \
-	(d)->bd_hbuf = (d)->bd_sbuf; \
-	(d)->bd_hlen = (d)->bd_slen; \
-	(d)->bd_sbuf = (d)->bd_fbuf; \
-	(d)->bd_slen = 0; \
-	(d)->bd_fbuf = NULL;
 /*
  * bpfread - read next chunk of packets from buffers
  */
@@ -489,6 +699,15 @@
 	BPFD_LOCK(d);
 	d->bd_pid = curthread->td_proc->p_pid;
+	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
+		/*
+		 * XXXRW: For now, we don't implement a uiomove for the
+		 * scatter-gather buffers associated with BPF_BUFMODE_ZBUF,
+		 * so simply disallow read().
+		 */
+		BPFD_UNLOCK(d);
+		return (EOPNOTSUPP);
+	}
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	timed_out = (d->bd_state == BPF_TIMED_OUT);
@@ -561,8 +780,12 @@
 	 * Move data from hold buffer into user space.
 	 * We know the entire buffer is transferred since
 	 * we checked above that the read buffer is bpf_bufsize bytes.
+	 *
+	 * XXXRW: More synchronization needed here: what if a second thread
+	 * issues a read on the same fd at the same time?  Don't want this
+	 * getting invalidated.
 	 */
-	error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
+	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
 
 	BPFD_LOCK(d);
 	d->bd_fbuf = d->bd_hbuf;
@@ -573,7 +796,6 @@
 	return (error);
 }
 
-
 /*
  * If there are processes sleeping on this descriptor, wake them up.
 */
@@ -609,6 +831,23 @@
 }
 
 static int
+bpf_ready(struct bpf_d *d)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	// printf("bpf_ready: hlen: %d, immediate %d, state %d, slen %d\n",
+	//     d->bd_hlen, d->bd_immediate, d->bd_state, d->bd_slen);
+
+	if (!bpf_buffree(d) && d->bd_hlen != 0)
+		return (1);
+	if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
+	    d->bd_slen != 0)
+		return (1);
+	return (0);
+}
+
+static int
 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct bpf_d *d = dev->si_drv1;
@@ -618,25 +857,34 @@
 	int error, hlen;
 
 	d->bd_pid = curthread->td_proc->p_pid;
-	if (d->bd_bif == NULL)
+	d->bd_wcount++;
+	if (d->bd_bif == NULL) {
+		d->bd_wdcount++;
 		return (ENXIO);
+	}
 
 	ifp = d->bd_bif->bif_ifp;
-	if ((ifp->if_flags & IFF_UP) == 0)
+	if ((ifp->if_flags & IFF_UP) == 0) {
+		d->bd_wdcount++;
 		return (ENETDOWN);
+	}
 
-	if (uio->uio_resid == 0)
+	if (uio->uio_resid == 0) {
+		d->bd_wdcount++;
 		return (0);
+	}
 
 	bzero(&dst, sizeof(dst));
 	m = NULL;
 	hlen = 0;
 	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
 	    &m, &dst, &hlen, d->bd_wfilter);
-	if (error)
+	if (error) {
+		d->bd_wdcount++;
 		return (error);
-
+	}
+	d->bd_wfcount++;
 	if (d->bd_hdrcmplt)
 		dst.sa_family = pseudo_AF_HDRCMPLT;
 
@@ -663,6 +911,8 @@
 #endif
 
 	error = (*ifp->if_output)(ifp, m, &dst, NULL);
+	if (error)
+		d->bd_wdcount++;
 
 	if (mc != NULL) {
 		if (error == 0)
@@ -693,6 +943,10 @@
 	d->bd_rcount = 0;
 	d->bd_dcount = 0;
 	d->bd_fcount = 0;
+	d->bd_wcount = 0;
+	d->bd_wfcount = 0;
+	d->bd_wdcount = 0;
+	d->bd_zcopy = 0;
 }
 
 /*
@@ -717,6 +971,11 @@
  *  BIOCSDIRECTION	Set packet direction flag
  *  BIOCLOCK		Set "locked" flag
  *  BIOCFEEDBACK	Set packet feedback mode.
+ *  BIOCGETZBUF		Query current zero-copy buffer locations.
+ *  BIOCSETZBUF		Set current zero-copy buffer locations.
+ *  BIOCACKZBUF		Acknowledge reading zero-copy buffers.
+ *  BIOCGETZMAX		Get maximum zero-copy buffer size.
+ *  BIOCGETZNEXT	Get next ready zero-copy buffer location.
 */
 /* ARGSUSED */
 static int
@@ -725,7 +984,7 @@
 {
 	struct bpf_d *d = dev->si_drv1;
 	int error = 0;
-	
+
 	/*
 	 * Refresh PID associated with this descriptor.
 	 */
@@ -754,6 +1013,8 @@
 		case BIOCSRTIMEOUT:
 		case BIOCIMMEDIATE:
 		case TIOCGPGRP:
+		case BIOCACKZBUF:
+		case BIOCGETZBUF:
 			break;
 		default:
 			return (EPERM);
@@ -806,17 +1067,7 @@
 	 * Set buffer length.
 	 */
 	case BIOCSBLEN:
-		if (d->bd_bif != NULL)
-			error = EINVAL;
-		else {
-			u_int size = *(u_int *)addr;
-
-			if (size > bpf_maxbufsize)
-				*(u_int *)addr = size = bpf_maxbufsize;
-			else if (size < BPF_MINBUFSIZE)
-				*(u_int *)addr = size = BPF_MINBUFSIZE;
-			d->bd_bufsize = size;
-		}
+		error = bpf_ioctl_sblen(d, (u_int *)addr);
 		break;
 
 	/*
@@ -1051,6 +1302,62 @@
 	case BIOCGRSIG:
 		*(u_int *)addr = d->bd_sig;
 		break;
+
+	case BIOCGETBUFMODE:
+		*(u_int *)addr = d->bd_bufmode;
+		break;
+
+	case BIOCSETBUFMODE:
+		/*
+		 * Allow the buffering mode to be changed as long as we
+		 * haven't yet committed to a particular mode.  Our
+		 * definition of commitment, for now, is whether or not a
+		 * buffer has been allocated or an interface attached, since
+		 * that's the point where things get tricky.
+		 *
+		 * XXXRW: This will need some refinement.  Is checking both
+		 * for buffers and interface binding redundant?
+		 */
+		switch (*(u_int *)addr) {
+		case BPF_BUFMODE_BUFFER:
+			break;
+
+#ifdef BPF_ZEROCOPY
+		case BPF_BUFMODE_ZBUF:
+			break;
+#endif
+
+		default:
+			return (EINVAL);
+		}
+
+		BPFD_LOCK(d);
+		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
+		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
+			BPFD_UNLOCK(d);
+			return (EBUSY);
+		}
+		d->bd_bufmode = *(u_int *)addr;
+		BPFD_UNLOCK(d);
+		break;
+
+	case BIOCACKZBUF:
+		return (bpf_ioctl_ackzbuf(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCGETZBUF:
+		return (bpf_ioctl_getzbuf(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCGETZMAX:
+		return (bpf_ioctl_getzmax(td, d, (size_t *)addr));
+
+	case BIOCGETZNEXT:
+		return (bpf_ioctl_getznext(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCSETZBUF:
+		return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCROTZBUF:
+		return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr));
 	}
 	return (error);
 }
@@ -1151,13 +1458,33 @@
 		return (ENXIO);
 
 	bp = theywant->if_bpf;
+
 	/*
-	 * Allocate the packet buffers if we need to.
-	 * If we're already attached to requested interface,
-	 * just flush the buffer.
-	 */
-	if (d->bd_sbuf == NULL)
-		bpf_allocbufs(d);
+	 * Behavior here depends on the buffering model.  If we're using
+	 * kernel memory buffers, then we can allocate them here.  If we're
+	 * using zero-copy, then the user process must have registered
+	 * buffers by the time we get here.  If not, return an error.
+	 *
+	 * XXXRW: Could this be better abstracted?
+	 *
+	 * XXXRW: There are locking issues here with multi-threaded use: what
+	 * if two threads try to set the interface at once?
+	 */
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		if (d->bd_sbuf == NULL)
+			bpf_buffer_alloc(d);
+		KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL"));
+		break;
+
+	case BPF_BUFMODE_ZBUF:
+		if (d->bd_sbuf == NULL)
+			return (EINVAL);
+		break;
+
+	default:
+		panic("bpf_setif: bufmode %d", d->bd_bufmode);
+	}
 	if (bp != d->bd_bif) {
 		if (d->bd_bif)
 			/*
@@ -1295,43 +1622,23 @@
 		if (slen != 0) {
 			d->bd_fcount++;
 			if (!gottime) {
-				microtime(&tv);
+				if (bpf_timestamp == 0)
+					bzero(&tv, sizeof(tv));
+				else
+					microtime(&tv);
 				gottime = 1;
 			}
 #ifdef MAC
 			if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
-				catchpacket(d, pkt, pktlen, slen, bcopy, &tv);
+				catchpacket(d, pkt, pktlen, slen,
+				    bpf_append_bytes, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
 	BPFIF_UNLOCK(bp);
 }
 
-/*
- * Copy data from an mbuf chain into a buffer.  This code is derived
- * from m_copydata in sys/uipc_mbuf.c.
- */
-static void
-bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
-{
-	const struct mbuf *m;
-	u_int count;
-	u_char *dst;
-
-	m = src_arg;
-	dst = dst_arg;
-	while (len > 0) {
-		if (m == NULL)
-			panic("bpf_mcopy");
-		count = min(m->m_len, len);
-		bcopy(mtod(m, void *), dst, count);
-		m = m->m_next;
-		dst += count;
-		len -= count;
-	}
-}
-
 #define	BPF_CHECK_DIRECTION(d, m) \
 	if (((d)->bd_direction == BPF_D_IN && (m)->m_pkthdr.rcvif == NULL) || \
 	    ((d)->bd_direction == BPF_D_OUT && (m)->m_pkthdr.rcvif != NULL))
@@ -1374,14 +1681,17 @@
 		if (slen != 0) {
 			d->bd_fcount++;
 			if (!gottime) {
-				microtime(&tv);
+				if (bpf_timestamp == 0)
+					bzero(&tv, sizeof(tv));
+				else
+					microtime(&tv);
 				gottime = 1;
 			}
 #ifdef MAC
 			if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)m, pktlen, slen,
-				    bpf_mcopy, &tv);
+				    bpf_append_mbuf, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1429,14 +1739,17 @@
 		if (slen != 0) {
 			d->bd_fcount++;
 			if (!gottime) {
-				microtime(&tv);
+				if (bpf_timestamp == 0)
+					bzero(&tv, sizeof(tv));
+				else
+					microtime(&tv);
 				gottime = 1;
 			}
 #ifdef MAC
 			if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)&mb, pktlen, slen,
-				    bpf_mcopy, &tv);
+				    bpf_append_mbuf, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1449,19 +1762,34 @@
  * Move the packet data from interface memory (pkt) into the
  * store buffer.  "cpfn" is the routine called to do the actual data
- * transfer.  bcopy is passed in to copy contiguous chunks, while
- * bpf_mcopy is passed in to copy mbuf chains.  In the latter case,
+ * transfer.  bpf_append_bytes is passed in to copy contiguous chunks,
+ * while bpf_append_mbuf is passed in to copy mbuf chains.  In the latter
+ * case, pkt is really an mbuf.
- * pkt is really an mbuf.
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
-    void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
+    void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
+    struct timeval *tv)
 {
-	struct bpf_hdr *hp;
+	struct bpf_hdr hdr;
 	int totlen, curlen;
 	int hdrlen = d->bd_bif->bif_hdrlen;
 	int do_wakeup = 0;
 
 	BPFD_LOCK_ASSERT(d);
+
+	/*
+	 * Detect whether user space has released a buffer back to us, and if
+	 * so, move it from being a hold buffer to a free buffer.  This may
+	 * not be the best place to do it (for example, we might only want to
+	 * run this check if we need the space), but for now it's a reliable
+	 * spot to do it.
+	 */
+	if (bpf_buffree(d)) {
+		d->bd_fbuf = d->bd_hbuf;
+		d->bd_hbuf = NULL;
+		d->bd_hlen = 0;
+	}
+
 	/*
 	 * Figure out how many bytes to move.  If the packet is
 	 * greater or equal to the snapshot length, transfer that
@@ -1496,65 +1824,52 @@
 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
 		/*
-		 * Immediate mode is set, or the read timeout has
-		 * already expired during a select call.  A packet
-		 * arrived, so the reader should be woken up.
+		 * Immediate mode is set, or the read timeout has already
+		 * expired during a select call.  A packet arrived, so the
+		 * reader should be woken up.
 		 */
 		do_wakeup = 1;
 
 	/*
-	 * Append the bpf header.
+	 * Append the bpf header.  Note we append the actual header size, but
+	 * move forward the length of the header plus padding.
 	 */
-	hp = (struct bpf_hdr *)(d->bd_sbuf + curlen);
-	hp->bh_tstamp = *tv;
-	hp->bh_datalen = pktlen;
-	hp->bh_hdrlen = hdrlen;
+	bzero(&hdr, sizeof(hdr));
+	hdr.bh_tstamp = *tv;
+	hdr.bh_datalen = pktlen;
+	hdr.bh_hdrlen = hdrlen;
+	hdr.bh_caplen = totlen - hdrlen;
+	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
+
 	/*
 	 * Copy the packet data into the store buffer and update its length.
 	 */
-	(*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
+	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen);
 	d->bd_slen = curlen + totlen;
 
+	/*
+	 * XXXCSJP: we could probably save a syscall per wakeup if we check
+	 * the d->bd_immediate flag, hold buffer status and rotate the
+	 * buffers before the wakeup.
+	 */
 	if (do_wakeup)
 		bpf_wakeup(d);
 }
 
 /*
- * Initialize all nonzero fields of a descriptor.
- */
-static void
-bpf_allocbufs(struct bpf_d *d)
-{
-
-	KASSERT(d->bd_fbuf == NULL, ("bpf_allocbufs: bd_fbuf != NULL"));
-	KASSERT(d->bd_sbuf == NULL, ("bpf_allocbufs: bd_sbuf != NULL"));
-	KASSERT(d->bd_hbuf == NULL, ("bpf_allocbufs: bd_hbuf != NULL"));
-
-	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
-	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
-	d->bd_slen = 0;
-	d->bd_hlen = 0;
-}
-
-/*
  * Free buffers currently in use by a descriptor.
  * Called on close.
  */
 static void
 bpf_freed(struct bpf_d *d)
 {
+
 	/*
 	 * We don't need to lock out interrupts since this descriptor has
 	 * been detached from its interface and it yet hasn't been marked
 	 * free.
 	 */
-	if (d->bd_sbuf != NULL) {
-		free(d->bd_sbuf, M_BPF);
-		if (d->bd_hbuf != NULL)
-			free(d->bd_hbuf, M_BPF);
-		if (d->bd_fbuf != NULL)
-			free(d->bd_fbuf, M_BPF);
-	}
+	bpf_free(d);
 	if (d->bd_rfilter) {
 		free((caddr_t)d->bd_rfilter, M_BPF);
 #ifdef BPF_JITTER
@@ -1775,6 +2090,10 @@
 		strlcpy(d->bd_ifname,
 		    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
 	d->bd_locked = bd->bd_locked;
+	d->bd_wcount = bd->bd_wcount;
+	d->bd_wdcount = bd->bd_wdcount;
+	d->bd_wfcount = bd->bd_wfcount;
+	d->bd_zcopy = bd->bd_zcopy;
 }
 
 static int
Index: sys/net/bpf.h
===================================================================
RCS file: /home/ncvs/src/sys/net/bpf.h,v
retrieving revision 1.47.2.1
diff -u -r1.47.2.1 bpf.h
--- sys/net/bpf.h	21 Oct 2007 14:05:27 -0000	1.47.2.1
+++ sys/net/bpf.h	5 Nov 2007 18:40:34 -0000
@@ -92,6 +92,44 @@
 #define BPF_MAJOR_VERSION 1
 #define BPF_MINOR_VERSION 1
 
+/*
+ * Historically, BPF has supported a single buffering model, first using mbuf
+ * clusters in kernel, and later using malloc(9) buffers in kernel.  We now
+ * support multiple buffering modes, which may be queried and set using
+ * BIOCGETBUFMODE and BIOCSETBUFMODE.  So as to avoid handling the complexity
+ * of changing modes while sniffing packets, the mode becomes fixed once an
+ * interface has been attached to the BPF descriptor.
+ */
+#define	BPF_BUFMODE_BUFFER	1	/* Kernel buffers with read(). */
+#define	BPF_BUFMODE_ZBUF	2	/* Zero-copy buffers. */
+
+#define	BPF_BUFMODE_DEFAULT	BPF_BUFMODE_BUFFER	/* Default. */
+
+/*
+ * Struct used by BIOCACKZBUF, BIOCGETZNEXT, BIOCGETZBUF, BIOCSETZBUF:
+ * describes up to two zero-copy buffers as used by BPF.
+ *
+ * BIOCACKZBUF	Acknowledge read of stored zero-copy buffer (rotate).
+ * BIOCGETZBUF	Query current zero-copy buffer locations.
+ * BIOCGETZNEXT	Query next stored buffer, if available.
+ * BIOCSETZBUF	Set current zero-copy buffer locations (once only).
+ *
+ * Pointers may be set to NULL to indicate a buffer is not configured, should
+ * be freed, or is not being acknowledged.
+ */
+struct bpf_zbuf {
+	void	*bz_bufa;	/* Location of 'a' zero-copy buffer. */
+	void	*bz_bufb;	/* Location of 'b' zero-copy buffer. */
+	size_t	 bz_buflen;	/* Size of zero-copy buffers. */
+};
+
+/* Packet directions */
+enum bpf_direction {
+	BPF_D_IN,	/* See incoming packets */
+	BPF_D_INOUT,	/* See incoming and outgoing packets */
+	BPF_D_OUT	/* See outgoing packets */
+};
+
 #define	BIOCGBLEN	_IOR('B',102, u_int)
 #define	BIOCSBLEN	_IOWR('B',102, u_int)
 #define	BIOCSETF	_IOW('B',103, struct bpf_program)
@@ -115,18 +153,19 @@
 #define	BIOCGDLTLIST	_IOWR('B',121, struct bpf_dltlist)
 #define	BIOCLOCK	_IO('B', 122)
 #define	BIOCSETWF	_IOW('B',123, struct bpf_program)
-#define	BIOCFEEDBACK	_IOW('B',124, u_int)
-
 /* Obsolete */
-#define	BIOCGSEESENT	BIOCGDIRECTION
-#define	BIOCSSEESENT	BIOCSDIRECTION
+#define	BIOCGSEESENT	BIOCGDIRECTION
+#define	BIOCSSEESENT	BIOCSDIRECTION
 
-/* Packet directions */
-enum bpf_direction {
-	BPF_D_IN,	/* See incoming packets */
-	BPF_D_INOUT,	/* See incoming and outgoing packets */
-	BPF_D_OUT	/* See outgoing packets */
-};
+#define	BIOCGETBUFMODE	_IOR('B', 124, u_int)
+#define	BIOCSETBUFMODE	_IOW('B', 125, u_int)
+#define	BIOCACKZBUF	_IOW('B', 126, struct bpf_zbuf)
+#define	BIOCGETZBUF	_IOR('B', 127, struct bpf_zbuf)
+#define	BIOCGETZMAX	_IOR('B', 128, size_t)
+#define	BIOCGETZNEXT	_IOR('B', 129, struct bpf_zbuf)
+#define	BIOCROTZBUF	_IOR('B', 130, struct bpf_zbuf)
+#define	BIOCSETZBUF	_IOW('B', 131, struct bpf_zbuf)
+#define	BIOCFEEDBACK	_IOW('B', 132, u_int)
 
 /*
  * Structure prepended to each packet.
@@ -149,6 +188,21 @@
 #endif
 
 /*
+ * When using zero-copy BPF buffers, a shared memory header is present
+ * allowing the kernel BPF implementation and user process to synchronize
+ * without using system calls.  This structure defines that header.
+ *
+ * The layout of this structure is critical, and must not be changed; it
+ * must fit in a single page on all architectures.
+ */
+struct bpf_zbuf_header {
+	volatile u_int	bzh_kernel_gen;	/* Kernel generation number. */
+	volatile u_int	bzh_kernel_len;	/* Length of buffer. */
+	volatile u_int	bzh_user_gen;	/* User generation number. */
+	u_int _bzh_pad[5];
+};
+
+/*
  * Data-link level type codes.
  */
 #define DLT_NULL	0	/* BSD loopback encapsulation */
@@ -761,6 +815,29 @@
 };
 
 #ifdef _KERNEL
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_BPF);
+#endif
+
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_bpf);
+#endif
+
+/*
+ * Rotate the packet buffers in descriptor d.  Move the store buffer into
+ * the hold slot, and the free buffer into the store slot.  Zero the length
+ * of the new store buffer.  Descriptor lock should be held.
+ */
+#define	ROTATE_BUFFERS(d)	do {					\
+	(d)->bd_hbuf = (d)->bd_sbuf;					\
+	(d)->bd_hlen = (d)->bd_slen;					\
+	(d)->bd_sbuf = (d)->bd_fbuf;					\
+	(d)->bd_slen = 0;						\
+	(d)->bd_fbuf = NULL;						\
+	bpf_bufheld(d);							\
+} while (0)
+
 /*
  * Descriptor associated with each attached hardware interface.
  */
@@ -773,6 +850,7 @@
 	struct mtx	bif_mtx;	/* mutex for interface */
 };
 
+void	 bpf_bufheld(struct bpf_d *d);
 int	 bpf_validate(const struct bpf_insn *, int);
 void	 bpf_tap(struct bpf_if *, u_char *, u_int);
 void	 bpf_mtap(struct bpf_if *, struct mbuf *);
Index: sys/net/bpf_buffer.c
===================================================================
RCS file: sys/net/bpf_buffer.c
diff -N sys/net/bpf_buffer.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/net/bpf_buffer.c	5 Nov 2007 18:52:09 -0000
@@ -0,0 +1,226 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (c) 1990, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)bpf.c	8.4 (Berkeley) 1/9/95
+ *
+ * $FreeBSD: src/sys/net/bpf.c,v 1.174 2006/11/06 13:42:02 rwatson Exp $
+ */
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpf_buffer.h>
+#include <net/bpfdesc.h>
+
+/*
+ * Implement the historical kernel memory buffering model for BPF: two
+ * malloc(9) kernel buffers are hung off of the descriptor.  The size is
+ * fixed prior to
The size is fixed prior to + * attaching to an ifnet, ad cannot be changed after that. read(2) simply + * copies the data to user space using uiomove(9). + */ + +static int bpf_bufsize = 4096; +SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW, + &bpf_bufsize, 0, ""); +static int bpf_maxbufsize = BPF_MAXBUFSIZE; +SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW, + &bpf_maxbufsize, 0, ""); + +void +bpf_buffer_alloc(struct bpf_d *d) +{ + + KASSERT(d->bd_fbuf == NULL, ("bpf_buffer_alloc: bd_fbuf != NULL")); + KASSERT(d->bd_sbuf == NULL, ("bpf_buffer_alloc: bd_sbuf != NULL")); + KASSERT(d->bd_hbuf == NULL, ("bpf_buffer_alloc: bd_hbuf != NULL")); + + // printf("bpf_buffer_alloc size %d\n", d->bd_bufsize); + + d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); + d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); + d->bd_hbuf = NULL; + d->bd_slen = 0; + d->bd_hlen = 0; +} + +/* + * Simple data copy to the current kernel buffer. + */ +void +bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, + void *src, u_int len) +{ + u_char *src_bytes; + + // printf("bpf_buffer_append_bytes size %d\n", len); + + src_bytes = (u_char *)src; + bcopy(src_bytes, buf + offset, len); +} + +/* + * Scatter-gather data copy from an mbuf chain to the current kernel buffer. + */ +void +bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + const struct mbuf *m; + u_char *dst; + u_int count; + + // printf("bpf_buffer_append_mbuf size %d\n", len); + + m = (struct mbuf *)src; + dst = (u_char *)buf + offset; + while (len > 0) { + if (m == NULL) + panic("bpf_mcopy"); + count = min(m->m_len, len); + bcopy(mtod(m, void *), dst, count); + m = m->m_next; + dst += count; + len -= count; + } +} + +/* + * Free BPF kernel buffers on device close. + */ +void +bpf_buffer_free(struct bpf_d *d) +{ + + // printf("bpf_buffer_free(sbuf: %p, hbuf: %p, fbuf: %p)\n", + // d->bd_sbuf, d->bd_hbuf, d->bd_fbuf); + + if (d->bd_sbuf != NULL) + free(d->bd_sbuf, M_BPF); + if (d->bd_hbuf != NULL) + free(d->bd_hbuf, M_BPF); + if (d->bd_fbuf != NULL) + free(d->bd_fbuf, M_BPF); + +#ifdef INVARIANTS + d->bd_sbuf = d->bd_hbuf = d->bd_fbuf = (caddr_t)~0; +#endif +} + +/* + * This is a historical initialization that occurs when the BPF descriptor is + * first opened. It does not imply selection of a buffer mode, so we don't + * allocate buffers here. + */ +void +bpf_buffer_init(struct bpf_d *d) +{ + + // printf("bpf_buffer_init: bufsize %d\n", bpf_bufsize); + + d->bd_bufsize = bpf_bufsize; +} + +/* + * Allocate or resize buffers. + */ +int +bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i) +{ + u_int size; + + // printf("bpf_buffer_ioctl_sblen %d\n", *i); + + BPFD_LOCK(d); + if (d->bd_bif != NULL) { + BPFD_UNLOCK(d); + return (EINVAL); + } + size = *i; + if (size > bpf_maxbufsize) + *i = size = bpf_maxbufsize; + else if (size < BPF_MINBUFSIZE) + *i = size = BPF_MINBUFSIZE; + d->bd_bufsize = size; + BPFD_UNLOCK(d); + + // printf("bpf_buffer_ioctl_sblen (req: %d, set: %d)\n", *i, size); + return (0); +} + +/* + * Copy buffer storage to user space in read(). 
+ */
+int
+bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
+{
+
+	// printf("bpf_buffer_uiomove %d bytes\n", len);
+
+	return (uiomove(buf, len, uio));
+}
Index: sys/net/bpf_buffer.h
===================================================================
RCS file: sys/net/bpf_buffer.h
diff -N sys/net/bpf_buffer.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/net/bpf_buffer.h	5 Nov 2007 18:53:33 -0000
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_BPF_BUFFER_H_
+#define	_NET_BPF_BUFFER_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void	bpf_buffer_alloc(struct bpf_d *d);
+void	bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_buffer_free(struct bpf_d *d);
+void	bpf_buffer_init(struct bpf_d *d);
+int	bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i);
+int	bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len,
+	    struct uio *uio);
+
+#endif /* !_NET_BPF_BUFFER_H_ */
Index: sys/net/bpf_filter.c
===================================================================
RCS file: /home/ncvs/src/sys/net/bpf_filter.c,v
retrieving revision 1.28
diff -u -r1.28 bpf_filter.c
--- sys/net/bpf_filter.c	13 Sep 2007 09:00:32 -0000	1.28
+++ sys/net/bpf_filter.c	5 Nov 2007 18:40:34 -0000
@@ -83,14 +83,11 @@
 static u_int32_t	m_xword(struct mbuf *m, bpf_u_int32 k, int *err);
 
 static u_int32_t
-m_xword(m, k, err)
-	register struct mbuf *m;
-	register bpf_u_int32 k;
-	register int *err;
+m_xword(struct mbuf *m, bpf_u_int32 k, int *err)
 {
-	register size_t len;
-	register u_char *cp, *np;
-	register struct mbuf *m0;
+	size_t len;
+	u_char *cp, *np;
+	struct mbuf *m0;
 
 	len = m->m_len;
 	while (k >= len) {
@@ -111,21 +108,18 @@
 	*err = 0;
 	np = mtod(m0, u_char *);
 	switch (len - k) {
-
 	case 1:
 		return ((u_int32_t)cp[0] << 24) |
 		    ((u_int32_t)np[0] << 16) |
 		    ((u_int32_t)np[1] << 8)  |
 		    (u_int32_t)np[2];
-
 	case 2:
 		return ((u_int32_t)cp[0] << 24) |
 		    ((u_int32_t)cp[1] << 16) |
 		    ((u_int32_t)np[0] << 8)  |
 		    (u_int32_t)np[1];
-
 	default:
 		return ((u_int32_t)cp[0] << 24) |
@@ -135,18 +129,15 @@
 	}
     bad:
 	*err = 1;
-	return 0;
+	return (0);
 }
 
 static u_int16_t
-m_xhalf(m, k, err)
-	register struct mbuf *m;
-	register bpf_u_int32 k;
-	register int *err;
+m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err)
 {
-	register size_t len;
-	register u_char *cp;
-	register struct mbuf *m0;
+	size_t len;
+	u_char *cp;
+	struct mbuf *m0;
 
 	len = m->m_len;
 	while (k >= len) {
@@ -159,16 +150,16 @@
 	cp = mtod(m, u_char *) + k;
 	if (len - k >= 2) {
 		*err = 0;
-		return EXTRACT_SHORT(cp);
+		return (EXTRACT_SHORT(cp));
 	}
 	m0 = m->m_next;
 	if (m0 == 0)
 		goto bad;
 	*err = 0;
-	return (cp[0] << 8) | mtod(m0, u_char *)[0];
+	return ((cp[0] << 8) | mtod(m0, u_char *)[0]);
     bad:
 	*err = 1;
-	return 0;
+	return (0);
 }
 #endif
 
@@ -178,21 +169,17 @@
  * buflen is the amount of data present
 */
 u_int
-bpf_filter(pc, p, wirelen, buflen)
-	register const struct bpf_insn *pc;
-	register u_char *p;
-	u_int wirelen;
-	register u_int buflen;
+bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
 {
-	register u_int32_t A = 0, X = 0;
-	register bpf_u_int32 k;
+	u_int32_t A = 0, X = 0;
+	bpf_u_int32 k;
 	u_int32_t mem[BPF_MEMWORDS];
 
-	if (pc == 0)
+	if (pc == NULL)
 		/*
 		 * No filter means accept all.
 		 */
-		return (u_int)-1;
+		return ((u_int)-1);
 
 	--pc;
 	while (1) {
@@ -206,10 +193,10 @@
 			abort();
 #endif
 		case BPF_RET|BPF_K:
-			return (u_int)pc->k;
+			return ((u_int)pc->k);
 
 		case BPF_RET|BPF_A:
-			return (u_int)A;
+			return ((u_int)A);
 
 		case BPF_LD|BPF_W|BPF_ABS:
 			k = pc->k;
@@ -224,7 +211,7 @@
 					return 0;
 				continue;
 #else
-				return 0;
+				return (0);
 #endif
 			}
 #ifdef BPF_ALIGN
@@ -256,7 +243,7 @@
 			k = pc->k;
 			if (k >= buflen) {
 #ifdef _KERNEL
-				register struct mbuf *m;
+				struct mbuf *m;
 
 				if (buflen != 0)
 					return 0;
@@ -287,13 +274,13 @@
 				int merr;
 
 				if (buflen != 0)
-					return 0;
+					return (0);
 				A = m_xword((struct mbuf *)p, k, &merr);
 				if (merr != 0)
-					return 0;
+					return (0);
 				continue;
#else
-				return 0;
+				return (0);
 #endif
 			}
 #ifdef BPF_ALIGN
@@ -315,10 +302,10 @@
 					return 0;
 				A = m_xhalf((struct mbuf *)p, k, &merr);
 				if (merr != 0)
-					return 0;
+					return (0);
 				continue;
 #else
-				return 0;
+				return (0);
 #endif
 			}
 			A = EXTRACT_SHORT(&p[k]);
@@ -328,7 +315,7 @@
 			k = X + pc->k;
 			if (pc->k >= buflen || X >= buflen - pc->k) {
 #ifdef _KERNEL
-				register struct mbuf *m;
+				struct mbuf *m;
 
 				if (buflen != 0)
 					return 0;
@@ -337,7 +324,7 @@
 				A = mtod(m, u_char *)[k];
 				continue;
 #else
-				return 0;
+				return (0);
 #endif
 			}
 			A = p[k];
Index: sys/net/bpf_zerocopy.c
===================================================================
RCS file: sys/net/bpf_zerocopy.c
diff -N sys/net/bpf_zerocopy.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/net/bpf_zerocopy.c	5 Nov 2007 18:52:28 -0000
@@ -0,0 +1,635 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sf_buf.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpf_zerocopy.h>
+#include <net/bpfdesc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
+/*
+ * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
+ * are mapped into the kernel address space using sf_bufs and used directly
+ * by BPF.  Memory is wired since page faults cannot be tolerated in the
+ * contexts where the buffers are copied to (locks held, interrupt context,
+ * etc).
+ */
+
+/*
+ * Maximum number of pages per buffer.  Since all BPF devices use two, the
+
+/*
+ * struct zbuf describes a memory buffer loaned by a user process to the
+ * kernel.  We represent this as a series of pages managed using an array of
+ * sf_bufs.  Even though the memory is contiguous in user space, it may not
+ * be mapped contiguously in the kernel (i.e., a set of physically
+ * non-contiguous pages in the direct map region) so we must implement
+ * scatter-gather copying.
+ *
+ * At the front of the shared memory region is a bpf_zbuf_header, which
+ * contains shared control data to allow user space and the kernel to
+ * synchronize; this is included in zb_size, but not bpf_bufsize, so that BPF
+ * knows that the space is not available.
+ */
+struct zbuf {
+	vm_offset_t	zb_uaddr;	/* User address, may be stale. */
+	size_t		zb_size;	/* Size of buffer, incl. header. */
+	u_int		zb_numpages;	/* Number of pages. */
+	struct sf_buf	**zb_pages;	/* Pages themselves. */
+	struct bpf_zbuf_header	*zb_header;	/* Shared header. */
+};
+
+/*
+ * Release a page we've previously wired.
+ */
+static void
+zbuf_page_free(vm_page_t pp)
+{
+
+	vm_page_lock_queues();
+	vm_page_unwire(pp, 0);
+	if (pp->wire_count == 0 && pp->object == NULL)
+		vm_page_free(pp);
+	vm_page_unlock_queues();
+}
+
+/*
+ * Free an sf_buf with attached page.
+ */
+static void
+zbuf_sfbuf_free(struct sf_buf *sf)
+{
+	vm_page_t pp;
+
+	pp = sf_buf_page(sf);
+	sf_buf_free(sf);
+	zbuf_page_free(pp);
+}
+
+/*
+ * Free a zbuf, including its page array, sf_bufs, and pages.  Allow
+ * partially allocated zbufs to be freed, so that this may be used even
+ * during zbuf setup.
+ */
+static void
+zbuf_free(struct zbuf *zb)
+{
+	int i;
+
+	for (i = 0; i < zb->zb_numpages; i++) {
+		if (zb->zb_pages[i] != NULL)
+			zbuf_sfbuf_free(zb->zb_pages[i]);
+	}
+	free(zb->zb_pages, M_BPF);
+	free(zb, M_BPF);
+}
+
+/*
+ * Given a user pointer to a page of user memory, return an sf_buf for the
+ * page.  Because we may be requesting quite a few sf_bufs, prefer failure
+ * to deadlock and use SFB_NOWAIT.
+ */
+static struct sf_buf *
+zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
+{
+	struct sf_buf *sf;
+	vm_page_t pp;
+
+	if (vm_fault_quick((caddr_t)uaddr, VM_PROT_READ | VM_PROT_WRITE)
+	    < 0)
+		return (NULL);
+	pp = pmap_extract_and_hold(map->pmap, uaddr,
+	    VM_PROT_READ | VM_PROT_WRITE);
+	if (pp == NULL)
+		return (NULL);
+	vm_page_lock_queues();
+	vm_page_wire(pp);
+	vm_page_unhold(pp);
+	vm_page_unlock_queues();
+	sf = sf_buf_alloc(pp, SFB_NOWAIT);
+	if (sf == NULL) {
+		zbuf_page_free(pp);
+		return (NULL);
+	}
+	return (sf);
+}
+
+/*
+ * Create a zbuf describing a range of user address space memory.  Validate
+ * page alignment, size requirements, etc.
+ */
+static int
+zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len,
+    struct zbuf **zbp)
+{
+	struct zbuf *zb;
+	struct vm_map *map;
+	int error, i;
+
+	*zbp = NULL;
+
+	/* User address must be page-aligned. */
+	if (uaddr & PAGE_MASK)
+		return (EINVAL);
+
+	/* Length must be an integer number of full pages. */
+	if (len & PAGE_MASK)
+		return (EINVAL);
+
+	/* Length must not exceed per-buffer resource limit. */
+	if ((len / PAGE_SIZE) > BPF_MAX_PAGES)
+		return (EINVAL);
+
+	error = 0;
+	zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK);
+	zb->zb_uaddr = uaddr;
+	zb->zb_size = len;
+	zb->zb_numpages = len / PAGE_SIZE;
+	zb->zb_pages = malloc(sizeof(struct sf_buf *) *
+	    zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK);
+	map = &td->td_proc->p_vmspace->vm_map;
+	for (i = 0; i < zb->zb_numpages; i++) {
+		zb->zb_pages[i] = zbuf_sfbuf_get(map,
+		    uaddr + (i * PAGE_SIZE));
+		if (zb->zb_pages[i] == NULL) {
+			error = EFAULT;
+			goto error;
+		}
+	}
+	zb->zb_header =
+	    (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]);
+	bzero(zb->zb_header, sizeof(*zb->zb_header));
+	*zbp = zb;
+	return (0);
+
+error:
+	zbuf_free(zb);
+	return (error);
+}
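
zbuf_setup() therefore accepts only page-aligned regions that are a whole
number of pages long.  A user process can satisfy those checks with an
anonymous mmap(), which always returns page-aligned memory; this is a
sketch, with the page rounding made explicit:

	#include <sys/types.h>
	#include <sys/mman.h>
	#include <unistd.h>

	/* Sketch: allocate one buffer acceptable to BIOCSETZBUF. */
	static void *
	alloc_zbuf(size_t len)
	{
		size_t pagesz = (size_t)getpagesize();

		/* Round the request up to a whole number of pages. */
		len = (len + pagesz - 1) & ~(pagesz - 1);
		return (mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_ANON, -1, 0));
	}
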
+
+/*
+ * Copy bytes from a source into the specified zbuf.  The caller is
+ * responsible for performing bounds checking, etc.
+ */
+void
+bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_int count, page, poffset;
+	u_char *src_bytes;
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_append_bytes: not in zbuf mode"));
+	KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf"));
+
+	src_bytes = (u_char *)src;
+	zb = (struct zbuf *)buf;
+
+	/*
+	 * Scatter-gather copy to user pages mapped into kernel address space
+	 * using sf_bufs: copy up to a page at a time.
+	 */
+	offset += sizeof(struct bpf_zbuf_header);
+	page = offset / PAGE_SIZE;
+	poffset = offset % PAGE_SIZE;
+	while (len > 0) {
+		KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:"
+		    " page overflow (%d p %d np)\n", page, zb->zb_numpages));
+
+		count = min(len, PAGE_SIZE - poffset);
+		bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) +
+		    poffset, count);
+
+		poffset += count;
+		if (poffset == PAGE_SIZE) {
+			poffset = 0;
+			page++;
+		}
+		KASSERT(poffset < PAGE_SIZE,
+		    ("bpf_zerocopy_append_bytes: page offset overflow (%d)",
+		    poffset));
+		len -= count;
+		src_bytes += count;
+	}
+}
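
The loop above turns a linear offset into a (page, in-page offset) pair and
clamps each copy at a page boundary.  As a worked example, with PAGE_SIZE ==
4096, a post-header offset of 5000 resolves to page 1 at offset 904, so the
first copy may cover at most 4096 - 904 == 3192 bytes before the loop
advances to page 2:

	page    = offset / PAGE_SIZE;		/* 5000 / 4096 == 1 */
	poffset = offset % PAGE_SIZE;		/* 5000 % 4096 == 904 */
	count   = min(len, PAGE_SIZE - poffset);	/* <= 3192 */
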
+
+/*
+ * Copy bytes from an mbuf chain to the specified zbuf: copying will be
+ * scatter-gather both from mbufs, which may be fragmented over memory, and
+ * to pages, which may not be contiguously mapped in kernel address space.
+ * As with bpf_zerocopy_append_bytes(), the caller is responsible for
+ * checking that this will not exceed the buffer limit.
+ */
+void
+bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_int count, moffset, page, poffset;
+	const struct mbuf *m;
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_append_mbuf not in zbuf mode"));
+	KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf"));
+
+	m = (struct mbuf *)src;
+	zb = (struct zbuf *)buf;
+
+	/*
+	 * Scatter-gather both from an mbuf chain and to a user page set
+	 * mapped into kernel address space using sf_bufs.  If we're lucky,
+	 * each mbuf requires one copy operation, but if page alignment and
+	 * mbuf alignment work out less well, we'll be doing two copies per
+	 * mbuf.
+	 */
+	offset += sizeof(struct bpf_zbuf_header);
+	page = offset / PAGE_SIZE;
+	poffset = offset % PAGE_SIZE;
+	moffset = 0;
+	while (len > 0) {
+		KASSERT(page < zb->zb_numpages,
+		    ("bpf_zerocopy_append_mbuf: page overflow (%d p %d "
+		    "np)\n", page, zb->zb_numpages));
+		KASSERT(m != NULL,
+		    ("bpf_zerocopy_append_mbuf: end of mbuf chain"));
+
+		count = min(m->m_len - moffset, len);
+		count = min(count, PAGE_SIZE - poffset);
+		bcopy(mtod(m, u_char *) + moffset,
+		    ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset,
+		    count);
+
+		poffset += count;
+		if (poffset == PAGE_SIZE) {
+			poffset = 0;
+			page++;
+		}
+		KASSERT(poffset < PAGE_SIZE,
+		    ("bpf_zerocopy_append_mbuf: page offset overflow (%d)",
+		    poffset));
+		moffset += count;
+		if (moffset == m->m_len) {
+			m = m->m_next;
+			moffset = 0;
+		}
+		len -= count;
+	}
+}
+
+/*
+ * Notification from the BPF framework that a buffer has moved into the held
+ * slot on a descriptor.  Zero-copy BPF will update the shared page to let
+ * the user process know.
+ *
+ * XXXRW: Do we need to use a memory barrier, atomic operation, or the like
+ * to make sure that the generation update is the last write to make it out
+ * after any packet data, so that user space sees the generation increase
+ * only at or after the last packet data change?
+ */
+void
+bpf_zerocopy_bufheld(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_bufheld: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_hbuf;
+	KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));
+	zb->zb_header->bzh_kernel_len = d->bd_hlen;
+	zb->zb_header->bzh_kernel_gen++;
+}
+
+/*
+ * Query from the BPF framework regarding whether the buffer currently in
+ * the held position can be moved to the free position, which the user
+ * process indicates by making its generation number equal to the kernel
+ * generation number.
+ *
+ * XXXRW: Memory ordering also an issue here?
+ */
+int
+bpf_zerocopy_buffree(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_buffree: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_hbuf;
+	if (zb == NULL)
+		return (0);
+	if (zb->zb_header->bzh_kernel_gen == zb->zb_header->bzh_user_gen)
+		return (1);
+	return (0);
+}
+
+/*
+ * Free zero-copy buffers at request of descriptor.
+ */
+void
+bpf_zerocopy_free(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_free: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_sbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+	zb = (struct zbuf *)d->bd_hbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+	zb = (struct zbuf *)d->bd_fbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+}
+
+/*
+ * For now, allow bpfread() to rotate the buffers, but don't perform a copy
+ * operation or return a value.  If we want to copy, we'll need to implement
+ * scatter-gather copying with a series of uiomove calls here.
+ */
+int
+bpf_zerocopy_uiomove(struct bpf_d *d, caddr_t buf, u_int len,
+    struct uio *uio)
+{
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_uiomove: not in zbuf mode"));
+
+	return (EOPNOTSUPP);
+}
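
The two XXXRW questions above are two halves of one ordering problem: the
kernel must make packet data visible before the bzh_kernel_gen update, and
the consumer must not read packet data until after it has observed the new
generation.  On the user side, a conservative check might look like this
sketch, where rmb() stands in for whatever read barrier the platform
provides (a hypothetical placeholder, not a portable interface):

	static int
	zbuf_complete(struct bpf_zbuf_header *bzh)
	{
		if (bzh->bzh_kernel_gen == bzh->bzh_user_gen)
			return (0);	/* Still owned by the kernel. */
		rmb();			/* Order data reads after this. */
		return (1);
	}
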
+
+/*
+ * Acknowledge reading the buffer without performing read().  We accept an
+ * argument primarily so that we can validate that user space has the right
+ * idea, helping to catch application bugs faster if the application's sense
+ * of buffer rotation differs from the kernel's (or for that matter, kernel
+ * bugs).
+ */
+int
+bpf_zerocopy_ioctl_ackzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_ackzbuf: not in zbuf mode"));
+
+	BPFD_LOCK(d);
+	if (d->bd_hbuf == NULL) {
+		BPFD_UNLOCK(d);
+		return (EINVAL);
+	}
+	zb = (struct zbuf *)d->bd_hbuf;
+	if (bz->bz_bufa != (void *)zb->zb_uaddr) {
+		BPFD_UNLOCK(d);
+		return (EINVAL);
+	}
+	zb->zb_header->bzh_user_gen = zb->zb_header->bzh_kernel_gen;
+	d->bd_fbuf = d->bd_hbuf;
+	d->bd_hbuf = NULL;
+	d->bd_hlen = 0;
+	BPFD_UNLOCK(d);
+	return (0);
+}
+
+/*
+ * Ioctl to retrieve zbuf settings.  Note that the user address pointers are
+ * copied versions of those originally submitted via the setzbuf ioctl -- if
+ * user space has remapped the buffers, then they may be inconsistent.  User
+ * applications must be aware that these are in effect buffer names, not
+ * pointers, if they play such games with their address space.  Pointers are
+ * returned in arbitrary order, which may vary by ioctl.
+ */
+int
+bpf_zerocopy_ioctl_getzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_getzbuf: not in zbuf mode"));
+
+	bzero(bz, sizeof(*bz));
+	BPFD_LOCK(d);
+	if (d->bd_hbuf != NULL) {
+		zb = (struct zbuf *)d->bd_hbuf;
+		bz->bz_bufa = (void *)zb->zb_uaddr;
+		bz->bz_buflen = zb->zb_size;
+		zb = (struct zbuf *)d->bd_sbuf;
+		bz->bz_bufb = (void *)zb->zb_uaddr;
+	} else if (d->bd_sbuf != NULL) {
+		zb = (struct zbuf *)d->bd_sbuf;
+		bz->bz_bufa = (void *)zb->zb_uaddr;
+		bz->bz_buflen = zb->zb_size;
+		zb = (struct zbuf *)d->bd_fbuf;
+		bz->bz_bufb = (void *)zb->zb_uaddr;
+	} else {
+		bz->bz_bufa = bz->bz_bufb = NULL;
+		bz->bz_buflen = 0;
+	}
+	BPFD_UNLOCK(d);
+	return (0);
+}
+
+/*
+ * Ioctl to return the maximum buffer size.
+ */
+int
+bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
+{
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode"));
+
+	*i = BPF_MAX_PAGES * PAGE_SIZE;
+	return (0);
+}
+
+/*
+ * Ioctl to return the next completed buffer to read, if any.  In immediate
+ * mode, this may force a buffer rotation if there is stored data but no
+ * held data, in similar style to calling bpfread() on an immediate mode
+ * descriptor.
+ */
+int
+bpf_zerocopy_ioctl_getznext(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_getznext: not in zbuf mode"));
+
+	/*
+	 * If we are in immediate mode, have no held buffer, but do have
+	 * stored packet data, rotate so that the stored buffer becomes the
+	 * held buffer.
+	 */
+	BPFD_LOCK(d);
+	if (d->bd_immediate && d->bd_hbuf == NULL &&
+	    d->bd_slen != 0) {
+		ROTATE_BUFFERS(d);
+	}
+	bzero(bz, sizeof(*bz));
+	if (d->bd_hbuf != NULL) {
+		zb = (struct zbuf *)d->bd_hbuf;
+		bz->bz_bufa = (void *)zb->zb_uaddr;
+		bz->bz_buflen = d->bd_hlen;
+	}
+	BPFD_UNLOCK(d);
+	return (0);
+}
+
+/*
+ * Ioctl to force a rotation of the two buffers so that any packet data in
+ * the store buffer becomes readable, if no buffer is currently held.
+ */
+int
+bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *bzh;
+
+	bzero(bz, sizeof(*bz));
+	BPFD_LOCK(d);
+	if (d->bd_hbuf == NULL && d->bd_slen != 0) {
+		ROTATE_BUFFERS(d);
+		bzh = (struct zbuf *)d->bd_hbuf;
+		bz->bz_bufa = (void *)bzh->zb_uaddr;
+		bz->bz_buflen = d->bd_hlen;
+	}
+	BPFD_UNLOCK(d);
+	return (0);
+}
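
BIOCROTZBUF gives a consumer a way to drain a partially filled store buffer,
for example after a read timeout.  A sketch of the corresponding user-side
call, relying only on the semantics implemented above (bz_bufa is left NULL
when there was nothing to rotate):

	#include <sys/ioctl.h>
	#include <net/bpf.h>

	static int
	force_rotate(int fd, struct bpf_zbuf *bz)
	{
		if (ioctl(fd, BIOCROTZBUF, bz) < 0)
			return (-1);		/* Error. */
		return (bz->bz_bufa != NULL);	/* 1 if a buffer completed. */
	}
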
+
+/*
+ * Ioctl to configure zero-copy buffers -- may be done only once.
+ */
+int
+bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *zba, *zbb;
+	int error;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode"));
+
+	/*
+	 * Must set both buffers.  Cannot clear them.
+	 */
+	if (bz->bz_bufa == NULL || bz->bz_bufb == NULL)
+		return (EINVAL);
+
+	/*
+	 * Buffers must have a size greater than 0.  Alignment and other size
+	 * validity checking is done in zbuf_setup().
+	 */
+	if (bz->bz_buflen == 0)
+		return (EINVAL);
+
+	/*
+	 * As a simplifying assumption, we allow buffers to be designated
+	 * only once per descriptor.  Checked up front to save some trouble,
+	 * as we can more easily return EINVAL here; if the system is low on
+	 * sf_bufs, then it will be ENOMEM later.
+	 *
+	 * Note: lockless read.
+	 */
+	if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
+	    d->bd_bif != NULL)
+		return (EINVAL);
+
+	/*
+	 * Allocate the two new buffers.
+	 */
+	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen,
+	    &zba);
+	if (error)
+		return (error);
+	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen,
+	    &zbb);
+	if (error) {
+		zbuf_free(zba);
+		return (error);
+	}
+
+	/*
+	 * Perform atomic check-and-exchange.
+	 */
+	BPFD_LOCK(d);
+	if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
+	    d->bd_bif != NULL) {
+		BPFD_UNLOCK(d);
+		zbuf_free(zba);
+		zbuf_free(zbb);
+		return (EINVAL);
+	}
+	d->bd_fbuf = (caddr_t)zbb;
+	d->bd_sbuf = (caddr_t)zba;
+	d->bd_slen = 0;
+	d->bd_hlen = 0;
+
+	/*
+	 * We expose only the space left in the buffer after the size of the
+	 * shared management region.
+	 */
+	d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header);
+	BPFD_UNLOCK(d);
+	return (0);
+}
Index: sys/net/bpf_zerocopy.h
===================================================================
RCS file: sys/net/bpf_zerocopy.h
diff -N sys/net/bpf_zerocopy.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/net/bpf_zerocopy.h	5 Nov 2007 18:53:26 -0000
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_BPF_ZEROCOPY_H_
+#define _NET_BPF_ZEROCOPY_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void	bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_zerocopy_bufheld(struct bpf_d *);
+int	bpf_zerocopy_buffree(struct bpf_d *);
+void	bpf_zerocopy_free(struct bpf_d *d);
+int	bpf_zerocopy_ioctl_ackzbuf(struct thread *td, struct bpf_d *d,
+	    struct bpf_zbuf *bz);
+int	bpf_zerocopy_ioctl_getzbuf(struct thread *td, struct bpf_d *d,
+	    struct bpf_zbuf *bz);
+int	bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d,
+	    size_t *i);
+int	bpf_zerocopy_ioctl_getznext(struct thread *td, struct bpf_d *d,
+	    struct bpf_zbuf *bz);
+int	bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
+	    struct bpf_zbuf *bz);
+int	bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
+	    struct bpf_zbuf *bz);
+int	bpf_zerocopy_uiomove(struct bpf_d *d, caddr_t buf, u_int len,
+	    struct uio *uio);
+
+#endif /* !_NET_BPF_ZEROCOPY_H_ */
Index: sys/net/bpfdesc.h
===================================================================
RCS file: /home/ncvs/src/sys/net/bpfdesc.h,v
retrieving revision 1.38
diff -u -r1.38 bpfdesc.h
--- sys/net/bpfdesc.h	6 Aug 2007 14:26:00 -0000	1.38
+++ sys/net/bpfdesc.h	5 Nov 2007 18:40:34 -0000
@@ -48,10 +48,11 @@
 /*
  * Descriptor associated with each open bpf file.
  */
+struct zbuf;
 struct bpf_d {
 	LIST_ENTRY(bpf_d) bd_next;	/* Linked list of descriptors */
 	/*
-	 * Buffer slots: two malloc buffers store the incoming packets.
+	 * Buffer slots: two memory clusters buffer the incoming packets.
 	 * The model has three slots.  Sbuf is always occupied.
 	 * sbuf (store) - Receive interrupt puts packets here.
 	 * hbuf (hold) - When sbuf is full, put buffer here and
@@ -93,6 +94,11 @@
 	u_long		bd_fcount;	/* number of packets which matched filter */
 	pid_t		bd_pid;		/* PID which created descriptor */
 	int		bd_locked;	/* true if descriptor is locked */
+	u_int		bd_bufmode;	/* Current buffer mode. */
+	u_long		bd_wcount;	/* number of packets written */
+	u_long		bd_wfcount;	/* number of packets that matched write filter */
+	u_long		bd_wdcount;	/* number of packets dropped during a write */
+	u_long		bd_zcopy;	/* number of zero copy operations */
 };
 
 /* Values for bd_state */
@@ -104,12 +110,6 @@
 #define BPFD_UNLOCK(bd)		mtx_unlock(&(bd)->bd_mtx)
 #define BPFD_LOCK_ASSERT(bd)	mtx_assert(&(bd)->bd_mtx, MA_OWNED);
 
-/* Test whether a BPF is ready for read(). */
-#define	bpf_ready(bd)						\
-	((bd)->bd_hlen != 0 ||					\
-	 (((bd)->bd_immediate || (bd)->bd_state == BPF_TIMED_OUT) && \
-	  (bd)->bd_slen != 0))
-
 /*
  * External representation of the bpf descriptor
  */
@@ -130,6 +130,10 @@
 	pid_t		bd_pid;
 	char		bd_ifname[IFNAMSIZ];
 	int		bd_locked;
+	u_long		bd_wcount;
+	u_long		bd_wfcount;
+	u_long		bd_wdcount;
+	u_long		bd_zcopy;
 };
 
 #define BPFIF_LOCK(bif)		mtx_lock(&(bif)->bif_mtx)
Index: usr.bin/netstat/Makefile
===================================================================
RCS file: /home/ncvs/src/usr.bin/netstat/Makefile,v
retrieving revision 1.39
diff -u -r1.39 Makefile
--- usr.bin/netstat/Makefile	1 Jul 2007 12:08:07 -0000	1.39
+++ usr.bin/netstat/Makefile	5 Nov 2007 18:40:34 -0000
@@ -9,6 +9,7 @@
 
 WARNS?=	3
 
+CFLAGS+=-I../../sys
 CFLAGS+=-DIPSEC
 CFLAGS+=-DSCTP
 
Index: usr.bin/netstat/bpf.c
===================================================================
RCS file: /home/ncvs/src/usr.bin/netstat/bpf.c,v
retrieving revision 1.9
diff -u -r1.9 bpf.c
--- usr.bin/netstat/bpf.c	16 Jul 2007 17:15:54 -0000	1.9
+++ usr.bin/netstat/bpf.c	5 Nov 2007 18:40:34 -0000
@@ -86,31 +86,83 @@
 	*flagbuf++ = '\0';
 }
 
-void
-bpf_stats(char *ifname)
+static int
+bpf_get_stats(size_t *sizep, struct xbpf_d **bdp)
 {
-	struct xbpf_d *d, *bd;
-	char *pname, flagbuf[12];
-	size_t size;
+	struct xbpf_d *bd;
+	size_t s;
 
-	if (sysctlbyname("net.bpf.stats", NULL, &size,
+	if (sysctlbyname("net.bpf.stats", NULL, &s,
 	    NULL, 0) < 0) {
 		warn("net.bpf.stats");
-		return;
+		return (-1);
 	}
-	if (size == 0)
-		return;
-	bd = malloc(size);
+	if (s == 0)
+		return (-1);
+	bd = malloc(s);
 	if (bd == NULL) {
 		warn("malloc failed");
-		return;
+		return (-1);
 	}
-	if (sysctlbyname("net.bpf.stats", bd, &size,
+	if (sysctlbyname("net.bpf.stats", bd, &s,
 	    NULL, 0) < 0) {
 		warn("net.bpf.stats");
 		free(bd);
+		return (-1);
+	}
+	*bdp = bd;
+	*sizep = s;
+	return (0);
+}
+
+void
+bpf_stats_extended(char *ifname)
+{
+	struct xbpf_d *d, *bd;
+	size_t size;
+	char *pname;
+
+	if (bpf_get_stats(&size, &bd) < 0)
 		return;
+	for (d = &bd[0]; d < &bd[size / sizeof(*d)]; d++) {
+		if (ifname && strcmp(ifname, d->bd_ifname) != 0)
+			continue;
+		pname = bpf_pidname(d->bd_pid);
+		(void) printf("%s: pid %d on %s:\n", pname, d->bd_pid,
+		    d->bd_ifname);
+		(void) printf(
+		    "\t%lu packets received\n"
+		    "\t%lu packets matched receive filter\n"
+		    "\t%lu packets dropped\n"
+		    "\t%d current hold buffer size\n"
+		    "\t%d current store buffer size\n"
+		    "\t%lu packets written\n"
+		    "\t%lu packets matched write filter\n"
+		    "\t%lu packet writes failed\n"
+		    "\t%lu zero copy operations\n",
+		    d->bd_rcount,
+		    d->bd_fcount,
+		    d->bd_dcount,
+		    d->bd_hlen,
+		    d->bd_slen,
+		    d->bd_wcount,
+		    d->bd_wfcount,
+		    d->bd_wdcount,
+		    d->bd_zcopy);
+		free(pname);
 	}
+	free(bd);
+}
+
+void
+bpf_stats(char *ifname)
+{
+	char *pname, flagbuf[12];
+	struct xbpf_d *d, *bd;
+	size_t size;
+
+	if (bpf_get_stats(&size, &bd) < 0)
+		return;
 	printf("%5s %6s %7s %9s %9s %9s %5s %5s %s\n",
 	    "Pid", "Netif", "Flags", "Recv", "Drop", "Match", "Sblen",
 	    "Hblen", "Command");
Index: usr.bin/netstat/main.c
===================================================================
RCS file: /home/ncvs/src/usr.bin/netstat/main.c,v
retrieving revision 1.87
diff -u -r1.87 main.c
--- usr.bin/netstat/main.c	16 Jul 2007 18:13:12 -0000	1.87
+++ usr.bin/netstat/main.c	5 Nov 2007 18:40:34 -0000
@@ -495,7 +495,10 @@
 	if (Bflag) {
 		if (!live)
 			usage();
-		bpf_stats(interface);
+		if (sflag)
+			bpf_stats_extended(interface);
+		else
+			bpf_stats(interface);
 		exit(0);
 	}
 	if (mflag) {
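
With the main.c change above, "netstat -B" keeps its existing one-line
summary per descriptor, while "netstat -B -s" selects the per-descriptor
counters printed by bpf_stats_extended().  Following the format strings in
bpf.c, output would take this shape (process and counter values here are
purely illustrative):

	tcpdump: pid 712 on em0:
		1024 packets received
		512 packets matched receive filter
		0 packets dropped
		...
		2 zero copy operations
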
Index: usr.bin/netstat/netstat.h
===================================================================
RCS file: /home/ncvs/src/usr.bin/netstat/netstat.h,v
retrieving revision 1.51
diff -u -r1.51 netstat.h
--- usr.bin/netstat/netstat.h	16 Jul 2007 17:15:55 -0000	1.51
+++ usr.bin/netstat/netstat.h	5 Nov 2007 18:40:34 -0000
@@ -161,3 +161,4 @@
 void	mroutepr(u_long, u_long);
 void	mrt_stats(u_long);
 void	bpf_stats(char *);
+void	bpf_stats_extended(char *);
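
Taken together, the new ioctls allow a capture loop with no read() data
copies.  An end-to-end sketch of the user-space side, with error handling
elided; the device path, interface name, and buffer size are arbitrary, and
the processing step stands in for the consume/acknowledge logic sketched
earlier:

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <net/if.h>
	#include <net/bpf.h>
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct bpf_zbuf bz;
		struct ifreq ifr;
		u_int mode = BPF_BUFMODE_ZBUF;
		size_t buflen = 32768;	/* Must be a page multiple. */
		int fd;

		fd = open("/dev/bpf0", O_RDWR);
		ioctl(fd, BIOCSETBUFMODE, &mode);

		/* Donate two anonymous, page-aligned buffers. */
		bz.bz_bufa = mmap(NULL, buflen, PROT_READ | PROT_WRITE,
		    MAP_ANON, -1, 0);
		bz.bz_bufb = mmap(NULL, buflen, PROT_READ | PROT_WRITE,
		    MAP_ANON, -1, 0);
		bz.bz_buflen = buflen;
		ioctl(fd, BIOCSETZBUF, &bz);

		strncpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
		ioctl(fd, BIOCSETIF, &ifr);

		for (;;) {
			/*
			 * Wait for a completed buffer (select() plus
			 * BIOCROTZBUF on timeout), process it, then set
			 * bzh_user_gen = bzh_kernel_gen to return it.
			 */
		}
	}
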