Index: contrib/libpcap/pcap-bpf.c
===================================================================
RCS file: /home/ncvs/src/contrib/libpcap/pcap-bpf.c,v
retrieving revision 1.3.2.1
diff -u -r1.3.2.1 pcap-bpf.c
--- contrib/libpcap/pcap-bpf.c	19 Oct 2007 03:03:56 -0000	1.3.2.1
+++ contrib/libpcap/pcap-bpf.c	5 Nov 2007 18:40:34 -0000
@@ -30,6 +30,8 @@
 #endif
 #include <sys/param.h>			/* optionally get BSD define */
+#include <sys/mman.h>
+#include <sys/select.h>
 #include <sys/time.h>
 #include <sys/timeb.h>
 #include <sys/socket.h>
@@ -139,6 +141,118 @@
 	return (0);
 }
 
+#ifdef BIOCGETBUFMODE
+/*
+ * Selection routine for zero-copy BPF: identify the next completed buffer,
+ * if any.  Try shared memory first, and if that doesn't work, make a system
+ * call, which may dislodge a buffer.
+ *
+ * Return (1) if a buffer is found, (0) if a retry is required, and (-1) if
+ * there is an unrecoverable error.
+ *
+ * XXXRW: Check to make sure the version comparison we're doing here is
+ * really the right thing -- maybe use serial number arithmetic?
+ */
+static int
+pcap_next_zbuf(pcap_t *p, u_int *cc)
+{
+	struct bpf_zbuf_header *bzh;
+	struct bpf_zbuf bz;
+	struct timeval tv;
+	fd_set r_set;
+	int r;
+
+	FD_ZERO(&r_set);
+	FD_SET(p->fd, &r_set);
+	p->bzh = NULL;
+	p->buffer = NULL;
+	if (p->to_ms != 0) {
+		tv.tv_sec = p->to_ms / 1000;
+		tv.tv_usec = (p->to_ms * 1000) % 1000000;
+	}
+	r = select(p->fd + 1, &r_set, NULL, NULL,
+	    p->to_ms != 0 ? &tv : NULL);
+	if (r < 0 && errno == EINTR)
+		return (0);
+	else if (r < 0) {
+		(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+		    "select: %s", strerror(errno));
+		return (-1);
+	}
+	/*
+	 * Handle timeouts here.
+	 */
+	if (r == 0) {
+		if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+			(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+			    "BIOCROTZBUF: %s", strerror(errno));
+			return (-1);
+		}
+		/*
+		 * select(2) woke us up due to a timeout, and there was no
+		 * data to be processed in the store buffer.  Tell pcap to
+		 * wait again.
+		 */
+		if (bz.bz_bufa == NULL)
+			return (0);
+	}
+	/* XXXCSJP should we check FD_ISSET()? */
+	/*
+	 * If we have made it this far, chances are select(2) returned
+	 * because there is data ready to be processed in the hold buffer.
+	 * Compare the user generation numbers against the kernel's.  If
+	 * there are any differences, process the packet data.
+	 */
+	bzh = (struct bpf_zbuf_header *)p->zbuf1;
+	if (bzh->bzh_kernel_gen > bzh->bzh_user_gen) {
+		p->bzh = bzh;
+		p->buffer = (u_char *)p->zbuf1;
+		p->buffer += sizeof(*bzh);
+		*cc = bzh->bzh_kernel_len;
+		return (1);
+	}
+	bzh = (struct bpf_zbuf_header *)p->zbuf2;
+	if (bzh->bzh_kernel_gen > bzh->bzh_user_gen) {
+		p->bzh = bzh;
+		p->buffer = (u_char *)p->zbuf2;
+		p->buffer += sizeof(*bzh);
+		*cc = bzh->bzh_kernel_len;
+		return (1);
+	}
+	/*
+	 * If the generation numbers were the same for both buffers, then it
+	 * is possible that we woke up because of BIOCIMMEDIATE.  In that
+	 * case, manually rotate the buffers.
+	 */
+	if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+		(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+		    "BIOCROTZBUF: %s", strerror(errno));
+		return (-1);
+	}
+	/*
+	 * It's possible that we were unable to rotate the buffer because the
+	 * user generation numbers have not been modified, in which case
+	 * retry.
+	 */
+	if (bz.bz_bufa == NULL)
+		return (0);
+	p->bzh = (struct bpf_zbuf_header *)bz.bz_bufa;
+	p->buffer = (u_char *)bz.bz_bufa;
+	p->buffer += sizeof(*bzh);
+	*cc = bz.bz_buflen;
+	return (1);
+}
+
+static int
+pcap_ack_zbuf(pcap_t *p)
+{
+
+	p->bzh->bzh_user_gen++;
+	p->bzh = NULL;
+	p->buffer = NULL;
+	return (0);
+}
+#endif
+
 static int
 pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
 {
@@ -147,6 +261,9 @@
 	register u_char *bp, *ep;
 	u_char *datap;
 	struct bpf_insn *fcode;
+#ifdef BIOCSETBUFMODE
+	int i;
+#endif
 #ifdef PCAP_FDDIPAD
 	register int pad;
 #endif
@@ -167,7 +284,19 @@
 	}
 	cc = p->cc;
 	if (p->cc == 0) {
-		cc = read(p->fd, (char *)p->buffer, p->bufsize);
+#ifdef BIOCSETBUFMODE
+		if (p->zbuf1 != NULL) {
+			if (p->buffer != NULL)
+				pcap_ack_zbuf(p);
+			i = pcap_next_zbuf(p, &cc);
+			if (i == 0)
+				goto again;
+			if (i < 0)
+				return (-1);
+		} else
+#endif
+		cc = read(p->fd, (char *)p->buffer, p->bufsize);
+
 		if (cc < 0) {
 			/* Don't choke when we get ptraced */
 			switch (errno) {
@@ -609,6 +738,10 @@
 	struct bpf_insn total_insn;
 	struct bpf_program total_prog;
 	struct utsname osinfo;
+#ifdef BIOCSETBUFMODE
+	struct bpf_zbuf bz;
+	u_int bufmode, zbufmax;
+#endif
 
 #ifdef HAVE_DAG_API
 	if (strstr(device, "dag")) {
@@ -647,6 +780,73 @@
 	}
 
 	/*
+	 * XXXRW: Depending on the availability of zero-copy BPF, we take one
+	 * of two strategies here: if it is available and usable, we go ahead
+	 * and set it up; otherwise we play the song-and-dance to try to
+	 * probe an acceptable read buffer size.  Zero-copy BPF requires that
+	 * buffers be mapped into memory before selecting the interface to
+	 * attach to, so we do that here also.
+	 */
+#ifdef BIOCSETBUFMODE
+	if (getenv("BPF_ZERO_COPY")) {
+		bufmode = BPF_BUFMODE_ZBUF;
+		if (ioctl(fd, BIOCSETBUFMODE, (caddr_t)&bufmode) < 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETBUFMODE: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
+
+		if (ioctl(fd, BIOCGETZMAX, (caddr_t)&zbufmax) < 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCGETZMAX: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
+
+		/*
+		 * XXXRW: This logic should be revisited.
+		 */
+		p->zbufsize = 32768;
+		if (p->zbufsize % getpagesize() != 0)
+			p->zbufsize = getpagesize();
+		if (p->zbufsize > zbufmax)
+			p->zbufsize = zbufmax;
+
+		p->zbuf1 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE,
+		    MAP_ANON, -1, 0);
+		p->zbuf2 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE,
+		    MAP_ANON, -1, 0);
+		if (p->zbuf1 == MAP_FAILED || p->zbuf2 == MAP_FAILED) {
+			if (p->zbuf1 != MAP_FAILED)
+				munmap(p->zbuf1, p->zbufsize);
+			if (p->zbuf2 != MAP_FAILED)
+				munmap(p->zbuf2, p->zbufsize);
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "mmap: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
+
+		bzero(&bz, sizeof(bz));
+		bz.bz_bufa = p->zbuf1;
+		bz.bz_bufb = p->zbuf2;
+		bz.bz_buflen = p->zbufsize;
+
+		if (ioctl(fd, BIOCSETZBUF, (caddr_t)&bz) < 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETZBUF: %s",
+			    pcap_strerror(errno));
+			goto bad;
+		}
+
+		(void)strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name));
+		if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) < 0) {
+			snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETIF: %s: %s",
+			    device, pcap_strerror(errno));
+			goto bad;
+		}
+
+		v = p->zbufsize - sizeof(struct bpf_zbuf_header);
+	} else {
+#endif
+
 	/*
 	 * Try finding a good size for the buffer; 32768 may be too
 	 * big, so keep cutting it in half until we find a size
 	 * that works, or run out of sizes to try.  If the default
@@ -681,6 +881,9 @@
 			    "BIOCSBLEN: %s: No buffer size worked", device);
 			goto bad;
 		}
+#ifdef BIOCSETBUFMODE
+	}
+#endif
 
 	/* Get the data link layer type. */
 	if (ioctl(fd, BIOCGDLT, (caddr_t)&v) < 0) {
@@ -855,7 +1058,8 @@
 	}
 #endif
 	/* set timeout */
-	if (to_ms != 0) {
+	p->to_ms = to_ms;
+	if (to_ms != 0 && getenv("BPF_ZERO_COPY") == NULL) {
 		/*
 		 * XXX - is this seconds/nanoseconds in AIX?
 		 * (Treating it as such doesn't fix the timeout
@@ -870,6 +1074,9 @@
 			goto bad;
 		}
 	}
+#ifdef BIOCSETBUFMODE
+	p->timeout = to_ms;
+#endif
 
 #ifdef _AIX
 #ifdef BIOCIMMEDIATE
@@ -942,16 +1149,22 @@
 		goto bad;
 	}
 	p->bufsize = v;
-	p->buffer = (u_char *)malloc(p->bufsize);
-	if (p->buffer == NULL) {
-		snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s",
-		    pcap_strerror(errno));
-		goto bad;
-	}
+#ifdef BIOCSETBUFMODE
+	if (p->zbuf1 == NULL) {
+#endif
+	p->buffer = (u_char *)malloc(p->bufsize);
+	if (p->buffer == NULL) {
+		snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s",
+		    pcap_strerror(errno));
+		goto bad;
+	}
 #ifdef _AIX
-	/* For some strange reason this seems to prevent the EFAULT
-	 * problems we have experienced from AIX BPF. */
-	memset(p->buffer, 0x0, p->bufsize);
+	/* For some strange reason this seems to prevent the EFAULT
+	 * problems we have experienced from AIX BPF. */
+	memset(p->buffer, 0x0, p->bufsize);
+#endif
+#ifdef BIOCSETBUFMODE
+	}
 #endif
 
 	/*
@@ -1036,7 +1249,24 @@
 	return (p);
  bad:
+	(void)close(fd);
+#ifdef BIOCSETBUFMODE
+	if (p->zbuf1 != NULL)
+		munmap(p->zbuf1, p->zbufsize);
+	if (p->zbuf2 != NULL)
+		munmap(p->zbuf2, p->zbufsize);
+	/*
+	 * If we are using zerocopy, the packet buffer will be referencing
+	 * an address in one of the shared pages, if any, in which case we
+	 * do not free it.
+	 */
+	if (getenv("BPF_ZERO_COPY") == NULL && p->buffer != NULL)
+		free(p->buffer);
+#else
+	if (p->buffer != NULL)
+		free(p->buffer);
+#endif
 	if (p->dlt_list != NULL)
 		free(p->dlt_list);
 	free(p);
Index: contrib/libpcap/pcap-int.h
===================================================================
RCS file: /home/ncvs/src/contrib/libpcap/pcap-int.h,v
retrieving revision 1.12.2.1
diff -u -r1.12.2.1 pcap-int.h
--- contrib/libpcap/pcap-int.h	19 Oct 2007 03:03:56 -0000	1.12.2.1
+++ contrib/libpcap/pcap-int.h	5 Nov 2007 18:40:34 -0000
@@ -167,12 +167,35 @@
 	struct pcap_md md;
 
 	/*
-	 * Read buffer.
+	 * Read buffer -- for the file descriptor read buffer model.
 	 */
 	int bufsize;
 	u_char *buffer;
 	u_char *bp;
 	int cc;
+	int to_ms;
+
+	/*
+	 * XXXRW: Exactly how to handle ifdefs, etc, is not something I've
+	 * worked out yet.  Presumably we need to add a configure check for
+	 * zero-copy BPF.
+	 *
+	 * Zero-copy read buffer -- for zero-copy BPF.  'buffer' above will
+	 * alternate between these two actual mmap'd buffers as required.
+	 * As there is a header at the front of each mmap'd buffer, only
+	 * part of the buffer is exposed to libpcap as a whole via bufsize;
+	 * zbufsize is the true size.
+	 */
+	u_char *zbuf1, *zbuf2;
+	u_int zbufsize;
+	u_int timeout;
+
+	/*
+	 * If there's currently a buffer being actively processed, then it is
+	 * referenced here; 'buffer' is also pointed at it, but offset by the
+	 * size of the header.
+	 */
+	struct bpf_zbuf_header *bzh;
 
 	/*
 	 * Place holder for pcap_next().
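The handshake that these new pcap_t fields support is small enough to
summarize inline.  A minimal sketch of the user-side readiness test, assuming
only the bpf_zbuf_header layout introduced by this patch (the helper name is
hypothetical):

#include <net/bpf.h>

/*
 * Hypothetical helper: non-zero if the zero-copy buffer whose shared header
 * is 'bzh' has been completed by the kernel but not yet acknowledged by user
 * space.  The kernel bumps bzh_kernel_gen when it hands a buffer over; user
 * space bumps bzh_user_gen to hand it back, which is exactly what
 * pcap_ack_zbuf() above does.
 */
static int
zbuf_check(struct bpf_zbuf_header *bzh)
{

	return (bzh->bzh_kernel_gen > bzh->bzh_user_gen);
}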
Index: lib/libpcap/Makefile
===================================================================
RCS file: /home/ncvs/src/lib/libpcap/Makefile,v
retrieving revision 1.39.2.1
diff -u -r1.39.2.1 Makefile
--- lib/libpcap/Makefile	19 Oct 2007 03:04:02 -0000	1.39.2.1
+++ lib/libpcap/Makefile	5 Nov 2007 18:40:34 -0000
@@ -16,6 +16,7 @@
 
 YFLAGS+=-p pcapyy
 LFLAGS+=-Ppcapyy
+CFLAGS+=-I../../sys
 CFLAGS+=-DHAVE_CONFIG_H -Dyylval=pcapyylval -I${.CURDIR} -I.
 CFLAGS+=-D_U_="__attribute__((unused))"
 CFLAGS+=-DHAVE_SNPRINTF -DHAVE_VSNPRINTF
Index: share/man/man4/bpf.4
===================================================================
RCS file: /home/ncvs/src/share/man/man4/bpf.4,v
retrieving revision 1.48
diff -u -r1.48 bpf.4
--- share/man/man4/bpf.4	26 Feb 2007 22:24:14 -0000	1.48
+++ share/man/man4/bpf.4	5 Nov 2007 18:40:34 -0000
@@ -1,3 +1,30 @@
+.\" Copyright (c) 2007 Seccuris Inc.
+.\" All rights reserved.
+.\"
+.\" This software was developed by Robert N. M. Watson under contract to
+.\" Seccuris Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
 .\" Copyright (c) 1990 The Regents of the University of California.
 .\" All rights reserved.
 .\"
@@ -61,18 +88,53 @@
 all file descriptors listening on that interface apply their filter.
 Each descriptor that accepts the packet receives its own copy.
 .Pp
-Reads from these files return the next group of packets
-that have matched the filter.
-To improve performance, the buffer passed to read must be
-the same size as the buffers used internally by
-.Nm .
+.Nm
+devices operate in one of two buffering modes: buffered
+.Xr read 2 ,
+in which packet data is copied from the kernel explicitly using the
+.Xr read 2
+system call, and zero-copy buffer mode, in which the user process provides
+two memory regions that
+.Nm
+will write to directly as packets are accepted.
+The buffering mode may be set with the
+.Dv BIOCSETBUFMODE
+ioctl (see below), and defaults to buffered
+.Xr read 2
+mode
+.Dv ( BPF_BUFMODE_BUFFER ) .
+In both modes, buffers return the next group of packets that have matched
+the filter.
+Note that an individual packet larger than the buffer size is necessarily
+truncated.
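+.Pp
+For example, a process may request zero-copy buffering on an open
+.Nm
+descriptor
+.Va fd
+before registering buffers (illustrative fragment; error handling
+abbreviated):
+.Bd -literal
+	u_int bufmode = BPF_BUFMODE_ZBUF;
+
+	if (ioctl(fd, BIOCSETBUFMODE, &bufmode) < 0)
+		err(1, "BIOCSETBUFMODE");
+.Ed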
+.Pp
+In the case of buffered
+.Xr read 2 ,
+the user process declares a fixed buffer size that will be used both for
+sizing internal buffers and for all
+.Xr read 2
+operations on the file.
 This size is returned by the
 .Dv BIOCGBLEN
 ioctl (see below), and can be set with
 .Dv BIOCSBLEN .
-Note that an individual packet larger than this size is necessarily
-truncated.
+.Pp
+In zero-copy buffering, the user process registers two memory buffers with
+.Nm
+via the
+.Dv BIOCSETZBUF
+ioctl (see below).
+The user process may monitor for completion (filling) of a buffer, at which
+point the memory contents of the buffer will be stable until the buffer is
+returned for further kernel use using the
+.Dv BIOCACKZBUF
+ioctl.
+Buffers must be of a fixed and equal size, page-aligned, and an integer
+multiple of the page size.
+The maximum zero-copy buffer size is returned by the
+.Dv BIOCGETZMAX
+ioctl (see below).
 .Pp
 The packet filter will support any link level protocol that has fixed length
 headers.
@@ -127,7 +189,7 @@
 The (third) argument to
 .Xr ioctl 2
 should be a pointer to the type indicated.
-.Bl -tag -width BIOCGRTIMEOUT
+.Bl -tag -width BIOCGETBUFMODE
 .It Dv BIOCGBLEN
 .Pq Li u_int
 Returns the required buffer length for reads on
@@ -349,6 +411,87 @@
 This prevents the execution of
 ioctl commands which could change the underlying operating parameters of
 the device.
+.It Dv BIOCGETBUFMODE
+.It Dv BIOCSETBUFMODE
+.Pq Li u_int
+Get or set the current
+.Nm
+buffering mode; possible values are
+.Dv BPF_BUFMODE_BUFFER ,
+buffered
+.Xr read 2
+mode, and
+.Dv BPF_BUFMODE_ZBUF ,
+zero-copy buffer mode.
+.It Dv BIOCACKZBUF
+.Pq Li struct bpf_zbuf
+Return a completed zero-copy buffer to the kernel for reuse.
+The following structure is used as an argument to this and other zero-copy
+buffer ioctls:
+.Bd -literal
+struct bpf_zbuf {
+	void *bz_bufa;
+	void *bz_bufb;
+	size_t bz_buflen;
+};
+.Ed
+.Pp
+Only the
+.Vt bz_bufa
+field will be used with this ioctl.
+.It Dv BIOCGETZBUF
+.It Dv BIOCSETZBUF
+.Pq Li struct bpf_zbuf
+Get or set the current zero-copy buffer locations; buffer locations may be
+set only once zero-copy buffer mode has been selected, and prior to attaching
+the
+.Nm
+device to an interface.
+Buffers must be of identical size, page-aligned, and an integer multiple of
+pages in size.
+The three fields
+.Vt bz_bufa ,
+.Vt bz_bufb ,
+and
+.Vt bz_buflen
+must be filled out.
+.It Dv BIOCGETZMAX
+.Pq Li size_t
+Get the largest individual zero-copy buffer size allowed.
+As two buffers are used in zero-copy buffer mode, the limit (in practice) is
+twice the returned size.
+As zero-copy buffers consume kernel address space, conservative selection of
+buffer size is advised, especially when there are multiple
+.Nm
+descriptors in use on 32-bit systems.
+.It Dv BIOCGETZNEXT
+.It Dv BIOCROTZBUF
+.Pq Li struct bpf_zbuf
+Get the buffer pointer and length of the next zero-copy buffer ready for
+userspace use, or
+.Dv NULL
+if there is no pending buffer.
+.Pp
+.Dv BIOCGETZNEXT
+queries for the next completely filled buffer ready for immediate use,
+returning
+.Dv NULL
+if there are only empty or partially filled buffers available.
+.Pp
+.Dv BIOCROTZBUF
+queries for a filled buffer, but in the event there is only a partially
+filled buffer, will make that buffer available for userspace to use
+immediately.
+This allows consumers of zero-copy buffering to implement timeouts and
+retrieve partially filled buffers.
+.Dv BIOCROTZBUF
+will return
+.Dv NULL
+only if no data is present in either of the zero-copy buffers.
+.Pp
+Only the
+.Vt bz_bufa
+and
+.Vt bz_buflen
+fields will be used with this ioctl.
 .El
 .Sh BPF HEADER
 The following structure is prepended to each packet returned by
Index: sys/conf/files
===================================================================
RCS file: /home/ncvs/src/sys/conf/files,v
retrieving revision 1.1243
diff -u -r1.1243 files
--- sys/conf/files	23 Sep 2007 07:34:22 -0000	1.1243
+++ sys/conf/files	5 Nov 2007 18:40:34 -0000
@@ -1595,8 +1595,10 @@
 libkern/strtouq.c	standard
 libkern/strvalid.c	standard
 net/bpf.c		standard
+net/bpf_buffer.c	optional bpf
 net/bpf_jitter.c	optional bpf_jitter
 net/bpf_filter.c	optional bpf | netgraph_bpf
+net/bpf_zerocopy.c	optional bpf_zerocopy
 net/bridgestp.c		optional bridge | if_bridge
 net/bsd_comp.c		optional ppp_bsdcomp
 net/ieee8023ad_lacp.c	optional lagg
Index: sys/conf/options
===================================================================
RCS file: /home/ncvs/src/sys/conf/options,v
retrieving revision 1.608
diff -u -r1.608 options
--- sys/conf/options	23 Sep 2007 07:34:23 -0000	1.608
+++ sys/conf/options	5 Nov 2007 18:40:34 -0000
@@ -478,6 +478,7 @@
 
 # DRM options
 DRM_DEBUG	opt_drm.h
+BPF_ZEROCOPY	opt_bpf.h
 ZERO_COPY_SOCKETS	opt_zero.h
 TI_PRIVATE_JUMBOS	opt_ti.h
 TI_JUMBO_HDRSPLIT	opt_ti.h
Index: sys/net/bpf.c
===================================================================
RCS file: /home/ncvs/src/sys/net/bpf.c,v
retrieving revision 1.181.2.1
diff -u -r1.181.2.1 bpf.c
--- sys/net/bpf.c	20 Oct 2007 15:09:24 -0000	1.181.2.1
+++ sys/net/bpf.c	6 Nov 2007 20:33:15 -0000
@@ -65,9 +65,13 @@
 #include <net/if.h>
 #include <net/bpf.h>
+#include <net/bpf_buffer.h>
 #ifdef BPF_JITTER
 #include <net/bpf_jitter.h>
 #endif
+#ifdef BPF_ZEROCOPY
+#include <net/bpf_zerocopy.h>
+#endif
 #include <net/bpfdesc.h>
 #include <netinet/in.h>
@@ -79,7 +83,7 @@
 #include <security/mac/mac_framework.h>
 
-static MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
+MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 
@@ -97,19 +101,17 @@
 static struct mtx	bpf_mtx;		/* bpf global lock */
 static int		bpf_bpfd_cnt;
 
-static void	bpf_allocbufs(struct bpf_d *);
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void	bpf_detachd(struct bpf_d *);
 static void	bpf_freed(struct bpf_d *);
-static void	bpf_mcopy(const void *, void *, size_t);
 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
 		    struct sockaddr *, int *, struct bpf_insn *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
 static void	bpf_timed_out(void *);
 static __inline void
 		bpf_wakeup(struct bpf_d *);
-static void	catchpacket(struct bpf_d *, u_char *, u_int,
-		    u_int, void (*)(const void *, void *, size_t),
+static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
+		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
 		    struct timeval *);
 static void	reset_d(struct bpf_d *);
 static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
@@ -124,10 +126,13 @@
 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
 static int bpf_bufsize = 4096;
 SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
-    &bpf_bufsize, 0, "Default bpf buffer size");
+    &bpf_bufsize, 0, "");
 static int bpf_maxbufsize = BPF_MAXBUFSIZE;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
-    &bpf_maxbufsize, 0, "Maximum bpf buffer size");
+    &bpf_maxbufsize, 0, "");
+static int bpf_timestamp = 1;
+SYSCTL_INT(_net_bpf, OID_AUTO, timestamp, CTLFLAG_RW,
+    &bpf_timestamp, 0, "timestamp each frame");
 static int bpf_maxinsns = BPF_MAXINSNS;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
     &bpf_maxinsns, 0, "Maximum bpf program instructions");
@@ -157,6 +162,216 @@
 static struct filterops bpfread_filtops =
 	{ 1, NULL, filt_bpfdetach, filt_bpfread };
 
+/*
+ * Wrapper functions for various buffering methods.  If the set of buffer
+ * modes expands, we will probably want to introduce a switch data structure
+ * similar to protosw, etc.
+ */
+static void
+bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+    u_int len)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		d->bd_zcopy++;
+		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
+#endif
+
+	default:
+		panic("bpf_buf_append_bytes");
+	}
+}
+
+static void
+bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+    u_int len)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		d->bd_zcopy++;
+		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
+#endif
+
+	default:
+		panic("bpf_buf_append_mbuf");
+	}
+}
+
+/*
+ * If the buffer mechanism has a way to decide that a held buffer can be made
+ * free, then it is exposed via the bpf_buffree() interface.  (1) is returned
+ * if the buffer can be discarded, (0) is returned if it cannot.
+ */
+static int
+bpf_buffree(struct bpf_d *d)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	switch (d->bd_bufmode) {
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_buffree(d));
+#endif
+	}
+	return (0);
+}
+
+void
+bpf_bufheld(struct bpf_d *d)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	switch (d->bd_bufmode) {
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		bpf_zerocopy_bufheld(d);
+		break;
+#endif
+	}
+}
+
+static void
+bpf_free(struct bpf_d *d)
+{
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_free(d));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_free(d));
+#endif
+
+	default:
+		panic("bpf_buf_free");
+	}
+}
+
+static int
+bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
+{
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_uiomove(d, buf, len, uio));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_uiomove(d, buf, len, uio));
+#endif
+
+	default:
+		panic("bpf_buf_uiomove");
+	}
+}
+
+static int
+bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
+		return (EOPNOTSUPP);
+	return (bpf_buffer_ioctl_sblen(d, i));
+}
+
+static int
+bpf_ioctl_ackzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_ackzbuf(td, d, bz));
+#else
+	panic("bpf_ioctl_ackzbuf");
+#endif
+}
+
+static int
+bpf_ioctl_getzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_getzbuf(td, d, bz));
+#else
+	panic("bpf_ioctl_getzbuf");
+#endif
+}
+
+static int
+bpf_ioctl_getznext(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_getznext(td, d, bz));
+#else
+	panic("bpf_ioctl_getznext");
+#endif
+}
+
+static int
+bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
+#else
+	panic("bpf_ioctl_getzmax");
+#endif
+}
+
+static int
+bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
+#else
+	panic("bpf_ioctl_rotzbuf");
+#endif
+}
+
+static int
+bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
+#else
+	panic("bpf_ioctl_setzbuf");
+#endif
+}
+
+/*
+ * General BPF functions.
+ */
 static int
 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
     struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter)
@@ -411,7 +626,14 @@
 	    "bpf%d", dev2unit(dev));
 	MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
 	dev->si_drv1 = d;
-	d->bd_bufsize = bpf_bufsize;
+
+	/*
+	 * XXXRW: For historical reasons, perform a one-time initialization
+	 * call to the buffer routines, even though we're not yet committed
+	 * to a particular buffer method.
+	 */
+	bpf_buffer_init(d);
+	d->bd_bufmode = BPF_BUFMODE_DEFAULT;
 	d->bd_sig = SIGIO;
 	d->bd_direction = BPF_D_INOUT;
 	d->bd_pid = td->td_proc->p_pid;
@@ -458,18 +680,6 @@
 	return (0);
 }
 
-
-/*
- * Rotate the packet buffers in descriptor d.  Move the store buffer
- * into the hold slot, and the free buffer into the store slot.
- * Zero the length of the new store buffer.
- */
-#define ROTATE_BUFFERS(d) \
-	(d)->bd_hbuf = (d)->bd_sbuf; \
-	(d)->bd_hlen = (d)->bd_slen; \
-	(d)->bd_sbuf = (d)->bd_fbuf; \
-	(d)->bd_slen = 0; \
-	(d)->bd_fbuf = NULL;
 /*
  * bpfread - read next chunk of packets from buffers
  */
@@ -489,6 +699,15 @@
 	BPFD_LOCK(d);
 	d->bd_pid = curthread->td_proc->p_pid;
+	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
+		/*
+		 * XXXRW: For now, we don't implement a uiomove for the
+		 * scatter-gather buffers associated with BPF_BUFMODE_ZBUF,
+		 * so simply disallow read().
+		 */
+		BPFD_UNLOCK(d);
+		return (EOPNOTSUPP);
+	}
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	timed_out = (d->bd_state == BPF_TIMED_OUT);
@@ -561,8 +780,12 @@
 	 * Move data from hold buffer into user space.
 	 * We know the entire buffer is transferred since
 	 * we checked above that the read buffer is bpf_bufsize bytes.
+	 *
+	 * XXXRW: More synchronization needed here: what if a second thread
+	 * issues a read on the same fd at the same time?  Don't want this
+	 * getting invalidated.
 	 */
-	error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
+	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
 
 	BPFD_LOCK(d);
 	d->bd_fbuf = d->bd_hbuf;
@@ -573,7 +796,6 @@
 	return (error);
 }
 
-
 /*
  * If there are processes sleeping on this descriptor, wake them up.
 */
@@ -609,6 +831,23 @@
 }
 
 static int
+bpf_ready(struct bpf_d *d)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	// printf("bpf_ready: hlen: %d, immediate %d, state %d, slen %d\n",
+	//     d->bd_hlen, d->bd_immediate, d->bd_state, d->bd_slen);
+
+	if (!bpf_buffree(d) && d->bd_hlen != 0)
+		return (1);
+	if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
+	    d->bd_slen != 0)
+		return (1);
+	return (0);
+}
+
+static int
 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct bpf_d *d = dev->si_drv1;
@@ -618,25 +857,34 @@
 	int error, hlen;
 
 	d->bd_pid = curthread->td_proc->p_pid;
-	if (d->bd_bif == NULL)
+	d->bd_wcount++;
+	if (d->bd_bif == NULL) {
+		d->bd_wdcount++;
 		return (ENXIO);
+	}
 
 	ifp = d->bd_bif->bif_ifp;
-	if ((ifp->if_flags & IFF_UP) == 0)
+	if ((ifp->if_flags & IFF_UP) == 0) {
+		d->bd_wdcount++;
 		return (ENETDOWN);
+	}
 
-	if (uio->uio_resid == 0)
+	if (uio->uio_resid == 0) {
+		d->bd_wdcount++;
 		return (0);
+	}
 
 	bzero(&dst, sizeof(dst));
 	m = NULL;
 	hlen = 0;
 	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
 	    &m, &dst, &hlen, d->bd_wfilter);
-	if (error)
+	if (error) {
+		d->bd_wdcount++;
 		return (error);
-
+	}
+	d->bd_wfcount++;
 	if (d->bd_hdrcmplt)
 		dst.sa_family = pseudo_AF_HDRCMPLT;
 
@@ -663,6 +911,8 @@
 #endif
 
 	error = (*ifp->if_output)(ifp, m, &dst, NULL);
+	if (error)
+		d->bd_wdcount++;
 
 	if (mc != NULL) {
 		if (error == 0)
@@ -693,6 +943,10 @@
 	d->bd_rcount = 0;
 	d->bd_dcount = 0;
 	d->bd_fcount = 0;
+	d->bd_wcount = 0;
+	d->bd_wfcount = 0;
+	d->bd_wdcount = 0;
+	d->bd_zcopy = 0;
 }
 
 /*
@@ -717,6 +971,11 @@
  *  BIOCSDIRECTION	Set packet direction flag
  *  BIOCLOCK		Set "locked" flag
  *  BIOCFEEDBACK	Set packet feedback mode.
+ *  BIOCGETZBUF		Query current zero-copy buffer locations.
+ *  BIOCSETZBUF		Set current zero-copy buffer locations.
+ *  BIOCACKZBUF		Acknowledge reading zero-copy buffers.
+ *  BIOCGETZMAX		Get maximum zero-copy buffer size.
+ *  BIOCGETZNEXT	Get next ready zero-copy buffer location.
 */
 /* ARGSUSED */
 static int
@@ -725,7 +984,7 @@
 {
 	struct bpf_d *d = dev->si_drv1;
 	int error = 0;
-	
+
 	/*
 	 * Refresh PID associated with this descriptor.
 	 */
@@ -754,6 +1013,8 @@
 		case BIOCSRTIMEOUT:
 		case BIOCIMMEDIATE:
 		case TIOCGPGRP:
+		case BIOCACKZBUF:
+		case BIOCGETZBUF:
 			break;
 		default:
 			return (EPERM);
@@ -806,17 +1067,7 @@
 	 * Set buffer length.
 	 */
 	case BIOCSBLEN:
-		if (d->bd_bif != NULL)
-			error = EINVAL;
-		else {
-			u_int size = *(u_int *)addr;
-
-			if (size > bpf_maxbufsize)
-				*(u_int *)addr = size = bpf_maxbufsize;
-			else if (size < BPF_MINBUFSIZE)
-				*(u_int *)addr = size = BPF_MINBUFSIZE;
-			d->bd_bufsize = size;
-		}
+		error = bpf_ioctl_sblen(d, (u_int *)addr);
 		break;
 
 	/*
@@ -1051,6 +1302,62 @@
 	case BIOCGRSIG:
 		*(u_int *)addr = d->bd_sig;
 		break;
+
+	case BIOCGETBUFMODE:
+		*(u_int *)addr = d->bd_bufmode;
+		break;
+
+	case BIOCSETBUFMODE:
+		/*
+		 * Allow the buffering mode to be changed as long as we
+		 * haven't yet committed to a particular mode.  Our
+		 * definition of commitment, for now, is whether or not a
+		 * buffer has been allocated or an interface attached, since
+		 * that's the point where things get tricky.
+		 *
+		 * XXXRW: This will need some refinement.  Is checking both
+		 * for buffers and interface binding redundant?
+		 */
+		switch (*(u_int *)addr) {
+		case BPF_BUFMODE_BUFFER:
+			break;
+
+#ifdef BPF_ZEROCOPY
+		case BPF_BUFMODE_ZBUF:
+			break;
+#endif
+
+		default:
+			return (EINVAL);
+		}
+
+		BPFD_LOCK(d);
+		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
+		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
+			BPFD_UNLOCK(d);
+			return (EBUSY);
+		}
+		d->bd_bufmode = *(u_int *)addr;
+		BPFD_UNLOCK(d);
+		break;
+
+	case BIOCACKZBUF:
+		return (bpf_ioctl_ackzbuf(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCGETZBUF:
+		return (bpf_ioctl_getzbuf(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCGETZMAX:
+		return (bpf_ioctl_getzmax(td, d, (size_t *)addr));
+
+	case BIOCGETZNEXT:
+		return (bpf_ioctl_getznext(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCSETZBUF:
+		return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCROTZBUF:
+		return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr));
 	}
 	return (error);
 }
@@ -1151,13 +1458,33 @@
 		return (ENXIO);
 
 	bp = theywant->if_bpf;
+
 	/*
-	 * Allocate the packet buffers if we need to.
-	 * If we're already attached to requested interface,
-	 * just flush the buffer.
-	 */
-	if (d->bd_sbuf == NULL)
-		bpf_allocbufs(d);
+	 * Behavior here depends on the buffering model.  If we're using
+	 * kernel memory buffers, then we can allocate them here.  If we're
+	 * using zero-copy, then the user process must have registered
+	 * buffers by the time we get here.  If not, return an error.
+	 *
+	 * XXXRW: Could this be better abstracted?
+	 *
+	 * XXXRW: There are locking issues here with multi-threaded use: what
+	 * if two threads try to set the interface at once?
+	 */
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		if (d->bd_sbuf == NULL)
+			bpf_buffer_alloc(d);
+		KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL"));
+		break;
+
+	case BPF_BUFMODE_ZBUF:
+		if (d->bd_sbuf == NULL)
+			return (EINVAL);
+		break;
+
+	default:
+		panic("bpf_setif: bufmode %d", d->bd_bufmode);
+	}
 	if (bp != d->bd_bif) {
 		if (d->bd_bif)
 			/*
@@ -1295,43 +1622,23 @@
 		if (slen != 0) {
 			d->bd_fcount++;
 			if (!gottime) {
-				microtime(&tv);
+				if (bpf_timestamp == 0)
+					bzero(&tv, sizeof(tv));
+				else
+					microtime(&tv);
 				gottime = 1;
 			}
 #ifdef MAC
 			if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
-				catchpacket(d, pkt, pktlen, slen, bcopy, &tv);
+				catchpacket(d, pkt, pktlen, slen,
+				    bpf_append_bytes, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
 	BPFIF_UNLOCK(bp);
 }
 
-/*
- * Copy data from an mbuf chain into a buffer.  This code is derived
- * from m_copydata in sys/uipc_mbuf.c.
- */
-static void
-bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
-{
-	const struct mbuf *m;
-	u_int count;
-	u_char *dst;
-
-	m = src_arg;
-	dst = dst_arg;
-	while (len > 0) {
-		if (m == NULL)
-			panic("bpf_mcopy");
-		count = min(m->m_len, len);
-		bcopy(mtod(m, void *), dst, count);
-		m = m->m_next;
-		dst += count;
-		len -= count;
-	}
-}
-
 #define	BPF_CHECK_DIRECTION(d, m) \
 	if (((d)->bd_direction == BPF_D_IN && (m)->m_pkthdr.rcvif == NULL) || \
 	    ((d)->bd_direction == BPF_D_OUT && (m)->m_pkthdr.rcvif != NULL))
@@ -1374,14 +1681,17 @@
 		if (slen != 0) {
 			d->bd_fcount++;
 			if (!gottime) {
-				microtime(&tv);
+				if (bpf_timestamp == 0)
+					bzero(&tv, sizeof(tv));
+				else
+					microtime(&tv);
 				gottime = 1;
 			}
 #ifdef MAC
 			if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)m, pktlen, slen,
-				    bpf_mcopy, &tv);
+				    bpf_append_mbuf, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1429,14 +1739,17 @@
 		if (slen != 0) {
 			d->bd_fcount++;
 			if (!gottime) {
-				microtime(&tv);
+				if (bpf_timestamp == 0)
+					bzero(&tv, sizeof(tv));
+				else
+					microtime(&tv);
 				gottime = 1;
 			}
 #ifdef MAC
 			if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)&mb, pktlen, slen,
-				    bpf_mcopy, &tv);
+				    bpf_append_mbuf, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1449,19 +1762,34 @@
  * Move the packet data from interface memory (pkt) into the
  * store buffer.  "cpfn" is the routine called to do the actual data
- * transfer.  bcopy is passed in to copy contiguous chunks, while
- * bpf_mcopy is passed in to copy mbuf chains.  In the latter case,
+ * transfer.  bpf_append_bytes is passed in to copy contiguous chunks,
+ * while bpf_append_mbuf is passed in to copy mbuf chains.  In the latter
+ * case, pkt is really an mbuf.
- * pkt is really an mbuf.
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
-    void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
+    void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
+    struct timeval *tv)
 {
-	struct bpf_hdr *hp;
+	struct bpf_hdr hdr;
 	int totlen, curlen;
 	int hdrlen = d->bd_bif->bif_hdrlen;
 	int do_wakeup = 0;
 
 	BPFD_LOCK_ASSERT(d);
+
+	/*
+	 * Detect whether user space has released a buffer back to us, and if
+	 * so, move it from being a hold buffer to a free buffer.  This may
+	 * not be the best place to do it (for example, we might only want to
+	 * run this check if we need the space), but for now it's a reliable
+	 * spot to do it.
+	 */
+	if (bpf_buffree(d)) {
+		d->bd_fbuf = d->bd_hbuf;
+		d->bd_hbuf = NULL;
+		d->bd_hlen = 0;
+	}
+
 	/*
 	 * Figure out how many bytes to move.  If the packet is
 	 * greater or equal to the snapshot length, transfer that
@@ -1496,65 +1824,52 @@
 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
 		/*
-		 * Immediate mode is set, or the read timeout has
-		 * already expired during a select call.  A packet
-		 * arrived, so the reader should be woken up.
+		 * Immediate mode is set, or the read timeout has already
+		 * expired during a select call.  A packet arrived, so the
+		 * reader should be woken up.
 		 */
 		do_wakeup = 1;
 
 	/*
-	 * Append the bpf header.
+	 * Append the bpf header.  Note we append the actual header size, but
+	 * move forward the length of the header plus padding.
 	 */
-	hp = (struct bpf_hdr *)(d->bd_sbuf + curlen);
-	hp->bh_tstamp = *tv;
-	hp->bh_datalen = pktlen;
-	hp->bh_hdrlen = hdrlen;
+	bzero(&hdr, sizeof(hdr));
+	hdr.bh_tstamp = *tv;
+	hdr.bh_datalen = pktlen;
+	hdr.bh_hdrlen = hdrlen;
+	hdr.bh_caplen = totlen - hdrlen;
+	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
+
 	/*
 	 * Copy the packet data into the store buffer and update its length.
 	 */
-	(*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
+	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen);
 	d->bd_slen = curlen + totlen;
 
+	/*
+	 * XXXCSJP: we could probably save a syscall per wakeup if we check
+	 * the d->bd_immediate flag, hold buffer status and rotate the
+	 * buffers before the wakeup.
+	 */
 	if (do_wakeup)
 		bpf_wakeup(d);
 }
 
 /*
- * Initialize all nonzero fields of a descriptor.
- */
-static void
-bpf_allocbufs(struct bpf_d *d)
-{
-
-	KASSERT(d->bd_fbuf == NULL, ("bpf_allocbufs: bd_fbuf != NULL"));
-	KASSERT(d->bd_sbuf == NULL, ("bpf_allocbufs: bd_sbuf != NULL"));
-	KASSERT(d->bd_hbuf == NULL, ("bpf_allocbufs: bd_hbuf != NULL"));
-
-	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
-	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
-	d->bd_slen = 0;
-	d->bd_hlen = 0;
-}
-
-/*
  * Free buffers currently in use by a descriptor.
  * Called on close.
  */
 static void
 bpf_freed(struct bpf_d *d)
 {
+
 	/*
 	 * We don't need to lock out interrupts since this descriptor has
 	 * been detached from its interface and it yet hasn't been marked
 	 * free.
 	 */
-	if (d->bd_sbuf != NULL) {
-		free(d->bd_sbuf, M_BPF);
-		if (d->bd_hbuf != NULL)
-			free(d->bd_hbuf, M_BPF);
-		if (d->bd_fbuf != NULL)
-			free(d->bd_fbuf, M_BPF);
-	}
+	bpf_free(d);
 	if (d->bd_rfilter) {
 		free((caddr_t)d->bd_rfilter, M_BPF);
 #ifdef BPF_JITTER
@@ -1775,6 +2090,10 @@
 		strlcpy(d->bd_ifname,
 		    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
 	d->bd_locked = bd->bd_locked;
+	d->bd_wcount = bd->bd_wcount;
+	d->bd_wdcount = bd->bd_wdcount;
+	d->bd_wfcount = bd->bd_wfcount;
+	d->bd_zcopy = bd->bd_zcopy;
 }
 
 static int
Index: sys/net/bpf.h
===================================================================
RCS file: /home/ncvs/src/sys/net/bpf.h,v
retrieving revision 1.47.2.1
diff -u -r1.47.2.1 bpf.h
--- sys/net/bpf.h	21 Oct 2007 14:05:27 -0000	1.47.2.1
+++ sys/net/bpf.h	5 Nov 2007 18:40:34 -0000
@@ -92,6 +92,44 @@
 #define BPF_MAJOR_VERSION 1
 #define BPF_MINOR_VERSION 1
 
+/*
+ * Historically, BPF has supported a single buffering model, first using mbuf
+ * clusters in kernel, and later using malloc(9) buffers in kernel.  We now
+ * support multiple buffering modes, which may be queried and set using
+ * BIOCGETBUFMODE and BIOCSETBUFMODE.  So as to avoid handling the complexity
+ * of changing modes while sniffing packets, the mode becomes fixed once an
+ * interface has been attached to the BPF descriptor.
+ */
+#define	BPF_BUFMODE_BUFFER	1	/* Kernel buffers with read(). */
+#define	BPF_BUFMODE_ZBUF	2	/* Zero-copy buffers. */
+
+#define	BPF_BUFMODE_DEFAULT	BPF_BUFMODE_BUFFER	/* Default. */
+
+/*
+ * Struct used by BIOCACKZBUF, BIOCGETZNEXT, BIOCGETZBUF, BIOCSETZBUF:
+ * describes up to two zero-copy buffers as used by BPF.
+ *
+ * BIOCACKZBUF	Acknowledge read of stored zero-copy buffer (rotate).
+ * BIOCGETZBUF	Query current zero-copy buffer locations.
+ * BIOCGETZNEXT	Query next stored buffer, if available.
+ * BIOCSETZBUF	Set current zero-copy buffer locations (once only).
+ *
+ * Pointers may be set to NULL to indicate a buffer is not configured, should
+ * be freed, or is not being acknowledged.
+ */
+struct bpf_zbuf {
+	void	*bz_bufa;	/* Location of 'a' zero-copy buffer. */
+	void	*bz_bufb;	/* Location of 'b' zero-copy buffer. */
+	size_t	 bz_buflen;	/* Size of zero-copy buffers. */
+};
+
+/* Packet directions */
+enum bpf_direction {
+	BPF_D_IN,	/* See incoming packets */
+	BPF_D_INOUT,	/* See incoming and outgoing packets */
+	BPF_D_OUT	/* See outgoing packets */
+};
+
 #define	BIOCGBLEN	_IOR('B',102, u_int)
 #define	BIOCSBLEN	_IOWR('B',102, u_int)
 #define	BIOCSETF	_IOW('B',103, struct bpf_program)
@@ -115,18 +153,19 @@
 #define	BIOCGDLTLIST	_IOWR('B',121, struct bpf_dltlist)
 #define	BIOCLOCK	_IO('B', 122)
 #define	BIOCSETWF	_IOW('B',123, struct bpf_program)
-#define	BIOCFEEDBACK	_IOW('B',124, u_int)
-
 /* Obsolete */
-#define	BIOCGSEESENT	BIOCGDIRECTION
-#define	BIOCSSEESENT	BIOCSDIRECTION
+#define	BIOCGSEESENT	BIOCGDIRECTION
+#define	BIOCSSEESENT	BIOCSDIRECTION
 
-/* Packet directions */
-enum bpf_direction {
-	BPF_D_IN,	/* See incoming packets */
-	BPF_D_INOUT,	/* See incoming and outgoing packets */
-	BPF_D_OUT	/* See outgoing packets */
-};
+#define	BIOCGETBUFMODE	_IOR('B', 124, u_int)
+#define	BIOCSETBUFMODE	_IOW('B', 125, u_int)
+#define	BIOCACKZBUF	_IOW('B', 126, struct bpf_zbuf)
+#define	BIOCGETZBUF	_IOR('B', 127, struct bpf_zbuf)
+#define	BIOCGETZMAX	_IOR('B', 128, size_t)
+#define	BIOCGETZNEXT	_IOR('B', 129, struct bpf_zbuf)
+#define	BIOCROTZBUF	_IOR('B', 130, struct bpf_zbuf)
+#define	BIOCSETZBUF	_IOW('B', 131, struct bpf_zbuf)
+#define	BIOCFEEDBACK	_IOW('B', 132, u_int)
 
 /*
  * Structure prepended to each packet.
@@ -149,6 +188,21 @@
 #endif
 
 /*
+ * When using zero-copy BPF buffers, a shared memory header is present
+ * allowing the kernel BPF implementation and user process to synchronize
+ * without using system calls.  This structure defines that header.
+ *
+ * The layout of this structure is critical, and must not be changed; it
+ * must fit in a single page on all architectures.
+ */
+struct bpf_zbuf_header {
+	volatile u_int	bzh_kernel_gen;	/* Kernel generation number. */
+	volatile u_int	bzh_kernel_len;	/* Length of buffer. */
+	volatile u_int	bzh_user_gen;	/* User generation number. */
+	u_int _bzh_pad[5];
+};
+
+/*
  * Data-link level type codes.
  */
 #define DLT_NULL	0	/* BSD loopback encapsulation */
@@ -761,6 +815,29 @@
 };
 
 #ifdef _KERNEL
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_BPF);
+#endif
+
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_bpf);
+#endif
+
+/*
+ * Rotate the packet buffers in descriptor d.  Move the store buffer into
+ * the hold slot, and the free buffer into the store slot.  Zero the length
+ * of the new store buffer.  Descriptor lock should be held.
+ */
+#define	ROTATE_BUFFERS(d)	do {					\
+	(d)->bd_hbuf = (d)->bd_sbuf;					\
+	(d)->bd_hlen = (d)->bd_slen;					\
+	(d)->bd_sbuf = (d)->bd_fbuf;					\
+	(d)->bd_slen = 0;						\
+	(d)->bd_fbuf = NULL;						\
+	bpf_bufheld(d);							\
+} while (0)
+
 /*
  * Descriptor associated with each attached hardware interface.
  */
@@ -773,6 +850,7 @@
 	struct mtx	bif_mtx;	/* mutex for interface */
 };
 
+void	 bpf_bufheld(struct bpf_d *d);
 int	 bpf_validate(const struct bpf_insn *, int);
 void	 bpf_tap(struct bpf_if *, u_char *, u_int);
 void	 bpf_mtap(struct bpf_if *, struct mbuf *);
Index: sys/net/bpf_buffer.c
===================================================================
RCS file: sys/net/bpf_buffer.c
diff -N sys/net/bpf_buffer.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/net/bpf_buffer.c	5 Nov 2007 18:52:09 -0000
@@ -0,0 +1,226 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (c) 1990, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)bpf.c	8.4 (Berkeley) 1/9/95
+ *
+ * $FreeBSD: src/sys/net/bpf.c,v 1.174 2006/11/06 13:42:02 rwatson Exp $
+ */
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpf_buffer.h>
+#include <net/bpfdesc.h>
+
+/*
+ * Implement the historical kernel memory buffering model for BPF: two
+ * malloc(9) kernel buffers are hung off of the descriptor.  The size is
+ * fixed prior to
The size is fixed prior to + * attaching to an ifnet, ad cannot be changed after that. read(2) simply + * copies the data to user space using uiomove(9). + */ + +static int bpf_bufsize = 4096; +SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW, + &bpf_bufsize, 0, ""); +static int bpf_maxbufsize = BPF_MAXBUFSIZE; +SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW, + &bpf_maxbufsize, 0, ""); + +void +bpf_buffer_alloc(struct bpf_d *d) +{ + + KASSERT(d->bd_fbuf == NULL, ("bpf_buffer_alloc: bd_fbuf != NULL")); + KASSERT(d->bd_sbuf == NULL, ("bpf_buffer_alloc: bd_sbuf != NULL")); + KASSERT(d->bd_hbuf == NULL, ("bpf_buffer_alloc: bd_hbuf != NULL")); + + // printf("bpf_buffer_alloc size %d\n", d->bd_bufsize); + + d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); + d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); + d->bd_hbuf = NULL; + d->bd_slen = 0; + d->bd_hlen = 0; +} + +/* + * Simple data copy to the current kernel buffer. + */ +void +bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, + void *src, u_int len) +{ + u_char *src_bytes; + + // printf("bpf_buffer_append_bytes size %d\n", len); + + src_bytes = (u_char *)src; + bcopy(src_bytes, buf + offset, len); +} + +/* + * Scatter-gather data copy from an mbuf chain to the current kernel buffer. + */ +void +bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + const struct mbuf *m; + u_char *dst; + u_int count; + + // printf("bpf_buffer_append_mbuf size %d\n", len); + + m = (struct mbuf *)src; + dst = (u_char *)buf + offset; + while (len > 0) { + if (m == NULL) + panic("bpf_mcopy"); + count = min(m->m_len, len); + bcopy(mtod(m, void *), dst, count); + m = m->m_next; + dst += count; + len -= count; + } +} + +/* + * Free BPF kernel buffers on device close. + */ +void +bpf_buffer_free(struct bpf_d *d) +{ + + // printf("bpf_buffer_free(sbuf: %p, hbuf: %p, fbuf: %p)\n", + // d->bd_sbuf, d->bd_hbuf, d->bd_fbuf); + + if (d->bd_sbuf != NULL) + free(d->bd_sbuf, M_BPF); + if (d->bd_hbuf != NULL) + free(d->bd_hbuf, M_BPF); + if (d->bd_fbuf != NULL) + free(d->bd_fbuf, M_BPF); + +#ifdef INVARIANTS + d->bd_sbuf = d->bd_hbuf = d->bd_fbuf = (caddr_t)~0; +#endif +} + +/* + * This is a historical initialization that occurs when the BPF descriptor is + * first opened. It does not imply selection of a buffer mode, so we don't + * allocate buffers here. + */ +void +bpf_buffer_init(struct bpf_d *d) +{ + + // printf("bpf_buffer_init: bufsize %d\n", bpf_bufsize); + + d->bd_bufsize = bpf_bufsize; +} + +/* + * Allocate or resize buffers. + */ +int +bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i) +{ + u_int size; + + // printf("bpf_buffer_ioctl_sblen %d\n", *i); + + BPFD_LOCK(d); + if (d->bd_bif != NULL) { + BPFD_UNLOCK(d); + return (EINVAL); + } + size = *i; + if (size > bpf_maxbufsize) + *i = size = bpf_maxbufsize; + else if (size < BPF_MINBUFSIZE) + *i = size = BPF_MINBUFSIZE; + d->bd_bufsize = size; + BPFD_UNLOCK(d); + + // printf("bpf_buffer_ioctl_sblen (req: %d, set: %d)\n", *i, size); + return (0); +} + +/* + * Copy buffer storage to user space in read(). 
+ */
+int
+bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
+{
+
+	// printf("bpf_buffer_uiomove %d bytes\n", len);
+
+	return (uiomove(buf, len, uio));
+}
Index: sys/net/bpf_buffer.h
===================================================================
RCS file: sys/net/bpf_buffer.h
diff -N sys/net/bpf_buffer.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/net/bpf_buffer.h	5 Nov 2007 18:53:33 -0000
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_BPF_BUFFER_H_
+#define	_NET_BPF_BUFFER_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void	bpf_buffer_alloc(struct bpf_d *d);
+void	bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_buffer_free(struct bpf_d *d);
+void	bpf_buffer_init(struct bpf_d *d);
+int	bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i);
+int	bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len,
+	    struct uio *uio);
+
+#endif /* !_NET_BPF_BUFFER_H_ */
Index: sys/net/bpf_filter.c
===================================================================
RCS file: /home/ncvs/src/sys/net/bpf_filter.c,v
retrieving revision 1.28
diff -u -r1.28 bpf_filter.c
--- sys/net/bpf_filter.c	13 Sep 2007 09:00:32 -0000	1.28
+++ sys/net/bpf_filter.c	5 Nov 2007 18:40:34 -0000
@@ -83,14 +83,11 @@
 static u_int32_t	m_xword(struct mbuf *m, bpf_u_int32 k, int *err);
 
 static u_int32_t
-m_xword(m, k, err)
-	register struct mbuf *m;
-	register bpf_u_int32 k;
-	register int *err;
+m_xword(struct mbuf *m, bpf_u_int32 k, int *err)
 {
-	register size_t len;
-	register u_char *cp, *np;
-	register struct mbuf *m0;
+	size_t len;
+	u_char *cp, *np;
+	struct mbuf *m0;
 
 	len = m->m_len;
 	while (k >= len) {
@@ -111,21 +108,18 @@
 	*err = 0;
 	np = mtod(m0, u_char *);
 	switch (len - k) {
-
 	case 1:
 		return ((u_int32_t)cp[0] << 24) |
 		    ((u_int32_t)np[0] << 16) |
 		    ((u_int32_t)np[1] << 8)  |
 		    (u_int32_t)np[2];
-
 	case 2:
 		return ((u_int32_t)cp[0] << 24) |
 		    ((u_int32_t)cp[1] << 16) |
 		    ((u_int32_t)np[0] << 8)  |
 		    (u_int32_t)np[1];
-
 	default:
 		return ((u_int32_t)cp[0] << 24) |
@@ -135,18 +129,15 @@
 	}
     bad:
 	*err = 1;
-	return 0;
+	return (0);
 }
 
 static u_int16_t
-m_xhalf(m, k, err)
-	register struct mbuf *m;
-	register bpf_u_int32 k;
-	register int *err;
+m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err)
 {
-	register size_t len;
-	register u_char *cp;
-	register struct mbuf *m0;
+	size_t len;
+	u_char *cp;
+	struct mbuf *m0;
 
 	len = m->m_len;
 	while (k >= len) {
@@ -159,16 +150,16 @@
 	cp = mtod(m, u_char *) + k;
 	if (len - k >= 2) {
 		*err = 0;
-		return EXTRACT_SHORT(cp);
+		return (EXTRACT_SHORT(cp));
 	}
 	m0 = m->m_next;
 	if (m0 == 0)
 		goto bad;
 	*err = 0;
-	return (cp[0] << 8) | mtod(m0, u_char *)[0];
+	return ((cp[0] << 8) | mtod(m0, u_char *)[0]);
     bad:
 	*err = 1;
-	return 0;
+	return (0);
 }
 #endif
 
@@ -178,21 +169,17 @@
  * buflen is the amount of data present
 */
 u_int
-bpf_filter(pc, p, wirelen, buflen)
-	register const struct bpf_insn *pc;
-	register u_char *p;
-	u_int wirelen;
-	register u_int buflen;
+bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
 {
-	register u_int32_t A = 0, X = 0;
-	register bpf_u_int32 k;
+	u_int32_t A = 0, X = 0;
+	bpf_u_int32 k;
 	u_int32_t mem[BPF_MEMWORDS];
 
-	if (pc == 0)
+	if (pc == NULL)
 		/*
 		 * No filter means accept all.
 		 */
-		return (u_int)-1;
+		return ((u_int)-1);
 
 	--pc;
 	while (1) {
@@ -206,10 +193,10 @@
 			abort();
 #endif
 		case BPF_RET|BPF_K:
-			return (u_int)pc->k;
+			return ((u_int)pc->k);
 
 		case BPF_RET|BPF_A:
-			return (u_int)A;
+			return ((u_int)A);
 
 		case BPF_LD|BPF_W|BPF_ABS:
 			k = pc->k;
@@ -224,7 +211,7 @@
 					return 0;
 				continue;
 #else
-				return 0;
+				return (0);
 #endif
 			}
 #ifdef BPF_ALIGN
@@ -256,7 +243,7 @@
 			k = pc->k;
 			if (k >= buflen) {
 #ifdef _KERNEL
-				register struct mbuf *m;
+				struct mbuf *m;
 
 				if (buflen != 0)
 					return 0;
@@ -287,13 +274,13 @@
 				int merr;
 
 				if (buflen != 0)
-					return 0;
+					return (0);
 				A = m_xword((struct mbuf *)p, k, &merr);
 				if (merr != 0)
-					return 0;
+					return (0);
 				continue;
#else
-				return 0;
+				return (0);
 #endif
 			}
 #ifdef BPF_ALIGN
@@ -315,10 +302,10 @@
 					return 0;
 				A = m_xhalf((struct mbuf *)p, k, &merr);
 				if (merr != 0)
-					return 0;
+					return (0);
 				continue;
 #else
-				return 0;
+				return (0);
 #endif
 			}
 			A = EXTRACT_SHORT(&p[k]);
@@ -328,7 +315,7 @@
 			k = X + pc->k;
 			if (pc->k >= buflen || X >= buflen - pc->k) {
 #ifdef _KERNEL
-				register struct mbuf *m;
+				struct mbuf *m;
 
 				if (buflen != 0)
 					return 0;
@@ -337,7 +324,7 @@
 				A = mtod(m, u_char *)[k];
 				continue;
 #else
-				return 0;
+				return (0);
 #endif
 			}
 			A = p[k];
Index: sys/net/bpf_zerocopy.c
===================================================================
RCS file: sys/net/bpf_zerocopy.c
diff -N sys/net/bpf_zerocopy.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/net/bpf_zerocopy.c	5 Nov 2007 18:52:28 -0000
@@ -0,0 +1,635 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sf_buf.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpf_zerocopy.h>
+#include <net/bpfdesc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
+/*
+ * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
+ * are mapped into the kernel address space using sf_bufs and used directly
+ * by BPF.  Memory is wired since page faults cannot be tolerated in the
+ * contexts where the buffers are copied to (locks held, interrupt context,
+ * etc).
+ */
+
+/*
+ * Maximum number of pages per buffer.  Since all BPF devices use two, the
+
+/*
+ * struct zbuf describes a memory buffer loaned by a user process to the
+ * kernel.  We represent this as a series of pages managed using an array of
+ * sf_bufs.  Even though the memory is contiguous in user space, it may not
+ * be mapped contiguously in the kernel (i.e., a set of physically
+ * non-contiguous pages in the direct map region) so we must implement
+ * scatter-gather copying.
+ *
+ * At the front of the shared memory region is a bpf_zbuf_header, which
+ * contains shared control data to allow user space and the kernel to
+ * synchronize; this is included in zb_size, but not bpf_bufsize, so that BPF
+ * knows that the space is not available.
+ */
+struct zbuf {
+	vm_offset_t	zb_uaddr;	/* User address, may be stale. */
+	size_t		zb_size;	/* Size of buffer, incl. header. */
+	u_int		zb_numpages;	/* Number of pages. */
+	struct sf_buf	**zb_pages;	/* Pages themselves. */
+	struct bpf_zbuf_header	*zb_header;	/* Shared header. */
+};
+
+/*
+ * Release a page we've previously wired.
+ */
+static void
+zbuf_page_free(vm_page_t pp)
+{
+
+	vm_page_lock_queues();
+	vm_page_unwire(pp, 0);
+	if (pp->wire_count == 0 && pp->object == NULL)
+		vm_page_free(pp);
+	vm_page_unlock_queues();
+}
+
+/*
+ * Free an sf_buf with attached page.
+ */
+static void
+zbuf_sfbuf_free(struct sf_buf *sf)
+{
+	vm_page_t pp;
+
+	pp = sf_buf_page(sf);
+	sf_buf_free(sf);
+	zbuf_page_free(pp);
+}
+
+/*
+ * Free a zbuf, including its page array, sf_bufs, and pages.  Allow
+ * partially allocated zbufs to be freed, so that this may be used even
+ * during zbuf setup.
+ */
+static void
+zbuf_free(struct zbuf *zb)
+{
+	int i;
+
+	for (i = 0; i < zb->zb_numpages; i++) {
+		if (zb->zb_pages[i] != NULL)
+			zbuf_sfbuf_free(zb->zb_pages[i]);
+	}
+	free(zb->zb_pages, M_BPF);
+	free(zb, M_BPF);
+}
+
+/*
+ * Given a user pointer to a page of user memory, return an sf_buf for the
+ * page.  Because we may be requesting quite a few sf_bufs, prefer failure
+ * to deadlock and use SFB_NOWAIT.
+ */
+static struct sf_buf *
+zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
+{
+	struct sf_buf *sf;
+	vm_page_t pp;
+
+	if (vm_fault_quick((caddr_t)uaddr, VM_PROT_READ | VM_PROT_WRITE)
+	    < 0)
+		return (NULL);
+	pp = pmap_extract_and_hold(map->pmap, uaddr,
+	    VM_PROT_READ | VM_PROT_WRITE);
+	if (pp == NULL)
+		return (NULL);
+	vm_page_lock_queues();
+	vm_page_wire(pp);
+	vm_page_unhold(pp);
+	vm_page_unlock_queues();
+	sf = sf_buf_alloc(pp, SFB_NOWAIT);
+	if (sf == NULL) {
+		zbuf_page_free(pp);
+		return (NULL);
+	}
+	return (sf);
+}
+
+/*
+ * Create a zbuf describing a range of user address space memory.  Validate
+ * page alignment, size requirements, etc.
+ */
+static int
+zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len,
+    struct zbuf **zbp)
+{
+	struct zbuf *zb;
+	struct vm_map *map;
+	int error, i;
+
+	*zbp = NULL;
+
+	/* User address must be page-aligned. */
+	if (uaddr & PAGE_MASK)
+		return (EINVAL);
+
+	/* Length must be an integer number of full pages. */
+	if (len & PAGE_MASK)
+		return (EINVAL);
+
+	/* Length must not exceed per-buffer resource limit. */
+	if ((len / PAGE_SIZE) > BPF_MAX_PAGES)
+		return (EINVAL);
+
+	error = 0;
+	zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK);
+	zb->zb_uaddr = uaddr;
+	zb->zb_size = len;
+	zb->zb_numpages = len / PAGE_SIZE;
+	zb->zb_pages = malloc(sizeof(struct sf_buf *) *
+	    zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK);
+	map = &td->td_proc->p_vmspace->vm_map;
+	for (i = 0; i < zb->zb_numpages; i++) {
+		zb->zb_pages[i] = zbuf_sfbuf_get(map,
+		    uaddr + (i * PAGE_SIZE));
+		if (zb->zb_pages[i] == NULL) {
+			error = EFAULT;
+			goto error;
+		}
+	}
+	zb->zb_header =
+	    (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]);
+	bzero(zb->zb_header, sizeof(*zb->zb_header));
+	*zbp = zb;
+	return (0);
+
+error:
+	zbuf_free(zb);
+	return (error);
+}
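
zbuf_setup() therefore accepts only page-aligned regions that are a whole
number of pages long.  A user process can satisfy those checks with an
anonymous mmap(), which always returns page-aligned memory; this is a
sketch, with the page rounding made explicit:

	#include <sys/types.h>
	#include <sys/mman.h>
	#include <unistd.h>

	/* Sketch: allocate one buffer acceptable to BIOCSETZBUF. */
	static void *
	alloc_zbuf(size_t len)
	{
		size_t pagesz = (size_t)getpagesize();

		/* Round the request up to a whole number of pages. */
		len = (len + pagesz - 1) & ~(pagesz - 1);
		return (mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_ANON, -1, 0));
	}
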
+
+/*
+ * Copy bytes from a source into the specified zbuf.  The caller is
+ * responsible for performing bounds checking, etc.
+ */
+void
+bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_int count, page, poffset;
+	u_char *src_bytes;
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_append_bytes: not in zbuf mode"));
+	KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf"));
+
+	src_bytes = (u_char *)src;
+	zb = (struct zbuf *)buf;
+
+	/*
+	 * Scatter-gather copy to user pages mapped into kernel address space
+	 * using sf_bufs: copy up to a page at a time.
+	 */
+	offset += sizeof(struct bpf_zbuf_header);
+	page = offset / PAGE_SIZE;
+	poffset = offset % PAGE_SIZE;
+	while (len > 0) {
+		KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:"
+		    " page overflow (%d p %d np)\n", page, zb->zb_numpages));
+
+		count = min(len, PAGE_SIZE - poffset);
+		bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) +
+		    poffset, count);
+
+		poffset += count;
+		if (poffset == PAGE_SIZE) {
+			poffset = 0;
+			page++;
+		}
+		KASSERT(poffset < PAGE_SIZE,
+		    ("bpf_zerocopy_append_bytes: page offset overflow (%d)",
+		    poffset));
+		len -= count;
+		src_bytes += count;
+	}
+}
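
The loop above turns a linear offset into a (page, in-page offset) pair and
clamps each copy at a page boundary.  As a worked example, with PAGE_SIZE ==
4096, a post-header offset of 5000 resolves to page 1 at offset 904, so the
first copy may cover at most 4096 - 904 == 3192 bytes before the loop
advances to page 2:

	page    = offset / PAGE_SIZE;		/* 5000 / 4096 == 1 */
	poffset = offset % PAGE_SIZE;		/* 5000 % 4096 == 904 */
	count   = min(len, PAGE_SIZE - poffset);	/* <= 3192 */
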
+
+/*
+ * Copy bytes from an mbuf chain to the specified zbuf: copying will be
+ * scatter-gather both from mbufs, which may be fragmented over memory, and
+ * to pages, which may not be contiguously mapped in kernel address space.
+ * As with bpf_zerocopy_append_bytes(), the caller is responsible for
+ * checking that this will not exceed the buffer limit.
+ */
+void
+bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_int count, moffset, page, poffset;
+	const struct mbuf *m;
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_append_mbuf not in zbuf mode"));
+	KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf"));
+
+	m = (struct mbuf *)src;
+	zb = (struct zbuf *)buf;
+
+	/*
+	 * Scatter-gather both from an mbuf chain and to a user page set
+	 * mapped into kernel address space using sf_bufs.  If we're lucky,
+	 * each mbuf requires one copy operation, but if page alignment and
+	 * mbuf alignment work out less well, we'll be doing two copies per
+	 * mbuf.
+	 */
+	offset += sizeof(struct bpf_zbuf_header);
+	page = offset / PAGE_SIZE;
+	poffset = offset % PAGE_SIZE;
+	moffset = 0;
+	while (len > 0) {
+		KASSERT(page < zb->zb_numpages,
+		    ("bpf_zerocopy_append_mbuf: page overflow (%d p %d "
+		    "np)\n", page, zb->zb_numpages));
+		KASSERT(m != NULL,
+		    ("bpf_zerocopy_append_mbuf: end of mbuf chain"));
+
+		count = min(m->m_len - moffset, len);
+		count = min(count, PAGE_SIZE - poffset);
+		bcopy(mtod(m, u_char *) + moffset,
+		    ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset,
+		    count);
+
+		poffset += count;
+		if (poffset == PAGE_SIZE) {
+			poffset = 0;
+			page++;
+		}
+		KASSERT(poffset < PAGE_SIZE,
+		    ("bpf_zerocopy_append_mbuf: page offset overflow (%d)",
+		    poffset));
+		moffset += count;
+		if (moffset == m->m_len) {
+			m = m->m_next;
+			moffset = 0;
+		}
+		len -= count;
+	}
+}
+
+/*
+ * Notification from the BPF framework that a buffer has moved into the held
+ * slot on a descriptor.  Zero-copy BPF will update the shared page to let
+ * the user process know.
+ *
+ * XXXRW: Do we need to use a memory barrier, atomic operation, or the like
+ * to make sure that the generation update is the last write to make it out
+ * after any packet data, so that user space sees the generation increase
+ * only at or after the last packet data change?
+ */
+void
+bpf_zerocopy_bufheld(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_bufheld: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_hbuf;
+	KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));
+	zb->zb_header->bzh_kernel_len = d->bd_hlen;
+	zb->zb_header->bzh_kernel_gen++;
+}
+
+/*
+ * Query from the BPF framework regarding whether the buffer currently in
+ * the held position can be moved to the free position, which the user
+ * process indicates by making its generation number equal to the kernel
+ * generation number.
+ *
+ * XXXRW: Memory ordering also an issue here?
+ */
+int
+bpf_zerocopy_buffree(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_buffree: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_hbuf;
+	if (zb == NULL)
+		return (0);
+	if (zb->zb_header->bzh_kernel_gen == zb->zb_header->bzh_user_gen)
+		return (1);
+	return (0);
+}
+
+/*
+ * Free zero-copy buffers at request of descriptor.
+ */
+void
+bpf_zerocopy_free(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_free: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_sbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+	zb = (struct zbuf *)d->bd_hbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+	zb = (struct zbuf *)d->bd_fbuf;
+	if (zb != NULL)
+		zbuf_free(zb);
+}
+
+/*
+ * For now, allow bpfread() to rotate the buffers, but don't perform a copy
+ * operation or return a value.  If we want to copy, we'll need to implement
+ * scatter-gather copying with a series of uiomove calls here.
+ */
+int
+bpf_zerocopy_uiomove(struct bpf_d *d, caddr_t buf, u_int len,
+    struct uio *uio)
+{
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_uiomove: not in zbuf mode"));
+
+	return (EOPNOTSUPP);
+}
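
The two XXXRW questions above are two halves of one ordering problem: the
kernel must make packet data visible before the bzh_kernel_gen update, and
the consumer must not read packet data until after it has observed the new
generation.  On the user side, a conservative check might look like this
sketch, where rmb() stands in for whatever read barrier the platform
provides (a hypothetical placeholder, not a portable interface):

	static int
	zbuf_complete(struct bpf_zbuf_header *bzh)
	{
		if (bzh->bzh_kernel_gen == bzh->bzh_user_gen)
			return (0);	/* Still owned by the kernel. */
		rmb();			/* Order data reads after this. */
		return (1);
	}
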
+
+/*
+ * Acknowledge reading the buffer without performing read().  We accept an
+ * argument primarily so that we can validate that user space has the right
+ * idea, helping to catch application bugs faster if the application's sense
+ * of buffer rotation differs from the kernel's (or for that matter, kernel
+ * bugs).
+ */
+int
+bpf_zerocopy_ioctl_ackzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_ackzbuf: not in zbuf mode"));
+
+	BPFD_LOCK(d);
+	if (d->bd_hbuf == NULL) {
+		BPFD_UNLOCK(d);
+		return (EINVAL);
+	}
+	zb = (struct zbuf *)d->bd_hbuf;
+	if (bz->bz_bufa != (void *)zb->zb_uaddr) {
+		BPFD_UNLOCK(d);
+		return (EINVAL);
+	}
+	zb->zb_header->bzh_user_gen = zb->zb_header->bzh_kernel_gen;
+	d->bd_fbuf = d->bd_hbuf;
+	d->bd_hbuf = NULL;
+	d->bd_hlen = 0;
+	BPFD_UNLOCK(d);
+	return (0);
+}
+
+/*
+ * Ioctl to retrieve zbuf settings.  Note that the user address pointers are
+ * copied versions of those originally submitted via the setzbuf ioctl -- if
+ * user space has remapped the buffers, then they may be inconsistent.  User
+ * applications must be aware that these are in effect buffer names, not
+ * pointers, if they play such games with their address space.  Pointers are
+ * returned in arbitrary order, which may vary by ioctl.
+ */
+int
+bpf_zerocopy_ioctl_getzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_getzbuf: not in zbuf mode"));
+
+	bzero(bz, sizeof(*bz));
+	BPFD_LOCK(d);
+	if (d->bd_hbuf != NULL) {
+		zb = (struct zbuf *)d->bd_hbuf;
+		bz->bz_bufa = (void *)zb->zb_uaddr;
+		bz->bz_buflen = zb->zb_size;
+		zb = (struct zbuf *)d->bd_sbuf;
+		bz->bz_bufb = (void *)zb->zb_uaddr;
+	} else if (d->bd_sbuf != NULL) {
+		zb = (struct zbuf *)d->bd_sbuf;
+		bz->bz_bufa = (void *)zb->zb_uaddr;
+		bz->bz_buflen = zb->zb_size;
+		zb = (struct zbuf *)d->bd_fbuf;
+		bz->bz_bufb = (void *)zb->zb_uaddr;
+	} else {
+		bz->bz_bufa = bz->bz_bufb = NULL;
+		bz->bz_buflen = 0;
+	}
+	BPFD_UNLOCK(d);
+	return (0);
+}
+
+/*
+ * Ioctl to return the maximum buffer size.
+ */
+int
+bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
+{
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode"));
+
+	*i = BPF_MAX_PAGES * PAGE_SIZE;
+	return (0);
+}
+
+/*
+ * Ioctl to return the next completed buffer to read, if any.  In immediate
+ * mode, this may force a buffer rotation if there is stored data but no
+ * held data, in similar style to calling bpfread() on an immediate mode
+ * descriptor.
+ */
+int
+bpf_zerocopy_ioctl_getznext(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_getznext: not in zbuf mode"));
+
+	/*
+	 * If we are in immediate mode, have no held buffer, but do have
+	 * stored packet data, rotate so that the stored buffer becomes the
+	 * held buffer.
+	 */
+	BPFD_LOCK(d);
+	if (d->bd_immediate && d->bd_hbuf == NULL &&
+	    d->bd_slen != 0) {
+		ROTATE_BUFFERS(d);
+	}
+	bzero(bz, sizeof(*bz));
+	if (d->bd_hbuf != NULL) {
+		zb = (struct zbuf *)d->bd_hbuf;
+		bz->bz_bufa = (void *)zb->zb_uaddr;
+		bz->bz_buflen = d->bd_hlen;
+	}
+	BPFD_UNLOCK(d);
+	return (0);
+}
+
+/*
+ * Ioctl to force a rotation of the two buffers so that any packet data in
+ * the store buffer becomes readable, if no buffer is currently held.
+ */
+int
+bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *bzh;
+
+	bzero(bz, sizeof(*bz));
+	BPFD_LOCK(d);
+	if (d->bd_hbuf == NULL && d->bd_slen != 0) {
+		ROTATE_BUFFERS(d);
+		bzh = (struct zbuf *)d->bd_hbuf;
+		bz->bz_bufa = (void *)bzh->zb_uaddr;
+		bz->bz_buflen = d->bd_hlen;
+	}
+	BPFD_UNLOCK(d);
+	return (0);
+}
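
BIOCROTZBUF gives a consumer a way to drain a partially filled store buffer,
for example after a read timeout.  A sketch of the corresponding user-side
call, relying only on the semantics implemented above (bz_bufa is left NULL
when there was nothing to rotate):

	#include <sys/ioctl.h>
	#include <net/bpf.h>

	static int
	force_rotate(int fd, struct bpf_zbuf *bz)
	{
		if (ioctl(fd, BIOCROTZBUF, bz) < 0)
			return (-1);		/* Error. */
		return (bz->bz_bufa != NULL);	/* 1 if a buffer completed. */
	}
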
+
+/*
+ * Ioctl to configure zero-copy buffers -- may be done only once.
+ */
+int
+bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
+    struct bpf_zbuf *bz)
+{
+	struct zbuf *zba, *zbb;
+	int error;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode"));
+
+	/*
+	 * Must set both buffers.  Cannot clear them.
+	 */
+	if (bz->bz_bufa == NULL || bz->bz_bufb == NULL)
+		return (EINVAL);
+
+	/*
+	 * Buffers must have a size greater than 0.  Alignment and other size
+	 * validity checking is done in zbuf_setup().
+	 */
+	if (bz->bz_buflen == 0)
+		return (EINVAL);
+
+	/*
+	 * As a simplifying assumption, we allow buffers to be designated
+	 * only once per descriptor.  Checked up front to save some trouble,
+	 * as we can more easily return EINVAL here; if the system is low on
+	 * sf_bufs, then it will be ENOMEM later.
+	 *
+	 * Note: lockless read.
+	 */
+	if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
+	    d->bd_bif != NULL)
+		return (EINVAL);
+
+	/*
+	 * Allocate the two new buffers.
+	 */
+	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen,
+	    &zba);
+	if (error)
+		return (error);
+	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen,
+	    &zbb);
+	if (error) {
+		zbuf_free(zba);
+		return (error);
+	}
+
+	/*
+	 * Perform atomic check-and-exchange.
+	 */
+	BPFD_LOCK(d);
+	if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
+	    d->bd_bif != NULL) {
+		BPFD_UNLOCK(d);
+		zbuf_free(zba);
+		zbuf_free(zbb);
+		return (EINVAL);
+	}
+	d->bd_fbuf = (caddr_t)zbb;
+	d->bd_sbuf = (caddr_t)zba;
+	d->bd_slen = 0;
+	d->bd_hlen = 0;
+
+	/*
+	 * We expose only the space left in the buffer after the size of the
+	 * shared management region.
+	 */
+	d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header);
+	BPFD_UNLOCK(d);
+	return (0);
+}
Index: sys/net/bpf_zerocopy.h
===================================================================
RCS file: sys/net/bpf_zerocopy.h
diff -N sys/net/bpf_zerocopy.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/net/bpf_zerocopy.h	5 Nov 2007 18:53:26 -0000
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_BPF_ZEROCOPY_H_
+#define _NET_BPF_ZEROCOPY_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void	bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+	    void *src, u_int len);
+void	bpf_zerocopy_bufheld(struct bpf_d *);
+int	bpf_zerocopy_buffree(struct bpf_d *);
+void	bpf_zerocopy_free(struct bpf_d *d);
+int	bpf_zerocopy_ioctl_ackzbuf(struct thread *td, struct bpf_d *d,
+	    struct bpf_zbuf *bz);
+int	bpf_zerocopy_ioctl_getzbuf(struct thread *td, struct bpf_d *d,
+	    struct bpf_zbuf *bz);
+int	bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d,
+	    size_t *i);
+int	bpf_zerocopy_ioctl_getznext(struct thread *td, struct bpf_d *d,
+	    struct bpf_zbuf *bz);
+int	bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
+	    struct bpf_zbuf *bz);
+int	bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
+	    struct bpf_zbuf *bz);
+int	bpf_zerocopy_uiomove(struct bpf_d *d, caddr_t buf, u_int len,
+	    struct uio *uio);
+
+#endif /* !_NET_BPF_ZEROCOPY_H_ */
Index: sys/net/bpfdesc.h
===================================================================
RCS file: /home/ncvs/src/sys/net/bpfdesc.h,v
retrieving revision 1.38
diff -u -r1.38 bpfdesc.h
--- sys/net/bpfdesc.h	6 Aug 2007 14:26:00 -0000	1.38
+++ sys/net/bpfdesc.h	5 Nov 2007 18:40:34 -0000
@@ -48,10 +48,11 @@
 /*
  * Descriptor associated with each open bpf file.
  */
+struct zbuf;
 struct bpf_d {
 	LIST_ENTRY(bpf_d) bd_next;	/* Linked list of descriptors */
 	/*
-	 * Buffer slots: two malloc buffers store the incoming packets.
+	 * Buffer slots: two memory clusters buffer the incoming packets.
 	 * The model has three slots.  Sbuf is always occupied.
 	 * sbuf (store) - Receive interrupt puts packets here.
 	 * hbuf (hold) - When sbuf is full, put buffer here and
@@ -93,6 +94,11 @@
 	u_long		bd_fcount;	/* number of packets which matched filter */
 	pid_t		bd_pid;		/* PID which created descriptor */
 	int		bd_locked;	/* true if descriptor is locked */
+	u_int		bd_bufmode;	/* Current buffer mode. */
+	u_long		bd_wcount;	/* number of packets written */
+	u_long		bd_wfcount;	/* number of packets that matched write filter */
+	u_long		bd_wdcount;	/* number of packets dropped during a write */
+	u_long		bd_zcopy;	/* number of zero copy operations */
 };
 
 /* Values for bd_state */
@@ -104,12 +110,6 @@
 #define BPFD_UNLOCK(bd)		mtx_unlock(&(bd)->bd_mtx)
 #define BPFD_LOCK_ASSERT(bd)	mtx_assert(&(bd)->bd_mtx, MA_OWNED);
 
-/* Test whether a BPF is ready for read(). */
-#define	bpf_ready(bd)						\
-	((bd)->bd_hlen != 0 ||					\
-	 (((bd)->bd_immediate || (bd)->bd_state == BPF_TIMED_OUT) && \
-	  (bd)->bd_slen != 0))
-
 /*
  * External representation of the bpf descriptor
  */
@@ -130,6 +130,10 @@
 	pid_t		bd_pid;
 	char		bd_ifname[IFNAMSIZ];
 	int		bd_locked;
+	u_long		bd_wcount;
+	u_long		bd_wfcount;
+	u_long		bd_wdcount;
+	u_long		bd_zcopy;
 };
 
 #define BPFIF_LOCK(bif)		mtx_lock(&(bif)->bif_mtx)
Index: usr.bin/netstat/Makefile
===================================================================
RCS file: /home/ncvs/src/usr.bin/netstat/Makefile,v
retrieving revision 1.39
diff -u -r1.39 Makefile
--- usr.bin/netstat/Makefile	1 Jul 2007 12:08:07 -0000	1.39
+++ usr.bin/netstat/Makefile	5 Nov 2007 18:40:34 -0000
@@ -9,6 +9,7 @@
 
 WARNS?=	3
 
+CFLAGS+=-I../../sys
 CFLAGS+=-DIPSEC
 CFLAGS+=-DSCTP
 
Index: usr.bin/netstat/bpf.c
===================================================================
RCS file: /home/ncvs/src/usr.bin/netstat/bpf.c,v
retrieving revision 1.9
diff -u -r1.9 bpf.c
--- usr.bin/netstat/bpf.c	16 Jul 2007 17:15:54 -0000	1.9
+++ usr.bin/netstat/bpf.c	5 Nov 2007 18:40:34 -0000
@@ -86,31 +86,83 @@
 	*flagbuf++ = '\0';
 }
 
-void
-bpf_stats(char *ifname)
+static int
+bpf_get_stats(size_t *sizep, struct xbpf_d **bdp)
 {
-	struct xbpf_d *d, *bd;
-	char *pname, flagbuf[12];
-	size_t size;
+	struct xbpf_d *bd;
+	size_t s;
 
-	if (sysctlbyname("net.bpf.stats", NULL, &size,
+	if (sysctlbyname("net.bpf.stats", NULL, &s,
 	    NULL, 0) < 0) {
 		warn("net.bpf.stats");
-		return;
+		return (-1);
 	}
-	if (size == 0)
-		return;
-	bd = malloc(size);
+	if (s == 0)
+		return (-1);
+	bd = malloc(s);
 	if (bd == NULL) {
 		warn("malloc failed");
-		return;
+		return (-1);
 	}
-	if (sysctlbyname("net.bpf.stats", bd, &size,
+	if (sysctlbyname("net.bpf.stats", bd, &s,
 	    NULL, 0) < 0) {
 		warn("net.bpf.stats");
 		free(bd);
+		return (-1);
+	}
+	*bdp = bd;
+	*sizep = s;
+	return (0);
+}
+
+void
+bpf_stats_extended(char *ifname)
+{
+	struct xbpf_d *d, *bd;
+	size_t size;
+	char *pname;
+
+	if (bpf_get_stats(&size, &bd) < 0)
 		return;
+	for (d = &bd[0]; d < &bd[size / sizeof(*d)]; d++) {
+		if (ifname && strcmp(ifname, d->bd_ifname) != 0)
+			continue;
+		pname = bpf_pidname(d->bd_pid);
+		(void) printf("%s: pid %d on %s:\n", pname, d->bd_pid,
+		    d->bd_ifname);
+		(void) printf(
+		    "\t%lu packets received\n"
+		    "\t%lu packets matched receive filter\n"
+		    "\t%lu packets dropped\n"
+		    "\t%d current hold buffer size\n"
+		    "\t%d current store buffer size\n"
+		    "\t%lu packets written\n"
+		    "\t%lu packets matched write filter\n"
+		    "\t%lu packet writes failed\n"
+		    "\t%lu zero copy operations\n",
+		    d->bd_rcount,
+		    d->bd_fcount,
+		    d->bd_dcount,
+		    d->bd_hlen,
+		    d->bd_slen,
+		    d->bd_wcount,
+		    d->bd_wfcount,
+		    d->bd_wdcount,
+		    d->bd_zcopy);
+		free(pname);
 	}
+	free(bd);
+}
+
+void
+bpf_stats(char *ifname)
+{
+	char *pname, flagbuf[12];
+	struct xbpf_d *d, *bd;
+	size_t size;
+
+	if (bpf_get_stats(&size, &bd) < 0)
+		return;
 	printf("%5s %6s %7s %9s %9s %9s %5s %5s %s\n",
 	    "Pid", "Netif", "Flags", "Recv", "Drop", "Match", "Sblen",
 	    "Hblen", "Command");
Index: usr.bin/netstat/main.c
===================================================================
RCS file: /home/ncvs/src/usr.bin/netstat/main.c,v
retrieving revision 1.87
diff -u -r1.87 main.c
--- usr.bin/netstat/main.c	16 Jul 2007 18:13:12 -0000	1.87
+++ usr.bin/netstat/main.c	5 Nov 2007 18:40:34 -0000
@@ -495,7 +495,10 @@
 	if (Bflag) {
 		if (!live)
 			usage();
-		bpf_stats(interface);
+		if (sflag)
+			bpf_stats_extended(interface);
+		else
+			bpf_stats(interface);
 		exit(0);
 	}
 	if (mflag) {
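
With the main.c change above, "netstat -B" keeps its existing one-line
summary per descriptor, while "netstat -B -s" selects the per-descriptor
counters printed by bpf_stats_extended().  Following the format strings in
bpf.c, output would take this shape (process and counter values here are
purely illustrative):

	tcpdump: pid 712 on em0:
		1024 packets received
		512 packets matched receive filter
		0 packets dropped
		...
		2 zero copy operations
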
Index: usr.bin/netstat/netstat.h
===================================================================
RCS file: /home/ncvs/src/usr.bin/netstat/netstat.h,v
retrieving revision 1.51
diff -u -r1.51 netstat.h
--- usr.bin/netstat/netstat.h	16 Jul 2007 17:15:55 -0000	1.51
+++ usr.bin/netstat/netstat.h	5 Nov 2007 18:40:34 -0000
@@ -161,3 +161,4 @@
 void	mroutepr(u_long, u_long);
 void	mrt_stats(u_long);
 void	bpf_stats(char *);
+void	bpf_stats_extended(char *);
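
Taken together, the new ioctls allow a capture loop with no read() data
copies.  An end-to-end sketch of the user-space side, with error handling
elided; the device path, interface name, and buffer size are arbitrary, and
the processing step stands in for the consume/acknowledge logic sketched
earlier:

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <net/if.h>
	#include <net/bpf.h>
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct bpf_zbuf bz;
		struct ifreq ifr;
		u_int mode = BPF_BUFMODE_ZBUF;
		size_t buflen = 32768;	/* Must be a page multiple. */
		int fd;

		fd = open("/dev/bpf0", O_RDWR);
		ioctl(fd, BIOCSETBUFMODE, &mode);

		/* Donate two anonymous, page-aligned buffers. */
		bz.bz_bufa = mmap(NULL, buflen, PROT_READ | PROT_WRITE,
		    MAP_ANON, -1, 0);
		bz.bz_bufb = mmap(NULL, buflen, PROT_READ | PROT_WRITE,
		    MAP_ANON, -1, 0);
		bz.bz_buflen = buflen;
		ioctl(fd, BIOCSETZBUF, &bz);

		strncpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
		ioctl(fd, BIOCSETIF, &ifr);

		for (;;) {
			/*
			 * Wait for a completed buffer (select() plus
			 * BIOCROTZBUF on timeout), process it, then set
			 * bzh_user_gen = bzh_kernel_gen to return it.
			 */
		}
	}
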