--- contrib/libpcap/pcap-bpf.c 2007/10/16 02:43:13 +++ contrib/libpcap/pcap-bpf.c 2008/02/02 18:50:46 @@ -30,6 +30,8 @@ #endif #include /* optionally get BSD define */ +#include +#include #include #include #include @@ -86,6 +88,10 @@ #endif /* _AIX */ +#ifdef BIOCSETBUFMODE +#include +#endif + #include #include #include @@ -139,6 +145,123 @@ return (0); } +#ifdef BIOCGETBUFMODE +/* + * Zero-copy BPF buffer routines to check for and acknowledge BPF data in + * shared memory buffers. + * + * pcap_next_zbuf_shm(): Check for a newly available shared memory buffer, + * and set up p->buffer and cc to reflect one if available. Notice that if + * there was no prior buffer, we select zbuf1 as this will be the first + * buffer filled for a fresh BPF session. + */ +static int +pcap_next_zbuf_shm(pcap_t *p, int *cc) +{ + struct bpf_zbuf_header *bzh; + + if (p->zbuffer == p->zbuf2 || p->zbuffer == NULL) { + bzh = (struct bpf_zbuf_header *)p->zbuf1; + if (bzh->bzh_user_gen != + atomic_load_acq_int(&bzh->bzh_kernel_gen)) { + p->bzh = bzh; + p->zbuffer = (u_char *)p->zbuf1; + p->buffer = p->zbuffer + sizeof(*bzh); + *cc = bzh->bzh_kernel_len; + return (1); + } + } else if (p->zbuffer == p->zbuf1) { + bzh = (struct bpf_zbuf_header *)p->zbuf2; + if (bzh->bzh_user_gen != + atomic_load_acq_int(&bzh->bzh_kernel_gen)) { + p->bzh = bzh; + p->zbuffer = (u_char *)p->zbuf2; + p->buffer = p->zbuffer + sizeof(*bzh); + *cc = bzh->bzh_kernel_len; + return (1); + } + } + return (0); +} + +/* + * pcap_next_zbuf() -- Similar to pcap_next_zbuf_shm(), except wait using + * select() for data or a timeout, and possibly force rotation of the buffer + * in the event we time out or are in immediate mode. Invoke the shared + * memory check before doing system calls in order to avoid doing avoidable + * work. 
+ */
+static int
+pcap_next_zbuf(pcap_t *p, int *cc)
+{
+	struct bpf_zbuf bz;
+	struct timeval tv;
+	fd_set r_set;
+	int data, r;
+
+	/*
+	 * Start out by seeing whether anything is waiting by checking the
+	 * next shared memory buffer for data.
+	 */
+	data = pcap_next_zbuf_shm(p, cc);
+	if (data)
+		return (data);
+
+	/*
+	 * No data in the buffer, so use select() to wait for data or the
+	 * next timeout; to_ms == 0 means wait indefinitely (NULL timeval).
+	 */
+	FD_ZERO(&r_set);
+	FD_SET(p->fd, &r_set);
+	if (p->to_ms != 0) {
+		tv.tv_sec = p->to_ms / 1000;
+		tv.tv_usec = (p->to_ms % 1000) * 1000;
+	}
+	r = select(p->fd + 1, &r_set, NULL, NULL, p->to_ms != 0 ? &tv : NULL);
+	if (r < 0 && errno == EINTR)
+		return (0);
+	else if (r < 0) {
+		(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+		    "select: %s", strerror(errno));
+		return (-1);
+	}
+
+	/*
+	 * Check again for data, which may exist now that we've either been
+	 * woken up as a result of data or timed out. Try the "there's data"
+	 * case first since it doesn't require a system call.
+	 */
+	data = pcap_next_zbuf_shm(p, cc);
+	if (data)
+		return (data);
+
+	/*
+	 * Try forcing a buffer rotation to dislodge timed out or immediate
+	 * data.
+	 */
+	if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+		(void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+		    "BIOCROTZBUF: %s", strerror(errno));
+		return (-1);
+	}
+	return (pcap_next_zbuf_shm(p, cc));
+}
+
+/*
+ * Notify kernel that we are done with the buffer. We don't reset zbuffer so
+ * that we know which buffer to use next time around.
+ */ +static int +pcap_ack_zbuf(pcap_t *p) +{ + + atomic_store_rel_int(&p->bzh->bzh_user_gen, p->bzh->bzh_kernel_gen); + p->bzh = NULL; + p->buffer = NULL; + return (0); +} +#endif + static int pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user) { @@ -147,6 +270,9 @@ register u_char *bp, *ep; u_char *datap; struct bpf_insn *fcode; +#ifdef BIOCSETBUFMODE + int i; +#endif #ifdef PCAP_FDDIPAD register int pad; #endif @@ -167,7 +293,27 @@ } cc = p->cc; if (p->cc == 0) { - cc = read(p->fd, (char *)p->buffer, p->bufsize); + /* + * When reading without zero-copy from a file descriptor, we + * use a single buffer and return a length of data in the + * buffer. With zero-copy, we update the p->buffer pointer + * to point at whatever underlying buffer contains the next + * data and update cc to reflect the data found in the + * buffer. + */ +#ifdef BIOCSETBUFMODE + if (p->zerocopy) { + if (p->buffer != NULL) + pcap_ack_zbuf(p); + i = pcap_next_zbuf(p, &cc); + if (i == 0) + goto again; + if (i < 0) + return (-1); + } else +#endif + cc = read(p->fd, (char *)p->buffer, p->bufsize); + if (cc < 0) { /* Don't choke when we get ptraced */ switch (errno) { @@ -609,6 +755,10 @@ struct bpf_insn total_insn; struct bpf_program total_prog; struct utsname osinfo; +#ifdef BIOCSETBUFMODE + struct bpf_zbuf bz; + u_int bufmode, zbufmax; +#endif #ifdef HAVE_DAG_API if (strstr(device, "dag")) { @@ -646,41 +796,92 @@ goto bad; } +#ifdef BIOCSETBUFMODE /* - * Try finding a good size for the buffer; 32768 may be too - * big, so keep cutting it in half until we find a size - * that works, or run out of sizes to try. If the default - * is larger, don't make it smaller. - * - * XXX - there should be a user-accessible hook to set the - * initial buffer size. + * If the BPF extension to set buffer mode is present, try setting + * the mode to zero-copy. If that fails, use regular buffering. If + * it succeeds but other setup fails, return an error to the user. 
*/ - if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768) - v = 32768; - for ( ; v != 0; v >>= 1) { - /* Ignore the return value - this is because the call fails - * on BPF systems that don't have kernel malloc. And if - * the call fails, it's no big deal, we just continue to - * use the standard buffer size. - */ - (void) ioctl(fd, BIOCSBLEN, (caddr_t)&v); - + bufmode = BPF_BUFMODE_ZBUF; + if (ioctl(fd, BIOCSETBUFMODE, (caddr_t)&bufmode) == 0) { + p->zerocopy = 1; + if (ioctl(fd, BIOCGETZMAX, (caddr_t)&zbufmax) < 0) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCGETZMAX: %s", + pcap_strerror(errno)); + goto bad; + } + p->zbufsize = 32768; + if (p->zbufsize > zbufmax) + p->zbufsize = zbufmax; + p->zbuf1 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE, + MAP_ANON, -1, 0); + p->zbuf2 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE, + MAP_ANON, -1, 0); + if (p->zbuf1 == MAP_FAILED || p->zbuf2 == MAP_FAILED) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "mmap: %s", + pcap_strerror(errno)); + goto bad; + } + bzero(&bz, sizeof(bz)); + bz.bz_bufa = p->zbuf1; + bz.bz_bufb = p->zbuf2; + bz.bz_buflen = p->zbufsize; + if (ioctl(fd, BIOCSETZBUF, (caddr_t)&bz) < 0) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETZBUF: %s", + pcap_strerror(errno)); + goto bad; + } (void)strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name)); - if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) >= 0) - break; /* that size worked; we're done */ - - if (errno != ENOBUFS) { + if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) < 0) { snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETIF: %s: %s", device, pcap_strerror(errno)); goto bad; } - } + + v = p->zbufsize - sizeof(struct bpf_zbuf_header); + } else { +#endif + + /* + * Try finding a good size for the buffer; 32768 may be too + * big, so keep cutting it in half until we find a size + * that works, or run out of sizes to try. If the default + * is larger, don't make it smaller. + * + * XXX - there should be a user-accessible hook to set the + * initial buffer size. 
+ */ + if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768) + v = 32768; + for ( ; v != 0; v >>= 1) { + /* Ignore the return value - this is because the call + * fails on BPF systems that don't have kernel + * malloc. And if the call fails, it's no big deal, + * we just continue to use the standard buffer size. + */ + (void) ioctl(fd, BIOCSBLEN, (caddr_t)&v); + + (void)strncpy(ifr.ifr_name, device, + sizeof(ifr.ifr_name)); + if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) >= 0) + break; /* that size worked; we're done */ + + if (errno != ENOBUFS) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "BIOCSETIF: %s: %s", + device, pcap_strerror(errno)); + goto bad; + } + } - if (v == 0) { - snprintf(ebuf, PCAP_ERRBUF_SIZE, - "BIOCSBLEN: %s: No buffer size worked", device); - goto bad; + if (v == 0) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "BIOCSBLEN: %s: No buffer size worked", device); + goto bad; + } +#ifdef BIOCSETBUFMODE } +#endif /* Get the data link layer type. */ if (ioctl(fd, BIOCGDLT, (caddr_t)&v) < 0) { @@ -855,7 +1056,8 @@ } #endif /* set timeout */ - if (to_ms != 0) { + p->to_ms = to_ms; + if (to_ms != 0 && !p->zerocopy) { /* * XXX - is this seconds/nanoseconds in AIX? * (Treating it as such doesn't fix the timeout @@ -870,6 +1072,9 @@ goto bad; } } +#ifdef BIOCSETBUFMODE + p->timeout = to_ms; +#endif #ifdef _AIX #ifdef BIOCIMMEDIATE @@ -942,16 +1147,22 @@ goto bad; } p->bufsize = v; - p->buffer = (u_char *)malloc(p->bufsize); - if (p->buffer == NULL) { - snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s", - pcap_strerror(errno)); - goto bad; +#ifdef BIOCSETBUFMODE + if (!p->zerocopy) { +#endif + p->buffer = (u_char *)malloc(p->bufsize); + if (p->buffer == NULL) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s", + pcap_strerror(errno)); + goto bad; + } +#ifdef _AIX + /* For some strange reason this seems to prevent the EFAULT + * problems we have experienced from AIX BPF. 
*/ + memset(p->buffer, 0x0, p->bufsize); +#endif +#ifdef BIOCSETBUFMODE } -#ifdef _AIX - /* For some strange reason this seems to prevent the EFAULT - * problems we have experienced from AIX BPF. */ - memset(p->buffer, 0x0, p->bufsize); #endif /* @@ -1036,7 +1247,22 @@ return (p); bad: + (void)close(fd); +#ifdef BIOCSETBUFMODE + /* + * In zero-copy mode, p->buffer is just a pointer into one of the two + * memory-mapped buffers, so no need to free it. + */ + if (p->zerocopy) { + if (p->zbuf1 != MAP_FAILED && p->zbuf1 != NULL) + munmap(p->zbuf1, p->zbufsize); + if (p->zbuf2 != MAP_FAILED && p->zbuf2 != NULL) + munmap(p->zbuf2, p->zbufsize); + } else +#endif + if (p->buffer != NULL) + free(p->buffer); if (p->dlt_list != NULL) free(p->dlt_list); free(p); --- contrib/libpcap/pcap-int.h 2007/10/16 02:43:13 +++ contrib/libpcap/pcap-int.h 2007/12/23 23:51:06 @@ -167,12 +167,38 @@ struct pcap_md md; /* - * Read buffer. + * Read buffer -- for file descriptor read buffer model. */ int bufsize; u_char *buffer; u_char *bp; int cc; + int to_ms; + + /* + * XXXRW: Exactly how to handle ifdefs, etc, is not something I've + * worked out yet. Presumably we need to add a configure check for + * zero-copy BPF. + * + * Zero-copy read buffer -- for zero-copy BPF. 'buffer' above will + * alternate between these two actual mmap'd buffers as required. + * As there is a header on the front side of the mmap'd buffer, only + * some of the buffer is exposed to libpcap as a whole via bufsize; + * zbufsize is the true size. zbuffer tracks the current zbuf + * associated with buffer so that it can be used to decide which the + * next buffer to read will be. + */ + u_char *zbuf1, *zbuf2, *zbuffer; + u_int zbufsize; + u_int timeout; + u_int zerocopy; + + /* + * If there's currently a buffer being actively processed, then it is + * referenced here; 'buffer' is also pointed at it, but offset by the + * size of the header. + */ + struct bpf_zbuf_header *bzh; /* * Place holder for pcap_next(). 
--- contrib/libpcap/pcap.c 2007/10/16 02:43:13 +++ contrib/libpcap/pcap.c 2008/01/30 21:51:29 @@ -44,6 +44,7 @@ #include #else /* WIN32 */ #include +#include #endif /* WIN32 */ #include @@ -738,6 +739,24 @@ void pcap_close_common(pcap_t *p) { +#ifdef BIOCSETBUFMODE + /* + * Check to see if this pcap instance was using the zerocopy buffer + * mode. If it was, delete the mappings. Note that p->buffer + * gets initialized to one of the mmaped regions in this case, so + * do not try and free it directly. + * + * If the regular buffer mode was selected, then it is safe to free + * this memory. + */ + if (p->zerocopy) { + if (p->zbuf1 != MAP_FAILED && p->zbuf1 != NULL) + munmap(p->zbuf1, p->zbufsize); + if (p->zbuf2 != MAP_FAILED && p->zbuf2 != NULL) + munmap(p->zbuf2, p->zbufsize); + p->buffer = NULL; + } else +#endif if (p->buffer != NULL) free(p->buffer); #if !defined(WIN32) && !defined(MSDOS) --- share/man/man4/bpf.4 2007/02/27 02:39:57 +++ share/man/man4/bpf.4 2008/03/09 18:46:30 @@ -1,3 +1,30 @@ +.\" Copyright (c) 2007 Seccuris Inc. +.\" All rights reserved. +.\" +.\" This sofware was developed by Robert N. M. Watson under contract to +.\" Seccuris Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" .\" Copyright (c) 1990 The Regents of the University of California. .\" All rights reserved. .\" @@ -61,19 +88,6 @@ all file descriptors listening on that interface apply their filter. Each descriptor that accepts the packet receives its own copy. .Pp -Reads from these files return the next group of packets -that have matched the filter. -To improve performance, the buffer passed to read must be -the same size as the buffers used internally by -.Nm . -This size is returned by the -.Dv BIOCGBLEN -ioctl (see below), and -can be set with -.Dv BIOCSBLEN . -Note that an individual packet larger than this size is necessarily -truncated. -.Pp The packet filter will support any link level protocol that has fixed length headers. Currently, only Ethernet, @@ -94,6 +108,165 @@ Currently, only writes to Ethernets and .Tn SLIP links are supported. +.Sh BUFFER MODES +.Nm +devices deliver packet data to the application via memory buffers provided by +the application. +The buffer mode is set using the +.Dv BIOCSETBUFMODE +ioctl, and read using the +.Dv BIOCGETBUFMODE +ioctl. +.Ss Buffered read mode +By default, +.Nm +devices operate in the +.Dv BPF_BUFMODE_BUFFER +mode, in which packet data is copied explicitly from the kernel to user +memory using the +.Xr read 2 +system call. +The user process will declare a fixed buffer size that will be used both for +sizing internal buffers and for all +.Xr read 2 +operations on the file. 
+This size is queried using the +.Dv BIOCGBLEN +ioctl, and is set using the +.Dv BIOCSBLEN +ioctl. +Note that an individual packet larger than the buffer size is necessarily +truncated. +.Ss Zero-copy buffer mode +.Nm +devices may also operate in the +.Dv BPF_BUFMODE_ZBUF +mode, in which packet data is written directly into two user memory buffers +by the kernel, avoiding both system call and copying overhead. +Buffers are of fixed (and equal) size, page-aligned, and an integer multiple of +the page size. +The maximum zero-copy buffer size is returned by the +.Dv BIOCGETZMAX +ioctl. +Note that an individual packet larger than the buffer size is necessarily +truncated. +.Pp +The user process registers two memory buffers using the +.Dv BIOCSETZBUF +ioctl, which accepts a +.Vt struct bpf_zbuf +pointer as an argument: +.Bd -literal +struct bpf_zbuf { + void *bz_bufa; + void *bz_bufb; + size_t bz_buflen; +}; +.Ed +.Pp +.Vt bz_bufa +is a pointer to the userspace address of the first buffer that will be +filled, and +.Vt bz_bufb +is a pointer to the second buffer. +.Nm +will then cycle between the two buffers as they fill and are acknowledged. +.Pp +Each buffer begins with a fixed-length header to hold synchronization and +data length information for the buffer: +.Bd -literal +struct bpf_zbuf_header { + volatile u_int bzh_kernel_gen; /* Kernel generation number. */ + volatile u_int bzh_kernel_len; /* Length of data in the buffer. */ + volatile u_int bzh_user_gen; /* User generation number. */ + /* ...padding for future use... */ +}; +.Ed +.Pp +The header structure of each buffer, including all padding, should be zeroed +before it is configured using +.Dv BIOCSETZBUF . +Remaining space in the buffer will be used by the kernel to store packet +data, laid out in the same format as with buffered read mode. 
+.Pp +The kernel and the user process follow a simple acknowledgement protocol via +the buffer header to synchronize access to the buffer: when the header +generation numbers, +.Vt bzh_kernel_gen +and +.Vt bzh_user_gen , +hold the same value, the kernel owns the buffer, and when they differ, +userspace owns the buffer. +.Pp +While the kernel owns the buffer, the contents are unstable and may change +asynchronously; while the user process owns the buffer, its contents are +stable and will not be changed until the buffer has been acknowledged. +.Pp +Initializing the buffer headers to all 0's before registering the buffer has +the effect of assigning initial ownership of both buffers to the kernel. +The kernel signals that a buffer has been assigned to userspace by modifying +.Vt bzh_kernel_gen , +and userspace acknowledges the buffer and returns it to the kernel by setting +the value of +.Vt bzh_user_gen +to the value of +.Vt bzh_kernel_gen . +.Pp +In order to avoid caching and memory re-ordering effects, the user process +must use atomic operations and memory barriers when checking for and +acknowledging buffers: +.Bd -literal +#include + +/* + * Return ownership of a buffer to the kernel for reuse. + */ +static void +buffer_acknowledge(struct bpf_zbuf_header *bzh) +{ + + atomic_store_rel_int(&bzh->bzh_user_gen, bzh->bzh_kernel_gen); +} + +/* + * Check whether a buffer has been assigned to userspace by the kernel. + * Return true if userspace owns the buffer, and false otherwise. + */ +static int +buffer_check(struct bpf_zbuf_header *bzh) +{ + + return (bzh->bzh_user_gen != + atomic_load_acq_int(&bzh->bzh_kernel_gen)); +} +.Ed +.Pp +The user process may force the assignment of the next buffer, if any data +is pending, to userspace using the +.Dv BIOCROTZBUF +ioctl. 
+This allows the user process to retrieve data in a partially filled buffer +before the buffer is full, such as following a timeout; the process must +recheck for buffer ownership using the header generation numbers, as the +buffer will not be assigned to userspace if no data was present. +.Pp +As in the buffered read mode, +.Xr kqueue 2 , +.Xr poll 2 , +and +.Xr select 2 +may be used to sleep awaiting the availability of a completed buffer. +They will return a readable file descriptor when ownership of the next buffer +is assigned to user space. +.Pp +In the current implementation, the kernel will assign ownership of at most +one buffer at a time to the user process. +The user process must acknowledge the current buffer in order to be +notified that the next buffer is ready for processing. +Programs should not rely on this as an invariant, as it may change in future +versions; in particular, they must maintain their own notion of which buffer +is "next" so that if both buffers are owned by userspace, they can process them +in the correct order. .Sh IOCTLS The .Xr ioctl 2 @@ -127,7 +300,7 @@ The (third) argument to .Xr ioctl 2 should be a pointer to the type indicated. -.Bl -tag -width BIOCGRTIMEOUT +.Bl -tag -width BIOCGETBUFMODE .It Dv BIOCGBLEN .Pq Li u_int Returns the required buffer length for reads on @@ -349,10 +522,55 @@ This prevents the execution of ioctl commands which could change the underlying operating parameters of the device. +.It Dv BIOCGETBUFMODE +.It Dv BIOCSETBUFMODE +.Pq Li u_int +Get or set the current +.Nm +buffering mode; possible values are +.Dv BPF_BUFMODE_BUFFER , +buffered read mode, and +.Dv BPF_BUFMODE_ZBUF , +zero-copy buffer mode. +.It Dv BIOCSETZBUF +.Pq Li struct bpf_zbuf +Set the current zero-copy buffer locations; buffer locations may be +set only once zero-copy buffer mode has been selected, and prior to attaching +to an interface. +Buffers must be of identical size, page-aligned, and an integer multiple of +pages in size. 
+The three fields +.Vt bz_bufa , +.Vt bz_bufb , +and +.Vt bz_buflen +must be filled out. +If buffers have already been set for this device, the ioctl will fail. +.It Dv BIOCGETZMAX +.Pq Li size_t +Get the largest individual zero-copy buffer size allowed. +As two buffers are used in zero-copy buffer mode, the limit (in practice) is +twice the returned size. +As zero-copy buffers consume kernel address space, conservative selection of +buffer size is suggested, especially when there are multiple +.Nm +descriptors in use on 32-bit systems. +.It Dv BIOCROTZBUF +Force ownership of the next buffer to be assigned to userspace, if any data is +present in the buffer. +If no data is present, the buffer will remain owned by the kernel. +This allows consumers of zero-copy buffering to implement timeouts and +retrieve partially filled buffers. +In order to handle the case where no data is present in the buffer and +therefore ownership is not assigned, the user process must check +.Vt bzh_kernel_gen +against +.Vt bzh_user_gen . .El .Sh BPF HEADER The following structure is prepended to each packet returned by -.Xr read 2 : +.Xr read 2 +or via a zero-copy buffer: .Bd -literal struct bpf_hdr { struct timeval bh_tstamp; /* time stamp */ @@ -718,6 +936,9 @@ .Sh SEE ALSO .Xr tcpdump 1 , .Xr ioctl 2 , +.Xr kqueue 2 , +.Xr poll 2 , +.Xr select 2 , .Xr byteorder 3 , .Xr ng_bpf 4 , .Xr bpf 9 @@ -750,6 +971,10 @@ Summer 1990. Much of the design is due to .An Van Jacobson . +.Pp +Support for zero-copy buffers was added by +.An Robert N. M. Watson +under contract to Seccuris Inc. 
.Sh BUGS The read buffer must be of a fixed size (returned by the .Dv BIOCGBLEN --- sys/conf/files 2008/03/06 08:10:14 +++ sys/conf/files 2008/03/09 18:17:26 @@ -1632,8 +1632,10 @@ libkern/strtouq.c standard libkern/strvalid.c standard net/bpf.c standard +net/bpf_buffer.c optional bpf net/bpf_jitter.c optional bpf_jitter net/bpf_filter.c optional bpf | netgraph_bpf +net/bpf_zerocopy.c optional bpf net/bridgestp.c optional bridge | if_bridge net/bsd_comp.c optional ppp_bsdcomp net/ieee8023ad_lacp.c optional lagg --- sys/net/bpf.c 2008/02/02 20:35:48 +++ sys/net/bpf.c 2008/03/09 22:19:22 @@ -66,9 +66,11 @@ #include #include +#include #ifdef BPF_JITTER #include #endif +#include #include #include @@ -80,7 +82,7 @@ #include -static MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); +MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); #if defined(DEV_BPF) || defined(NETGRAPH_BPF) @@ -98,19 +100,17 @@ static struct mtx bpf_mtx; /* bpf global lock */ static int bpf_bpfd_cnt; -static void bpf_allocbufs(struct bpf_d *); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_freed(struct bpf_d *); -static void bpf_mcopy(const void *, void *, size_t); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_insn *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); -static void catchpacket(struct bpf_d *, u_char *, u_int, - u_int, void (*)(const void *, void *, size_t), +static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, + void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct timeval *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); @@ -132,6 +132,9 @@ static int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); +static int 
bpf_zerocopy_enable = 0; +SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, + &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); @@ -158,7 +161,147 @@ static struct filterops bpfread_filtops = { 1, NULL, filt_bpfdetach, filt_bpfread }; +/* + * Wrapper functions for various buffering methods. If the set of buffer + * modes expands, we will probably want to introduce a switch data structure + * similar to protosw, et. + */ +static void +bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_append_bytes(d, buf, offset, src, len)); + + case BPF_BUFMODE_ZBUF: + d->bd_zcopy++; + return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); + + default: + panic("bpf_buf_append_bytes"); + } +} + +static void +bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); + + case BPF_BUFMODE_ZBUF: + d->bd_zcopy++; + return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); + + default: + panic("bpf_buf_append_mbuf"); + } +} + +/* + * If the buffer mechanism has a way to decide that a held buffer can be made + * free, then it is exposed via the bpf_canfreebuf() interface. (1) is + * returned if the buffer can be discarded, (0) is returned if it cannot. 
+ */ static int +bpf_canfreebuf(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_canfreebuf(d)); + } + return (0); +} + +void +bpf_bufheld(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_ZBUF: + bpf_zerocopy_bufheld(d); + break; + } +} + +static void +bpf_free(struct bpf_d *d) +{ + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_free(d)); + + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_free(d)); + + default: + panic("bpf_buf_free"); + } +} + +static int +bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) +{ + + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) + return (EOPNOTSUPP); + return (bpf_buffer_uiomove(d, buf, len, uio)); +} + +static int +bpf_ioctl_sblen(struct bpf_d *d, u_int *i) +{ + + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) + return (EOPNOTSUPP); + return (bpf_buffer_ioctl_sblen(d, i)); +} + +static int +bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_getzmax(td, d, i)); +} + +static int +bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); +} + +static int +bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); +} + +/* + * General BPF functions. 
+ */ +static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) { @@ -412,7 +555,14 @@ "bpf%d", dev2unit(dev)); MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO); dev->si_drv1 = d; - d->bd_bufsize = bpf_bufsize; + + /* + * For historical reasons, perform a one-time initialization call to + * the buffer routines, even though we're not yet committed to a + * particular buffer method. + */ + bpf_buffer_init(d); + d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; d->bd_pid = td->td_proc->p_pid; @@ -459,18 +609,6 @@ return (0); } - -/* - * Rotate the packet buffers in descriptor d. Move the store buffer - * into the hold slot, and the free buffer into the store slot. - * Zero the length of the new store buffer. - */ -#define ROTATE_BUFFERS(d) \ - (d)->bd_hbuf = (d)->bd_sbuf; \ - (d)->bd_hlen = (d)->bd_slen; \ - (d)->bd_sbuf = (d)->bd_fbuf; \ - (d)->bd_slen = 0; \ - (d)->bd_fbuf = NULL; /* * bpfread - read next chunk of packets from buffers */ @@ -490,6 +628,10 @@ BPFD_LOCK(d); d->bd_pid = curthread->td_proc->p_pid; + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { + BPFD_UNLOCK(d); + return (EOPNOTSUPP); + } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); @@ -567,7 +709,7 @@ * issues a read on the same fd at the same time? Don't want this * getting invalidated. 
*/ - error = uiomove(d->bd_hbuf, d->bd_hlen, uio); + error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); d->bd_fbuf = d->bd_hbuf; @@ -613,6 +755,20 @@ } static int +bpf_ready(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + if (!bpf_canfreebuf(d) && d->bd_hlen != 0) + return (1); + if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && + d->bd_slen != 0) + return (1); + return (0); +} + +static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d = dev->si_drv1; @@ -622,25 +778,34 @@ int error, hlen; d->bd_pid = curthread->td_proc->p_pid; - if (d->bd_bif == NULL) + d->bd_wcount++; + if (d->bd_bif == NULL) { + d->bd_wdcount++; return (ENXIO); + } ifp = d->bd_bif->bif_ifp; - if ((ifp->if_flags & IFF_UP) == 0) + if ((ifp->if_flags & IFF_UP) == 0) { + d->bd_wdcount++; return (ENETDOWN); + } - if (uio->uio_resid == 0) + if (uio->uio_resid == 0) { + d->bd_wdcount++; return (0); + } bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, &m, &dst, &hlen, d->bd_wfilter); - if (error) + if (error) { + d->bd_wdcount++; return (error); - + } + d->bd_wfcount++; if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; @@ -667,6 +832,8 @@ #endif error = (*ifp->if_output)(ifp, m, &dst, NULL); + if (error) + d->bd_wdcount++; if (mc != NULL) { if (error == 0) @@ -697,6 +864,10 @@ d->bd_rcount = 0; d->bd_dcount = 0; d->bd_fcount = 0; + d->bd_wcount = 0; + d->bd_wfcount = 0; + d->bd_wdcount = 0; + d->bd_zcopy = 0; } /* @@ -721,6 +892,11 @@ * BIOCSDIRECTION Set packet direction flag * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. + * BIOCSETZBUF Set current zero-copy buffer locations. + * BIOCGETZMAX Get maximum zero-copy buffer size. + * BIOCROTZBUF Force rotation of zero-copy buffer + * BIOCSETBUFMODE Set buffer mode. + * BIOCGETBUFMODE Get current buffer mode. 
*/ /* ARGSUSED */ static int @@ -758,6 +934,7 @@ case BIOCSRTIMEOUT: case BIOCIMMEDIATE: case TIOCGPGRP: + case BIOCROTZBUF: break; default: return (EPERM); @@ -810,17 +987,7 @@ * Set buffer length. */ case BIOCSBLEN: - if (d->bd_bif != NULL) - error = EINVAL; - else { - u_int size = *(u_int *)addr; - - if (size > bpf_maxbufsize) - *(u_int *)addr = size = bpf_maxbufsize; - else if (size < BPF_MINBUFSIZE) - *(u_int *)addr = size = BPF_MINBUFSIZE; - d->bd_bufsize = size; - } + error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* @@ -1055,6 +1222,50 @@ case BIOCGRSIG: *(u_int *)addr = d->bd_sig; break; + + case BIOCGETBUFMODE: + *(u_int *)addr = d->bd_bufmode; + break; + + case BIOCSETBUFMODE: + /* + * Allow the buffering mode to be changed as long as we + * haven't yet committed to a particular mode. Our + * definition of commitment, for now, is whether or not a + * buffer has been allocated or an interface attached, since + * that's the point where things get tricky. + */ + switch (*(u_int *)addr) { + case BPF_BUFMODE_BUFFER: + break; + + case BPF_BUFMODE_ZBUF: + if (bpf_zerocopy_enable) + break; + /* FALLSTHROUGH */ + + default: + return (EINVAL); + } + + BPFD_LOCK(d); + if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || + d->bd_fbuf != NULL || d->bd_bif != NULL) { + BPFD_UNLOCK(d); + return (EBUSY); + } + d->bd_bufmode = *(u_int *)addr; + BPFD_UNLOCK(d); + break; + + case BIOCGETZMAX: + return (bpf_ioctl_getzmax(td, d, (size_t *)addr)); + + case BIOCSETZBUF: + return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr)); + + case BIOCROTZBUF: + return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr)); } return (error); } @@ -1155,13 +1366,31 @@ return (ENXIO); bp = theywant->if_bpf; + /* - * Allocate the packet buffers if we need to. - * If we're already attached to requested interface, - * just flush the buffer. + * Behavior here depends on the buffering model. If we're using + * kernel memory buffers, then we can allocate them here. 
If we're + * using zero-copy, then the user process must have registered + * buffers by the time we get here. If not, return an error. + * + * XXXRW: There are locking issues here with multi-threaded use: what + * if two threads try to set the interface at once? */ - if (d->bd_sbuf == NULL) - bpf_allocbufs(d); + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + if (d->bd_sbuf == NULL) + bpf_buffer_alloc(d); + KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL")); + break; + + case BPF_BUFMODE_ZBUF: + if (d->bd_sbuf == NULL) + return (EINVAL); + break; + + default: + panic("bpf_setif: bufmode %d", d->bd_bufmode); + } if (bp != d->bd_bif) { if (d->bd_bif) /* @@ -1305,37 +1534,14 @@ #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif - catchpacket(d, pkt, pktlen, slen, bcopy, &tv); + catchpacket(d, pkt, pktlen, slen, + bpf_append_bytes, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } -/* - * Copy data from an mbuf chain into a buffer. This code is derived - * from m_copydata in sys/uipc_mbuf.c. 
- */ -static void -bpf_mcopy(const void *src_arg, void *dst_arg, size_t len) -{ - const struct mbuf *m; - u_int count; - u_char *dst; - - m = src_arg; - dst = dst_arg; - while (len > 0) { - if (m == NULL) - panic("bpf_mcopy"); - count = min(m->m_len, len); - bcopy(mtod(m, void *), dst, count); - m = m->m_next; - dst += count; - len -= count; - } -} - #define BPF_CHECK_DIRECTION(d, m) \ if (((d)->bd_direction == BPF_D_IN && (m)->m_pkthdr.rcvif == NULL) || \ ((d)->bd_direction == BPF_D_OUT && (m)->m_pkthdr.rcvif != NULL)) @@ -1385,7 +1591,7 @@ if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, - bpf_mcopy, &tv); + bpf_append_mbuf, &tv); } BPFD_UNLOCK(d); } @@ -1440,7 +1646,7 @@ if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, - bpf_mcopy, &tv); + bpf_append_mbuf, &tv); } BPFD_UNLOCK(d); } @@ -1453,19 +1659,34 @@ * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while - * bpf_mcopy is passed in to copy mbuf chains. In the latter case, + * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, - void (*cpfn)(const void *, void *, size_t), struct timeval *tv) + void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), + struct timeval *tv) { - struct bpf_hdr *hp; + struct bpf_hdr hdr; int totlen, curlen; int hdrlen = d->bd_bif->bif_hdrlen; int do_wakeup = 0; BPFD_LOCK_ASSERT(d); + + /* + * Detect whether user space has released a buffer back to us, and if + * so, move it from being a hold buffer to a free buffer. This may + * not be the best place to do it (for example, we might only want to + * run this check if we need the space), but for now it's a reliable + * spot to do it. 
+ */ + if (bpf_canfreebuf(d)) { + d->bd_fbuf = d->bd_hbuf; + d->bd_hbuf = NULL; + d->bd_hlen = 0; + } + /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that @@ -1500,23 +1721,27 @@ } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* - * Immediate mode is set, or the read timeout has - * already expired during a select call. A packet - * arrived, so the reader should be woken up. + * Immediate mode is set, or the read timeout has already + * expired during a select call. A packet arrived, so the + * reader should be woken up. */ do_wakeup = 1; /* - * Append the bpf header. + * Append the bpf header. Note we append the actual header size, but + * move forward the length of the header plus padding. */ - hp = (struct bpf_hdr *)(d->bd_sbuf + curlen); - hp->bh_tstamp = *tv; - hp->bh_datalen = pktlen; - hp->bh_hdrlen = hdrlen; + bzero(&hdr, sizeof(hdr)); + hdr.bh_tstamp = *tv; + hdr.bh_datalen = pktlen; + hdr.bh_hdrlen = hdrlen; + hdr.bh_caplen = totlen - hdrlen; + bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); + /* * Copy the packet data into the store buffer and update its length. */ - (*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen)); + (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen); d->bd_slen = curlen + totlen; if (do_wakeup) @@ -1524,41 +1749,19 @@ } /* - * Initialize all nonzero fields of a descriptor. - */ -static void -bpf_allocbufs(struct bpf_d *d) -{ - - KASSERT(d->bd_fbuf == NULL, ("bpf_allocbufs: bd_fbuf != NULL")); - KASSERT(d->bd_sbuf == NULL, ("bpf_allocbufs: bd_sbuf != NULL")); - KASSERT(d->bd_hbuf == NULL, ("bpf_allocbufs: bd_hbuf != NULL")); - - d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); - d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); - d->bd_slen = 0; - d->bd_hlen = 0; -} - -/* * Free buffers currently in use by a descriptor. * Called on close. 
*/ static void bpf_freed(struct bpf_d *d) { + /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ - if (d->bd_sbuf != NULL) { - free(d->bd_sbuf, M_BPF); - if (d->bd_hbuf != NULL) - free(d->bd_hbuf, M_BPF); - if (d->bd_fbuf != NULL) - free(d->bd_fbuf, M_BPF); - } + bpf_free(d); if (d->bd_rfilter) { free((caddr_t)d->bd_rfilter, M_BPF); #ifdef BPF_JITTER @@ -1779,6 +1982,10 @@ strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; + d->bd_wcount = bd->bd_wcount; + d->bd_wdcount = bd->bd_wdcount; + d->bd_wfcount = bd->bd_wfcount; + d->bd_zcopy = bd->bd_zcopy; } static int --- sys/net/bpf.h 2007/12/23 14:12:30 +++ sys/net/bpf.h 2008/03/09 19:58:30 @@ -92,6 +92,27 @@ #define BPF_MAJOR_VERSION 1 #define BPF_MINOR_VERSION 1 +/* + * Historically, BPF has supported a single buffering model, first using mbuf + * clusters in kernel, and later using malloc(9) buffers in kernel. We now + * support multiple buffering modes, which may be queried and set using + * BIOCGETBUFMODE and BIOCSETBUFMODE. So as to avoid handling the complexity + * of changing modes while sniffing packets, the mode becomes fixed once an + * interface has been attached to the BPF descriptor. + */ +#define BPF_BUFMODE_BUFFER 1 /* Kernel buffers with read(). */ +#define BPF_BUFMODE_ZBUF 2 /* Zero-copy buffers. */ + +/*- + * Struct used by BIOCSETZBUF, BIOCROTZBUF: describes up to two zero-copy + * buffers as used by BPF. + */ +struct bpf_zbuf { + void *bz_bufa; /* Location of 'a' zero-copy buffer. */ + void *bz_bufb; /* Location of 'b' zero-copy buffer. */ + size_t bz_buflen; /* Size of zero-copy buffers. 
*/ +}; + #define BIOCGBLEN _IOR('B',102, u_int) #define BIOCSBLEN _IOWR('B',102, u_int) #define BIOCSETF _IOW('B',103, struct bpf_program) @@ -116,6 +137,11 @@ #define BIOCLOCK _IO('B', 122) #define BIOCSETWF _IOW('B',123, struct bpf_program) #define BIOCFEEDBACK _IOW('B',124, u_int) +#define BIOCGETBUFMODE _IOR('B',125, u_int) +#define BIOCSETBUFMODE _IOW('B',126, u_int) +#define BIOCGETZMAX _IOR('B',127, size_t) +#define BIOCROTZBUF _IOR('B',128, struct bpf_zbuf) +#define BIOCSETZBUF _IOW('B',129, struct bpf_zbuf) /* Obsolete */ #define BIOCGSEESENT BIOCGDIRECTION @@ -149,6 +175,24 @@ #endif /* + * When using zero-copy BPF buffers, a shared memory header is present + * allowing the kernel BPF implementation and user process to synchronize + * without using system calls. This structure defines that header. When + * accessing these fields, appropriate atomic operations and memory barriers + * are required in order not to see stale or out-of-order data; see bpf(4) + * for reference code to access these fields from userspace. + * + * The layout of this structure is critical, and must not be changed; it must + * fit in a single page on all architectures. + */ +struct bpf_zbuf_header { + volatile u_int bzh_kernel_gen; /* Kernel generation number. */ + volatile u_int bzh_kernel_len; /* Length of data in the buffer. */ + volatile u_int bzh_user_gen; /* User generation number. */ + u_int _bzh_pad[5]; +}; + +/* * Data-link level type codes. */ #define DLT_NULL 0 /* BSD loopback encapsulation */ @@ -761,6 +805,27 @@ }; #ifdef _KERNEL +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_BPF); +#endif +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_bpf); +#endif + +/* + * Rotate the packet buffers in descriptor d. Move the store buffer into the + * hold slot, and the free buffer into the store slot. Zero the length of the + * new store buffer. Descriptor lock should be held. 
+ */ +#define ROTATE_BUFFERS(d) do { \ + (d)->bd_hbuf = (d)->bd_sbuf; \ + (d)->bd_hlen = (d)->bd_slen; \ + (d)->bd_sbuf = (d)->bd_fbuf; \ + (d)->bd_slen = 0; \ + (d)->bd_fbuf = NULL; \ + bpf_bufheld(d); \ +} while (0) + /* * Descriptor associated with each attached hardware interface. */ @@ -773,6 +838,7 @@ struct mtx bif_mtx; /* mutex for interface */ }; +void bpf_bufheld(struct bpf_d *d); int bpf_validate(const struct bpf_insn *, int); void bpf_tap(struct bpf_if *, u_char *, u_int); void bpf_mtap(struct bpf_if *, struct mbuf *); --- sys/net/bpfdesc.h 2007/08/06 14:28:11 +++ sys/net/bpfdesc.h 2008/03/09 20:31:29 @@ -48,10 +48,11 @@ /* * Descriptor associated with each open bpf file. */ +struct zbuf; struct bpf_d { LIST_ENTRY(bpf_d) bd_next; /* Linked list of descriptors */ /* - * Buffer slots: two malloc buffers store the incoming packets. + * Buffer slots: two memory buffers buffer the incoming packets. * The model has three slots. Sbuf is always occupied. * sbuf (store) - Receive interrupt puts packets here. * hbuf (hold) - When sbuf is full, put buffer here and @@ -93,6 +94,11 @@ u_long bd_fcount; /* number of packets which matched filter */ pid_t bd_pid; /* PID which created descriptor */ int bd_locked; /* true if descriptor is locked */ + u_int bd_bufmode; /* Current buffer mode. */ + u_long bd_wcount; /* number of packets written */ + u_long bd_wfcount; /* number of packets that matched write filter */ + u_long bd_wdcount; /* number of packets dropped during a write */ + u_long bd_zcopy; /* number of zero copy operations */ }; /* Values for bd_state */ @@ -104,12 +110,6 @@ #define BPFD_UNLOCK(bd) mtx_unlock(&(bd)->bd_mtx) #define BPFD_LOCK_ASSERT(bd) mtx_assert(&(bd)->bd_mtx, MA_OWNED); -/* Test whether a BPF is ready for read(). 
*/ -#define bpf_ready(bd) \ - ((bd)->bd_hlen != 0 || \ - (((bd)->bd_immediate || (bd)->bd_state == BPF_TIMED_OUT) && \ - (bd)->bd_slen != 0)) - /* * External representation of the bpf descriptor */ @@ -130,6 +130,10 @@ pid_t bd_pid; char bd_ifname[IFNAMSIZ]; int bd_locked; + u_long bd_wcount; + u_long bd_wfcount; + u_long bd_wdcount; + u_long bd_zcopy; }; #define BPFIF_LOCK(bif) mtx_lock(&(bif)->bif_mtx) --- usr.bin/netstat/bpf.c 2008/01/02 23:39:45 +++ usr.sbin/netstat/bpf.c 2008/01/06 22:38:48 @@ -88,31 +88,83 @@ *flagbuf++ = '\0'; } -void -bpf_stats(char *ifname) +static int +bpf_get_stats(int *size, struct xbpf_d **bdp) { - struct xbpf_d *d, *bd; - char *pname, flagbuf[12]; - size_t size; + struct xbpf_d *bd; + size_t s; - if (sysctlbyname("net.bpf.stats", NULL, &size, + if (sysctlbyname("net.bpf.stats", NULL, &s, NULL, 0) < 0) { warn("net.bpf.stats"); - return; + return (-1); } - if (size == 0) - return; - bd = malloc(size); + if (s == 0) + return (-1); + bd = malloc(s); if (bd == NULL) { warn("malloc failed"); - return; + return (-1); } - if (sysctlbyname("net.bpf.stats", bd, &size, + if (sysctlbyname("net.bpf.stats", bd, &s, NULL, 0) < 0) { warn("net.bpf.stats"); free(bd); + return (-1); + } + *bdp = bd; + *size = s; + return (0); +} + +void +bpf_stats_extended(char *ifname) +{ + struct xbpf_d *d, *bd; + int size; + char *pname; + + if (bpf_get_stats(&size, &bd) < 0) return; + for (d = &bd[0]; d < &bd[size / sizeof(*d)]; d++) { + if (ifname && strcmp(ifname, d->bd_ifname) != 0) + continue; + pname = bpf_pidname(d->bd_pid); + (void) printf("%s: pid %d on %s:\n", pname, d->bd_pid, + d->bd_ifname); + (void) printf( + "\t%lu packets received\n" + "\t%lu packets matched receive filter\n" + "\t%lu packets dropped\n" + "\t%d current hold buffer size\n" + "\t%d current store buffer size\n" + "\t%lu packets written\n" + "\t%lu packets matched write filter\n" + "\t%lu packet writes failed\n" + "\t%lu zero copy operations\n", + d->bd_rcount, + d->bd_fcount, + 
d->bd_dcount, + d->bd_hlen, + d->bd_slen, + d->bd_wcount, + d->bd_wfcount, + d->bd_wdcount, + d->bd_zcopy); + free(pname); } + free(bd); +} + +void +bpf_stats(char *ifname) +{ + char *pname, flagbuf[12]; + struct xbpf_d *d, *bd; + int size; + + if (bpf_get_stats(&size, &bd) < 0) + return; printf("%5s %6s %7s %9s %9s %9s %5s %5s %s\n", "Pid", "Netif", "Flags", "Recv", "Drop", "Match", "Sblen", "Hblen", "Command"); --- usr.bin/netstat/main.c 2008/01/02 23:39:45 +++ usr.sbin/netstat/main.c 2008/01/06 22:38:48 @@ -495,7 +495,10 @@ if (Bflag) { if (!live) usage(); - bpf_stats(interface); + if (sflag) + bpf_stats_extended(interface); + else + bpf_stats(interface); exit(0); } if (mflag) { --- usr.bin/netstat/netstat.h 2008/02/07 23:41:32 +++ usr.sbin/netstat/netstat.h 2008/03/09 18:17:26 @@ -161,3 +161,4 @@ void mroutepr(u_long, u_long); void mrt_stats(u_long); void bpf_stats(char *); +void bpf_stats_extended(char *);