diff -r f746d1cf54ed share/man/man4/siftr.4 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/share/man/man4/siftr.4 Sat Jul 03 18:21:30 2010 +1000 @@ -0,0 +1,752 @@ +.\" +.\" Copyright (c) 2010 The FreeBSD Foundation +.\" All rights reserved. +.\" +.\" Portions of this software were developed at the Centre for Advanced +.\" Internet Architectures, Swinburne University of Technology, Melbourne, +.\" Australia by Lawrence Stewart under sponsorship from the FreeBSD +.\" Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions, and the following disclaimer, +.\" without modification, immediately at the beginning of the file. +.\" 2. The name of the author may not be used to endorse or promote products +.\" derived from this software without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR +.\" ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd June 23, 2010 +.Dt SIFTR 4 +.Os +.Sh NAME +.Nm SIFTR +.Nd Statistical Information For TCP Research +.Sh SYNOPSIS +To load +.Ns Nm +as a module at run-time, run the following command as root: +.Bd -literal -offset indent +kldload siftr +.Ed +.Pp +Alternatively, to load +.Ns Nm +as a module at boot time, add the following line into the +.Xr loader.conf 5 +file: +.Bd -literal -offset indent +siftr_load="YES" +.Ed +.Sh DESCRIPTION +.Nm +.Ns ( Em S Ns tatistical +.Em I Ns nformation +.Em F Ns or +.Em T Ns CP +.Em R Ns esearch ) +is a kernel module that logs a range of statistics on active TCP connections to +a log file. +It provides the ability to make highly granular measurements of TCP connection +state, aimed at system administrators, developers and researchers. +.Ss Compile-time Configuration +The default operation of +.Nm +is to capture IPv4 TCP/IP packets. +.Nm +can be configured to support IPv4 and IPv6 by uncommenting: +.Bd -literal -offset indent +CFLAGS+=-DSIFTR_IPV6 +.Ed +.Pp +in +.Aq sys/modules/siftr/Makefile +and recompiling. +.Pp +In the IPv4-only (default) mode, standard dotted decimal notation (e.g. +"136.186.229.95") is used to format IPv4 addresses for logging. +In IPv6 mode, standard dotted decimal notation is used to format IPv4 addresses, +and standard colon-separated hex notation (see RFC 4291) is used to format IPv6 +addresses for logging. Note that SIFTR uses uncompressed notation to format IPv6 +addresses. +For example, the address "fe80::20f:feff:fea2:531b" would be logged as +"fe80:0:0:0:20f:feff:fea2:531b". +.Ss Run-time Configuration +.Nm +utilises the +.Xr sysctl 8 +interface to export its configuration variables to user-space. +The following variables are available: +.Bl -tag -offset indent +.It Va net.inet.siftr.enabled +controls whether the module performs its +measurements or not. +By default, the value is set to 0, which means the module +will not be taking any measurements. +Having the module loaded with +.Va net.inet.siftr.enabled +set to 0 will have no impact on the performance of the network stack, as the +packet filtering hooks are only inserted when +.Va net.inet.siftr.enabled +is set to 1. +.El +.Bl -tag -offset indent +.It Va net.inet.siftr.ppl +controls how many inbound/outbound packets for a given TCP connection will cause +a log message to be generated for the connection. +By default, the value is set to 1, which means the module will log a message for +every packet of every TCP connection. +The value can be set to any integer in the range [1,2^32], and can be changed at +any time, even while the module is enabled. +.El +.Bl -tag -offset indent +.It Va net.inet.siftr.logfile +controls the path to the file that the module writes its log messages to. +By default, the file /var/log/siftr.log is used. +The path can be changed at any time, even while the module is enabled. +.El +.Bl -tag -offset indent +.It Va net.inet.siftr.genhashes +controls whether a hash is generated for each TCP packet seen by +.Nm . +By default, the value is set to 0, which means no hashes are generated. +The hashes are useful to correlate which TCP packet triggered the generation of +a particular log message, but calculating them adds additional computational +overhead into the fast path. +.El +.Ss Log Format +A typical +.Nm +log file will contain 3 different types of log message. +All messages are written in plain ASCII text. +.Pp +Note: The +.Qq \e +present in the example log messages in this section indicates a +line continuation and is not part of the actual log message +.Pp +The first type of log message is written to the file when the module is +enabled and starts collecting data from the running kernel. The text below +shows an example module enable log. The fields are tab delimited key-value +pairs which describe some basic information about the system. +.Bd -literal -offset indent +enable_time_secs=1238556193 enable_time_usecs=462104 \\ +siftrver=1.2.2 hz=1000 tcp_rtt_scale=32 \\ +sysname=FreeBSD sysver=604000 ipmode=4 +.Ed +.Pp +Field descriptions are as follows: +.Bl -tag -offset indent +.It Va enable_time_secs +time at which the module was enabled, in seconds since the UNIX epoch. +.El +.Bl -tag -offset indent +.It Va enable_time_usecs +time at which the module was enabled, in microseconds since enable_time_secs. +.El +.Bl -tag -offset indent +.It Va siftrver +version of +.Nm . +.El +.Bl -tag -offset indent +.It Va hz +tick rate of the kernel in ticks per second. +.El +.Bl -tag -offset indent +.It Va tcp_rtt_scale +smoothed RTT estimate scaling factor +.El +.Bl -tag -offset indent +.It Va sysname +operating system name +.El +.Bl -tag -offset indent +.It Va sysver +operating system version +.El +.Bl -tag -offset indent +.It Va ipmode +IP mode as defined at compile time. +An ipmode of "4" means IPv6 is not supported and IP addresses are logged in +regular dotted quad format. +An ipmode of "6" means IPv6 is supported, and IP addresses are logged in dotted +quad or hex format, as described in the +.Qq Compile-time Configuration +subsection. +.El +.Pp +The second type of log message is written to the file when a data log message +is generated. +The text below shows an example data log triggered by an IPv4 +TCP/IP packet. +The data is CSV formatted. +.Bd -literal -offset indent +o,0xbec491a5,1238556193.463551,172.16.7.28,22,172.16.2.5,55931, \\ +1073725440,172312,6144,66560,66608,8,1,4,1448,936,1,996,255, \\ +33304,208,66608,0,208 +.Ed +.Pp +Field descriptions are as follows: +.Bl -tag -offset indent +.It Va 1 +Direction of packet that triggered the log message. +Either +.Qq i +for in, or +.Qq o +for out. +.El +.Bl -tag -offset indent +.It Va 2 +Hash of the packet that triggered the log message. +.El +.Bl -tag -offset indent +.It Va 3 +Time at which the packet that triggered the log message was processed by +the +.Xr pfil 9 +hook function, in seconds and microseconds since the UNIX epoch. +.El +.Bl -tag -offset indent +.It Va 4 +The IPv4 or IPv6 address of the local host, in dotted quad (IPv4 packet) +or colon-separated hex (IPv6 packet) notation. +.El +.Bl -tag -offset indent +.It Va 5 +The TCP port that the local host is communicating via. +.El +.Bl -tag -offset indent +.It Va 6 +The IPv4 or IPv6 address of the foreign host, in dotted quad (IPv4 packet) +or colon-separated hex (IPv6 packet) notation. +.El +.Bl -tag -offset indent +.It Va 7 +The TCP port that the foreign host is communicating via. +.El +.Bl -tag -offset indent +.It Va 8 +The slow start threshold for the flow, in bytes. +.El +.Bl -tag -offset indent +.It Va 9 +The current congestion window for the flow, in bytes. +.El +.Bl -tag -offset indent +.It Va 10 +The current bandwidth-controlled window for the flow, in bytes. +.El +.Bl -tag -offset indent +.It Va 11 +The current sending window for the flow, in bytes. +The post scaled value is reported, except during the initial handshake (first +few packets), during which time the unscaled value is reported. +.El +.Bl -tag -offset indent +.It Va 12 +The current receive window for the flow, in bytes. +The post scaled value is always reported. +.El +.Bl -tag -offset indent +.It Va 13 +The current window scaling factor for the sending window. +.El +.Bl -tag -offset indent +.It Va 14 +The current window scaling factor for the receiving window. +.El +.Bl -tag -offset indent +.It Va 15 +The current state of the TCP finite state machine, as defined +in +.Aq Pa netinet/tcp_fsm.h . +.El +.Bl -tag -offset indent +.It Va 16 +The maximum segment size for the flow, in bytes. +.El +.Bl -tag -offset indent +.It Va 17 +The current smoothed RTT estimate for the flow, in units of TCP_RTT_SCALE * HZ, +where TCP_RTT_SCALE is a define found in tcp_var.h, and HZ is the kernel's tick +timer. +Divide by TCP_RTT_SCALE * HZ to get the RTT in secs. TCP_RTT_SCALE and HZ are +reported in the enable log message. +.El +.Bl -tag -offset indent +.It Va 18 +SACK enabled indicator. 1 if SACK enabled, 0 otherwise. +.El +.Bl -tag -offset indent +.It Va 19 +The current state of the TCP flags for the flow. +See +.Aq Pa netinet/tcp_var.h +for information about the various flags. +.El +.Bl -tag -offset indent +.It Va 20 +The current retransmission timeout length for the flow, in units of HZ, where HZ +is the kernel's tick timer. +Divide by HZ to get the timeout length in seconds. HZ is reported in the +enable log message. +.El +.Bl -tag -offset indent +.It Va 21 +The current size of the socket send buffer in bytes. +.El +.Bl -tag -offset indent +.It Va 22 +The current number of bytes in the socket send buffer. +.El +.Bl -tag -offset indent +.It Va 23 +The current size of the socket receive buffer in bytes. +.El +.Bl -tag -offset indent +.It Va 24 +The current number of bytes in the socket receive buffer. +.El +.Bl -tag -offset indent +.It Va 25 +The current number of unacknowledged bytes in-flight. +Bytes acknowledged via SACK are not excluded from this count. +.El +.Pp +The third type of log message is written to the file when the module is disabled +and ceases collecting data from the running kernel. +The text below shows an example module disable log. +The fields are tab delimited key-value pairs which provide statistics about +operations since the module was most recently enabled. +.Bd -literal -offset indent +disable_time_secs=1238556197 disable_time_usecs=933607 \\ +num_inbound_tcp_pkts=356 num_outbound_tcp_pkts=627 \\ +total_tcp_pkts=983 num_inbound_skipped_pkts_malloc=0 \\ +num_outbound_skipped_pkts_malloc=0 num_inbound_skipped_pkts_mtx=0 \\ +num_outbound_skipped_pkts_mtx=0 num_inbound_skipped_pkts_tcb=0 \\ +num_outbound_skipped_pkts_tcb=0 num_inbound_skipped_pkts_icb=0 \\ +num_outbound_skipped_pkts_icb=0 total_skipped_tcp_pkts=0 \\ +flow_list=172.16.7.28;22-172.16.2.5;55931, +.Ed +.Pp +Field descriptions are as follows: +.Bl -tag -offset indent +.It Va disable_time_secs +Time at which the module was disabled, in seconds since the UNIX epoch. +.El +.Bl -tag -offset indent +.It Va disable_time_usecs +Time at which the module was disabled, in microseconds since disable_time_secs. +.El +.Bl -tag -offset indent +.It Va num_inbound_tcp_pkts +Number of TCP packets that traversed up the network stack. +This only includes inbound TCP packets during the periods when +.Nm +was enabled. +.El +.Bl -tag -offset indent +.It Va num_outbound_tcp_pkts +Number of TCP packets that traversed down the network stack. +This only includes outbound TCP packets during the periods when +.Nm +was enabled. +.El +.Bl -tag -offset indent +.It Va total_tcp_pkts +The summation of num_inbound_tcp_pkts and num_outbound_tcp_pkts. +.El +.Bl -tag -offset indent +.It Va num_inbound_skipped_pkts_malloc +Number of inbound packets that were not processed because of failed malloc() calls. +.El +.Bl -tag -offset indent +.It Va num_outbound_skipped_pkts_malloc +Number of outbound packets that were not processed because of failed malloc() calls. +.El +.Bl -tag -offset indent +.It Va num_inbound_skipped_pkts_mtx +Number of inbound packets that were not processed because of failure to add the +packet to the packet processing queue. +.El +.Bl -tag -offset indent +.It Va num_outbound_skipped_pkts_mtx +Number of outbound packets that were not processed because of failure to add the +packet to the packet processing queue. +.El +.Bl -tag -offset indent +.It Va num_inbound_skipped_pkts_tcb +Number of inbound packets that were not processed because of failure to find the +TCP control block associated with the packet. +.El +.Bl -tag -offset indent +.It Va num_outbound_skipped_pkts_tcb +Number of outbound packets that were not processed because of failure to find +the TCP control block associated with the packet. +.El +.Bl -tag -offset indent +.It Va num_inbound_skipped_pkts_icb +Number of inbound packets that were not processed because of failure to find the +IP control block associated with the packet. +.El +.Bl -tag -offset indent +.It Va num_outbound_skipped_pkts_icb +Number of outbound packets that were not processed because of failure to find +the IP control block associated with the packet. +.El +.Bl -tag -offset indent +.It Va total_skipped_tcp_pkts +The summation of all skipped packet counters. +.El +.Bl -tag -offset indent +.It Va flow_list +A CSV list of TCP flows that triggered data log messages to be generated since +the module was loaded. +Each flow entry in the CSV list is +formatted as +.Qq local_ip;local_port-foreign_ip;foreign_port . +If there are no entries in the list (i.e. no data log messages were generated), +the value will be blank. +If there is at least one entry in the list, a trailing comma will always be +present. +.El +.Pp +The total number of data log messages found in the log file for a module +enable/disable cycle should equate to total_tcp_pkts - total_skipped_tcp_pkts. +.Sh IMPLEMENTATION NOTES +.Nm +hooks into the network stack using the +.Xr pfil 9 +interface. +In its current incarnation, it hooks into the AF_INET/AF_INET6 (IPv4/IPv6) +.Xr pfil 9 +filtering points, which means it sees packets at the IP layer of the network +stack. +This means that TCP packets inbound to the stack are intercepted before +they have been processed by the TCP layer. +Packets outbound from the stack are intercepted after they have been processed +by the TCP layer. +.Pp +The diagram below illustrates how +.Nm +inserts itself into the stack. +.Bd -literal -offset indent +---------------------------------- + Upper Layers +---------------------------------- + ^ | + | | + | | + | v + TCP in TCP out +---------------------------------- + ^ | + |________ _________| + | | + | v + --------- + | SIFTR | + --------- + ^ | + ________| |__________ + | | + | v +IPv{4/6} in IPv{4/6} out +---------------------------------- + ^ | + | | + | v +Layer 2 in Layer 2 out +---------------------------------- + Physical Layer +---------------------------------- +.Ed +.Pp +.Nm +uses the +.Xr alq 9 +interface to manage writing data to disk. +.Pp +At first glance, you might mistakenly think that +.Nm +extracts information from +individual TCP packets. +This is not the case. +.Nm +uses TCP packet events (inbound and outbound) for each TCP flow originating from +the system to trigger a dump of the state of the TCP control block for that +flow. +With the PPL set to 1, we are in effect sampling each TCP flow's control block +state as frequently as flow packets enter/leave the system. +For example, setting PPL to 2 halves the sampling rate i.e. every second flow +packet (inbound OR outbound) causes a dump of the control block state. +.Pp +The distinction between interrogating individual packets vs interrogating the +control block is important, because +.Nm +does not remove the need for packet capturing tools like +.Xr tcpdump 1 . +.Nm +allows you to correlate and observe the cause-and-affect relationship between +what you see on the wire (captured using a tool like +.Xr tcpdump 1 Ns ) +and changes in the TCP control block corresponding to the flow of interest. +It is therefore useful to use +.Nm +and a tool like +.Xr tcpdump 1 +to gather the necessary data to piece together the complete picture. +Use of either tool on its own will not be able to provide all of the necessary +data. +.Pp +As a result of needing to interrogate the TCP control block, certain packets +during the lifecycle of a connection are unable to trigger a +.Nm +log message. +The initial handshake takes place without the existence of a control block and +the final ACK is exchanged when the connection is in the TIMEWAIT state. +.Pp +.Nm +was designed to minimise the delay introduced to packets traversing the network +stack. +This design called for a highly optimised and minimal hook function that +extracted the minimal details necessary whilst holding the packet up, and +passing these details to another thread for actual processing and logging. +.Pp +This multithreaded design does introduce some contention issues when accessing +the data structure shared between the threads of operation. +When the hook function tries to place details in the structure, it must first +acquire an exclusive lock. +Likewise, when the processing thread tries to read details from the structure, +it must also acquire an exclusive lock to do so. +If one thread holds the lock, the other must wait before it can obtain it. +This does introduce some additional bounded delay into the kernel's packet +processing code path. +.Pp +In some cases (e.g. low memory, connection termination), TCP packets that enter +the +.Nm +.Xr pfil 9 +hook function will not trigger a log message to be generated. +.Nm +refers to this outcome as a +.Qq skipped packet . +Note that +.Nm +always ensures that packets are allowed to continue through the stack, even if +they could not successfully trigger a data log message. +.Nm +will therefore not introduce any packet loss for TCP/IP packets traversing the +network stack. +.Ss Important Behaviours +The behaviour of a log file path change whilst the module is enabled is as +follows: +.Bl -enum +.It +Attempt to open the new file path for writing. +If this fails, the path change will fail and the existing path will continue to +be used. +.It +Assuming the new path is valid and opened successfully: +.Bl -dash +.It +Flush all pending log messages to the old file path. +.It +Close the old file path. +.It +Switch the active log file pointer to point at the new file path. +.It +Commence logging to the new file. +.El +.El +.Pp +During the time between the flush of pending log messages to the old file and +commencing logging to the new file, new log messages will still be generated and +buffered. +As soon as the new file path is ready for writing, the accumulated log messages +will be written out to the file. +.Sh EXAMPLES +To enable the module's operations, run the following command as root: +sysctl net.inet.siftr.enabled=1 +.Pp +To change the granularity of log messages such that 1 log message is +generated for every 10 TCP packets per connection, run the following +command as root: +sysctl net.inet.siftr.ppl=10 +.Pp +To change the log file location to /tmp/siftr.log, run the following +command as root: +sysctl net.inet.siftr.logfile=/tmp/siftr.log +.Sh SEE ALSO +.Xr alq 9 , +.Xr pfil 9 +.Xr sysctl 8 , +.Xr tcp 4 , +.Xr tcpdump 1 , +.Sh ACKNOWLEDGEMENTS +Development of this software was made possible in part by grants from the +Cisco University Research Program Fund at Community Foundation Silicon Valley, +and the FreeBSD Foundation. +.Sh HISTORY +.Nm +first appeared in +.Fx 9.0 . +.Pp +.Nm +was first released in 2007 by Lawrence Stewart and James Healy whilst working on +the NewTCP research project at Swinburne University's Centre for Advanced +Internet Architectures, Melbourne, Australia, which was made possible in part by +a grant from the Cisco University Research Program Fund at Community Foundation +Silicon Valley. +More details are available at: +.Pp +http://caia.swin.edu.au/urp/newtcp/ +.Pp +Work on +.Nm +v1.2.x was sponsored by the FreeBSD Foundation as part of +the +.Qq Enhancing the FreeBSD TCP Implementation +project 2008-2009. +More details are available at: +.Pp +http://www.freebsdfoundation.org/ +.Pp +http://caia.swin.edu.au/freebsd/etcp09/ +.Sh AUTHORS +.An -nosplit +.Nm +was written by +.An Lawrence Stewart Aq lstewart@FreeBSD.org +and +.An James Healy Aq jimmy@deefa.com . +.Pp +This manual page was written by +.An Lawrence Stewart Aq lstewart@FreeBSD.org . +.Sh BUGS +Current known limitations and any relevant workarounds are outlined below: +.Bl -dash +.It +The internal queue used to pass information between the threads of operation is +currently unbounded. +This allows +.Nm +to cope with bursty network traffic, but sustained high packet-per-second +traffic can cause exhaustion of kernel memory if the processing thread cannot +keep up with the packet rate. +.It +If using +.Nm +on a machine that is also running other modules utilising the +.Xr pfil 9 +framework e.g. +.Xr dummynet 4 , +.Xr ipfw 8 , +.Xr pf 4 Ns , +the order in which you load the modules is important. +You should kldload the other modules first, as this will ensure TCP packets +undergo any necessary manipulations before +.Nm +.Qq sees +and processes them. +.It +There is a known, harmless lock order reversal warning between the +.Xr pfil 9 +mutex and tcbinfo TCP lock reported by +.Xr witness 4 +when +.Nm +is enabled in a kernel compiled with +.Xr witness 4 +support. +.It +There is no way to filter which TCP flows you wish to capture data for. +Post processing is required to separate out data belonging to particular flows +of interest. +.It +The module does not detect deletion of the log file path. +New log messages will simply be lost if the log file being used by +.Nm +is deleted whilst the module is set to use the file. +Switching to a new log file using the +.Em net.inet.siftr.logfile +variable will create the new file and allow log messages to begin being written +to disk again. +The new log file path must differ from the path to the deleted file. +.It +The hash table used within the code is sized to hold 65536 flows. This is not a +hard limit, because chaining is used to handle collisions within the hash table +structure. +However, we suspect (based on analogies with other hash table performance data) +that the hash table look up performance (and therefore the module's packet +processing performance) will degrade in an exponential manner as the number of +unique flows handled in a module enable/disable cycle approaches and surpasses +65536. +.It +There is no garbage collection performed on the flow hash table. +The only way currently to flush it is to disable +.Nm . +.It +The PPL variable applies to packets that make it into the processing thread, +not total packets received in the hook function. +Packets are skipped before the PPL variable is applied, which means there may be +a slight discrepancy in the triggering of log messages. +For example, if PPL was set to 10, and the 8th packet since the last log message +is skipped, the 11th packet will actually trigger the log message to be +generated. +This is discussed in greater depth in CAIA technical report 070824A. +.It +At the time of writing, there was no simple way to hook into the TCP layer +to intercept packets. +.Nm Ap s +use of IP layer hook points means all IP +traffic will be processed by the +.Nm +.Xr pfil 9 +hook function, which introduces minor, but nonetheless unnecessary packet delay +and processing overhead on the system for non-TCP packets as well. +Hooking in at the IP layer is also not ideal from the data gathering point of +view. +Packets traversing up the stack will be intercepted and cause a log message +generation BEFORE they have been processed by the TCP layer, which means we +cannot observe the cause-and-affect relationship between inbound events and the +corresponding TCP control block as precisely as could be. +Ideally, +.Nm +should intercept packets after they have been processed by the TCP layer i.e. +intercept packets coming up the stack after they have been processed by +tcp_input(), and intercept packets coming down the stack after they have been +processed by tcp_output(). +The current code still gives satisfactory granularity though, as inbound events +tend to trigger outbound events, allowing the cause-and-effect to be observed +indirectly by capturing the state on outbound events as well. +.It +The +.Qq inflight bytes +value logged by +.Nm +does not take into account bytes that have been +.No SACK Ap ed +by the receiving host. +.It +Packet hash generation does not currently work for IPv6 based TCP packets. +.It +Compressed notation is not used for IPv6 address representation. +This consumes more bytes than is necessary in log output. +.El diff -r f746d1cf54ed sys/modules/Makefile --- a/sys/modules/Makefile Fri Jul 02 19:59:18 2010 +0000 +++ b/sys/modules/Makefile Sat Jul 03 18:21:30 2010 +1000 @@ -257,6 +257,7 @@ sf \ sge \ siba_bwn \ + siftr \ siis \ sis \ sk \ diff -r f746d1cf54ed sys/modules/siftr/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/modules/siftr/Makefile Sat Jul 03 18:21:30 2010 +1000 @@ -0,0 +1,12 @@ +# $FreeBSD$ + +.include + +.PATH: ${.CURDIR}/../../netinet +KMOD= siftr +SRCS= siftr.c + +# Uncomment to add IPv6 support +#CFLAGS+=-DSIFTR_IPV6 + +.include diff -r f746d1cf54ed sys/netinet/siftr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/siftr.c Sat Jul 03 18:21:30 2010 +1000 @@ -0,0 +1,1568 @@ +/*- + * Copyright (c) 2007-2009, Centre for Advanced Internet Architectures + * Swinburne University of Technology, Melbourne, Australia + * (CRICOS number 00111D). + * Copyright (c) 2009-2010, The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/****************************************************** + * Statistical Information For TCP Research (SIFTR) + * + * A FreeBSD kernel module that adds very basic intrumentation to the + * TCP stack, allowing internal stats to be recorded to a log file + * for experimental, debugging and performance analysis purposes. + * + * SIFTR was first released in 2007 by James Healy and Lawrence Stewart whilst + * working on the NewTCP research project at Swinburne University's Centre for + * Advanced Internet Architectures, Melbourne, Australia, which was made + * possible in part by a grant from the Cisco University Research Program Fund + * at Community Foundation Silicon Valley. More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + * + * Work on SIFTR v1.2.x was sponsored by the FreeBSD Foundation as part of + * the "Enhancing the FreeBSD TCP Implementation" project 2008-2009. + * More details are available at: + * http://www.freebsdfoundation.org/ + * http://caia.swin.edu.au/freebsd/etcp09/ + * + * Lawrence Stewart is the current maintainer, and all contact regarding + * SIFTR should be directed to him via email: lastewart@swin.edu.au + * + * Initial release date: June 2007 + * Most recent update: June 2010 + ******************************************************/ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef SIFTR_IPV6 +#include +#include +#endif /* SIFTR_IPV6 */ + +#include + +/* + * Three digit version number refers to X.Y.Z where: + * X is the major version number + * Y is bumped to mark backwards incompatible changes + * Z is bumped to mark backwards compatible changes + */ +#define V_MAJOR 1 +#define V_BACKBREAK 2 +#define V_BACKCOMPAT 3 +#define MODVERSION __CONCAT(V_MAJOR, __CONCAT(V_BACKBREAK, V_BACKCOMPAT)) +#define MODVERSION_STR __XSTRING(V_MAJOR) "." __XSTRING(V_BACKBREAK) "." \ + __XSTRING(V_BACKCOMPAT) + +#define HOOK 0 +#define UNHOOK 1 +#define SIFTR_EXPECTED_MAX_TCP_FLOWS 65536 +#define SYS_NAME "FreeBSD" +#define PACKET_TAG_SIFTR 100 +#define PACKET_COOKIE_SIFTR 21749576 +#define SIFTR_LOG_FILE_MODE 0644 +#define SIFTR_DISABLE 0 +#define SIFTR_ENABLE 1 + +/* + * Hard upper limit on the length of log messages. Bump this up if you add new + * data fields such that the line length could exceed the below value. + */ +#define MAX_LOG_MSG_LEN 200 +/* XXX: Make this a sysctl tunable. */ +#define SIFTR_ALQ_BUFLEN (1000*MAX_LOG_MSG_LEN) + +/* + * 1 byte for IP version + * IPv4: src/dst IP (4+4) + src/dst port (2+2) = 12 bytes + * IPv6: src/dst IP (16+16) + src/dst port (2+2) = 36 bytes + */ +#ifdef SIFTR_IPV6 +#define FLOW_KEY_LEN 37 +#else +#define FLOW_KEY_LEN 13 +#endif + +#ifdef SIFTR_IPV6 +#define SIFTR_IPMODE 6 +#else +#define SIFTR_IPMODE 4 +#endif + +/* useful macros */ +#define CAST_PTR_INT(X) (*((int*)(X))) + +#define UPPER_SHORT(X) (((X) & 0xFFFF0000) >> 16) +#define LOWER_SHORT(X) ((X) & 0x0000FFFF) + +#define FIRST_OCTET(X) (((X) & 0xFF000000) >> 24) +#define SECOND_OCTET(X) (((X) & 0x00FF0000) >> 16) +#define THIRD_OCTET(X) (((X) & 0x0000FF00) >> 8) +#define FOURTH_OCTET(X) ((X) & 0x000000FF) + +MALLOC_DECLARE(M_SIFTR); +MALLOC_DEFINE(M_SIFTR, "siftr", "dynamic memory used by SIFTR"); + +MALLOC_DECLARE(M_SIFTR_PKTNODE); +MALLOC_DEFINE(M_SIFTR_PKTNODE, "siftr_pktnode", "SIFTR pkt_node struct"); + +MALLOC_DECLARE(M_SIFTR_HASHNODE); +MALLOC_DEFINE(M_SIFTR_HASHNODE, "siftr_hashnode", "SIFTR flow_hash_node struct"); + +/* Used as links in the pkt manager queue. */ +struct pkt_node { + /* Timestamp of pkt as noted in the pfil hook. */ + struct timeval tval; + /* Direction pkt is travelling; either PFIL_IN or PFIL_OUT. */ + uint8_t direction; + /* IP version pkt_node relates to; either INP_IPV4 or INP_IPV6. */ + uint8_t ipver; + /* Hash of the pkt which triggered the log message. */ + uint32_t hash; + /* Local/foreign IP address. */ +#ifdef SIFTR_IPV6 + uint32_t ip_laddr[4]; + uint32_t ip_faddr[4]; +#else + uint8_t ip_laddr[4]; + uint8_t ip_faddr[4]; +#endif + /* Local TCP port. */ + uint16_t tcp_localport; + /* Foreign TCP port. */ + uint16_t tcp_foreignport; + /* Congestion Window (bytes). */ + u_long snd_cwnd; + /* Sending Window (bytes). */ + u_long snd_wnd; + /* Receive Window (bytes). */ + u_long rcv_wnd; + /* Bandwidth Controlled Window (bytes). */ + u_long snd_bwnd; + /* Slow Start Threshold (bytes). */ + u_long snd_ssthresh; + /* Current state of the TCP FSM. */ + int conn_state; + /* Max Segment Size (bytes). */ + u_int max_seg_size; + /* + * Smoothed RTT stored as found in the TCP control block + * in units of (TCP_RTT_SCALE*hz). + */ + int smoothed_rtt; + /* Is SACK enabled? */ + u_char sack_enabled; + /* Window scaling for snd window. */ + u_char snd_scale; + /* Window scaling for recv window. */ + u_char rcv_scale; + /* TCP control block flags. */ + u_int flags; + /* Retransmit timeout length. */ + int rxt_length; + /* Size of the TCP send buffer in bytes. */ + u_int snd_buf_hiwater; + /* Current num bytes in the send socket buffer. */ + u_int snd_buf_cc; + /* Size of the TCP receive buffer in bytes. */ + u_int rcv_buf_hiwater; + /* Current num bytes in the receive socket buffer. */ + u_int rcv_buf_cc; + /* Number of bytes inflight that we are waiting on ACKs for. */ + u_int sent_inflight_bytes; + /* Link to next pkt_node in the list. */ + STAILQ_ENTRY(pkt_node) nodes; +}; + +struct flow_hash_node +{ + uint16_t counter; + uint8_t key[FLOW_KEY_LEN]; + LIST_ENTRY(flow_hash_node) nodes; +}; + +struct siftr_stats +{ + /* # TCP pkts seen by the SIFTR PFIL hooks, including any skipped. */ + uint64_t n_in; + uint64_t n_out; + /* # pkts skipped due to failed malloc calls. */ + uint32_t nskip_in_malloc; + uint32_t nskip_out_malloc; + /* # pkts skipped due to failed mtx acquisition. */ + uint32_t nskip_in_mtx; + uint32_t nskip_out_mtx; + /* # pkts skipped due to failed inpcb lookups. */ + uint32_t nskip_in_inpcb; + uint32_t nskip_out_inpcb; + /* # pkts skipped due to failed tcpcb lookups. */ + uint32_t nskip_in_tcpcb; + uint32_t nskip_out_tcpcb; + /* # pkts skipped due to stack reinjection. */ + uint32_t nskip_in_dejavu; + uint32_t nskip_out_dejavu; +}; + +static DPCPU_DEFINE(struct siftr_stats, ss); + +static volatile unsigned int siftr_exit_pkt_manager_thread = 0; +static unsigned int siftr_enabled = 0; +static unsigned int siftr_pkts_per_log = 1; +static unsigned int siftr_generate_hashes = 0; +/* static unsigned int siftr_binary_log = 0; */ +static char siftr_logfile[PATH_MAX] = "/var/log/siftr.log"; +static u_long siftr_hashmask; +STAILQ_HEAD(pkthead, pkt_node) pkt_queue = STAILQ_HEAD_INITIALIZER(pkt_queue); +LIST_HEAD(listhead, flow_hash_node) *counter_hash; +static int wait_for_pkt; +static struct alq *siftr_alq = NULL; +static struct mtx siftr_pkt_queue_mtx; +static struct mtx siftr_pkt_mgr_mtx; +static struct thread *siftr_pkt_manager_thr = NULL; +/* + * pfil.h defines PFIL_IN as 1 and PFIL_OUT as 2, + * which we use as an index into this array. + */ +static char direction[3] = {'\0', 'i','o'}; + +/* Required function prototypes. */ +static int siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS); +static int siftr_sysctl_logfile_name_handler(SYSCTL_HANDLER_ARGS); + + +/* Declare the net.inet.siftr sysctl tree and populate it. */ + +SYSCTL_DECL(_net_inet_siftr); + +SYSCTL_NODE(_net_inet, OID_AUTO, siftr, CTLFLAG_RW, NULL, + "siftr related settings"); + +SYSCTL_PROC(_net_inet_siftr, OID_AUTO, enabled, CTLTYPE_UINT|CTLFLAG_RW, + &siftr_enabled, 0, &siftr_sysctl_enabled_handler, "IU", + "switch siftr module operations on/off"); + +SYSCTL_PROC(_net_inet_siftr, OID_AUTO, logfile, CTLTYPE_STRING|CTLFLAG_RW, + &siftr_logfile, sizeof(siftr_logfile), &siftr_sysctl_logfile_name_handler, + "A", "file to save siftr log messages to"); + +SYSCTL_UINT(_net_inet_siftr, OID_AUTO, ppl, CTLFLAG_RW, + &siftr_pkts_per_log, 1, + "number of packets between generating a log message"); + +SYSCTL_UINT(_net_inet_siftr, OID_AUTO, genhashes, CTLFLAG_RW, + &siftr_generate_hashes, 0, + "enable packet hash generation"); + +/* XXX: TODO +SYSCTL_UINT(_net_inet_siftr, OID_AUTO, binary, CTLFLAG_RW, + &siftr_binary_log, 0, + "write log files in binary instead of ascii"); +*/ + + +/* Begin functions. */ + +static void +siftr_process_pkt(struct pkt_node * pkt_node) +{ + struct flow_hash_node *hash_node; + struct listhead *counter_list; + struct siftr_stats *ss; + struct ale *log_buf; + uint8_t key[FLOW_KEY_LEN]; + uint8_t found_match, key_offset; + + hash_node = NULL; + ss = DPCPU_PTR(ss); + found_match = 0; + key_offset = 1; + + /* + * Create the key that will be used to create a hash index + * into our hash table. Our key consists of: + * ipversion, localip, localport, foreignip, foreignport + */ + key[0] = pkt_node->ipver; + memcpy(key + key_offset, &pkt_node->ip_laddr, + sizeof(pkt_node->ip_laddr)); + key_offset += sizeof(pkt_node->ip_laddr); + memcpy(key + key_offset, &pkt_node->tcp_localport, + sizeof(pkt_node->tcp_localport)); + key_offset += sizeof(pkt_node->tcp_localport); + memcpy(key + key_offset, &pkt_node->ip_faddr, + sizeof(pkt_node->ip_faddr)); + key_offset += sizeof(pkt_node->ip_faddr); + memcpy(key + key_offset, &pkt_node->tcp_foreignport, + sizeof(pkt_node->tcp_foreignport)); + + counter_list = counter_hash + + (hash32_buf(key, sizeof(key), 0) & siftr_hashmask); + + /* + * If the list is not empty i.e. the hash index has + * been used by another flow previously. + */ + if (LIST_FIRST(counter_list) != NULL) { + /* + * Loop through the hash nodes in the list. + * There should normally only be 1 hash node in the list, + * except if there have been collisions at the hash index + * computed by hash32_buf(). + */ + LIST_FOREACH(hash_node, counter_list, nodes) { + /* + * Check if the key for the pkt we are currently + * processing is the same as the key stored in the + * hash node we are currently processing. + * If they are the same, then we've found the + * hash node that stores the counter for the flow + * the pkt belongs to. + */ + if (memcmp(hash_node->key, key, sizeof(key)) == 0) { + found_match = 1; + break; + } + } + } + + /* If this flow hash hasn't been seen before or we have a collision. */ + if (hash_node == NULL || !found_match) { + /* Create a new hash node to store the flow's counter. */ + hash_node = malloc(sizeof(struct flow_hash_node), + M_SIFTR_HASHNODE, M_WAITOK); + + if (hash_node != NULL) { + /* Initialise our new hash node list entry. */ + hash_node->counter = 0; + memcpy(hash_node->key, key, sizeof(key)); + LIST_INSERT_HEAD(counter_list, hash_node, nodes); + } else { + /* Malloc failed. */ + if (pkt_node->direction == PFIL_IN) + ss->nskip_in_malloc++; + else + ss->nskip_out_malloc++; + + return; + } + } else if (siftr_pkts_per_log > 1) { + /* + * Taking the remainder of the counter divided + * by the current value of siftr_pkts_per_log + * and storing that in counter provides a neat + * way to modulate the frequency of log + * messages being written to the log file. + */ + hash_node->counter = (hash_node->counter + 1) % + siftr_pkts_per_log; + + /* + * If we have not seen enough packets since the last time + * we wrote a log message for this connection, return. + */ + if (hash_node->counter > 0) + return; + } + + log_buf = alq_getn(siftr_alq, MAX_LOG_MSG_LEN, ALQ_WAITOK); + + if (log_buf == NULL) + return; /* Should only happen if the ALQ is shutting down. */ + +#ifdef SIFTR_IPV6 + pkt_node->ip_laddr[3] = ntohl(pkt_node->ip_laddr[3]); + pkt_node->ip_faddr[3] = ntohl(pkt_node->ip_faddr[3]); + + if (pkt_node->ipver == INP_IPV6) { /* IPv6 packet */ + pkt_node->ip_laddr[0] = ntohl(pkt_node->ip_laddr[0]); + pkt_node->ip_laddr[1] = ntohl(pkt_node->ip_laddr[1]); + pkt_node->ip_laddr[2] = ntohl(pkt_node->ip_laddr[2]); + pkt_node->ip_faddr[0] = ntohl(pkt_node->ip_faddr[0]); + pkt_node->ip_faddr[1] = ntohl(pkt_node->ip_faddr[1]); + pkt_node->ip_faddr[2] = ntohl(pkt_node->ip_faddr[2]); + + /* Construct an IPv6 log message. */ + log_buf->ae_bytesused = snprintf(log_buf->ae_data, + MAX_LOG_MSG_LEN, + "%c,0x%08x,%zd.%06ld,%x:%x:%x:%x:%x:%x:%x:%x,%u,%x:%x:%x:" + "%x:%x:%x:%x:%x,%u,%ld,%ld,%ld,%ld,%ld,%u,%u,%u,%u,%u,%u," + "%u,%d,%u,%u,%u,%u,%u\n", + direction[pkt_node->direction], + pkt_node->hash, + pkt_node->tval.tv_sec, + pkt_node->tval.tv_usec, + UPPER_SHORT(pkt_node->ip_laddr[0]), + LOWER_SHORT(pkt_node->ip_laddr[0]), + UPPER_SHORT(pkt_node->ip_laddr[1]), + LOWER_SHORT(pkt_node->ip_laddr[1]), + UPPER_SHORT(pkt_node->ip_laddr[2]), + LOWER_SHORT(pkt_node->ip_laddr[2]), + UPPER_SHORT(pkt_node->ip_laddr[3]), + LOWER_SHORT(pkt_node->ip_laddr[3]), + ntohs(pkt_node->tcp_localport), + UPPER_SHORT(pkt_node->ip_faddr[0]), + LOWER_SHORT(pkt_node->ip_faddr[0]), + UPPER_SHORT(pkt_node->ip_faddr[1]), + LOWER_SHORT(pkt_node->ip_faddr[1]), + UPPER_SHORT(pkt_node->ip_faddr[2]), + LOWER_SHORT(pkt_node->ip_faddr[2]), + UPPER_SHORT(pkt_node->ip_faddr[3]), + LOWER_SHORT(pkt_node->ip_faddr[3]), + ntohs(pkt_node->tcp_foreignport), + pkt_node->snd_ssthresh, + pkt_node->snd_cwnd, + pkt_node->snd_bwnd, + pkt_node->snd_wnd, + pkt_node->rcv_wnd, + pkt_node->snd_scale, + pkt_node->rcv_scale, + pkt_node->conn_state, + pkt_node->max_seg_size, + pkt_node->smoothed_rtt, + pkt_node->sack_enabled, + pkt_node->flags, + pkt_node->rxt_length, + pkt_node->snd_buf_hiwater, + pkt_node->snd_buf_cc, + pkt_node->rcv_buf_hiwater, + pkt_node->rcv_buf_cc, + pkt_node->sent_inflight_bytes); + } else { /* IPv4 packet */ + pkt_node->ip_laddr[0] = FIRST_OCTET(pkt_node->ip_laddr[3]); + pkt_node->ip_laddr[1] = SECOND_OCTET(pkt_node->ip_laddr[3]); + pkt_node->ip_laddr[2] = THIRD_OCTET(pkt_node->ip_laddr[3]); + pkt_node->ip_laddr[3] = FOURTH_OCTET(pkt_node->ip_laddr[3]); + pkt_node->ip_faddr[0] = FIRST_OCTET(pkt_node->ip_faddr[3]); + pkt_node->ip_faddr[1] = SECOND_OCTET(pkt_node->ip_faddr[3]); + pkt_node->ip_faddr[2] = THIRD_OCTET(pkt_node->ip_faddr[3]); + pkt_node->ip_faddr[3] = FOURTH_OCTET(pkt_node->ip_faddr[3]); +#endif /* SIFTR_IPV6 */ + + /* Construct an IPv4 log message. */ + log_buf->ae_bytesused = snprintf(log_buf->ae_data, + MAX_LOG_MSG_LEN, + "%c,0x%08x,%jd.%06ld,%u.%u.%u.%u,%u,%u.%u.%u.%u,%u,%ld,%ld," + "%ld,%ld,%ld,%u,%u,%u,%u,%u,%u,%u,%d,%u,%u,%u,%u,%u\n", + direction[pkt_node->direction], + pkt_node->hash, + (intmax_t)pkt_node->tval.tv_sec, + pkt_node->tval.tv_usec, + pkt_node->ip_laddr[0], + pkt_node->ip_laddr[1], + pkt_node->ip_laddr[2], + pkt_node->ip_laddr[3], + ntohs(pkt_node->tcp_localport), + pkt_node->ip_faddr[0], + pkt_node->ip_faddr[1], + pkt_node->ip_faddr[2], + pkt_node->ip_faddr[3], + ntohs(pkt_node->tcp_foreignport), + pkt_node->snd_ssthresh, + pkt_node->snd_cwnd, + pkt_node->snd_bwnd, + pkt_node->snd_wnd, + pkt_node->rcv_wnd, + pkt_node->snd_scale, + pkt_node->rcv_scale, + pkt_node->conn_state, + pkt_node->max_seg_size, + pkt_node->smoothed_rtt, + pkt_node->sack_enabled, + pkt_node->flags, + pkt_node->rxt_length, + pkt_node->snd_buf_hiwater, + pkt_node->snd_buf_cc, + pkt_node->rcv_buf_hiwater, + pkt_node->rcv_buf_cc, + pkt_node->sent_inflight_bytes); +#ifdef SIFTR_IPV6 + } +#endif + + alq_post_flags(siftr_alq, log_buf, 0); +} + + +static void +siftr_pkt_manager_thread(void *arg) +{ + STAILQ_HEAD(pkthead, pkt_node) tmp_pkt_queue = + STAILQ_HEAD_INITIALIZER(tmp_pkt_queue); + struct pkt_node *pkt_node, *pkt_node_temp; + uint8_t draining; + + draining = 2; + + mtx_lock(&siftr_pkt_mgr_mtx); + + /* draining == 0 when queue has been flushed and it's safe to exit. */ + while (draining) { + /* + * Sleep until we are signalled to wake because thread has + * been told to exit or until 1 tick has passed. + */ + mtx_sleep(&wait_for_pkt, &siftr_pkt_mgr_mtx, PWAIT, "pktwait", + 1); + + /* Gain exclusive access to the pkt_node queue. */ + mtx_lock(&siftr_pkt_queue_mtx); + + /* + * Move pkt_queue to tmp_pkt_queue, which leaves + * pkt_queue empty and ready to receive more pkt_nodes. + */ + STAILQ_CONCAT(&tmp_pkt_queue, &pkt_queue); + + /* + * We've finished making changes to the list. Unlock it + * so the pfil hooks can continue queuing pkt_nodes. + */ + mtx_unlock(&siftr_pkt_queue_mtx); + + /* + * We can't hold a mutex whilst calling siftr_process_pkt + * because ALQ might sleep waiting for buffer space. + */ + mtx_unlock(&siftr_pkt_mgr_mtx); + + /* Flush all pkt_nodes to the log file. */ + STAILQ_FOREACH_SAFE(pkt_node, &tmp_pkt_queue, nodes, + pkt_node_temp) { + siftr_process_pkt(pkt_node); + STAILQ_REMOVE_HEAD(&tmp_pkt_queue, nodes); + free(pkt_node, M_SIFTR_PKTNODE); + } + + KASSERT(STAILQ_EMPTY(&tmp_pkt_queue), + ("SIFTR tmp_pkt_queue not empty after flush")); + + mtx_lock(&siftr_pkt_mgr_mtx); + + /* + * If siftr_exit_pkt_manager_thread gets set during the window + * where we are draining the tmp_pkt_queue above, there might + * still be pkts in pkt_queue that need to be drained. + * Allow one further iteration to occur after + * siftr_exit_pkt_manager_thread has been set to ensure + * pkt_queue is completely empty before we kill the thread. + * + * siftr_exit_pkt_manager_thread is set only after the pfil + * hooks have been removed, so only 1 extra iteration + * is needed to drain the queue. + */ + if (siftr_exit_pkt_manager_thread) + draining--; + } + + mtx_unlock(&siftr_pkt_mgr_mtx); + + /* Calls wakeup on this thread's struct thread ptr. */ + kthread_exit(); +} + + +static uint32_t +hash_pkt(struct mbuf *m, uint32_t offset) +{ + uint32_t hash; + + hash = 0; + + while (m != NULL && offset > m->m_len) { + /* + * The IP packet payload does not start in this mbuf, so + * need to figure out which mbuf it starts in and what offset + * into the mbuf's data region the payload starts at. + */ + offset -= m->m_len; + m = m->m_next; + } + + while (m != NULL) { + /* Ensure there is data in the mbuf */ + if ((m->m_len - offset) > 0) + hash = hash32_buf(m->m_data + offset, + m->m_len - offset, hash); + + m = m->m_next; + offset = 0; + } + + return (hash); +} + + +/* + * Check if a given mbuf has the SIFTR mbuf tag. If it does, log the fact that + * it's a reinjected packet and return. If it doesn't, tag the mbuf and return. + * Return value >0 means the caller should skip processing this mbuf. + */ +static inline int +siftr_chkreinject(struct mbuf *m, int dir, struct siftr_stats *ss) +{ + if (m_tag_locate(m, PACKET_COOKIE_SIFTR, PACKET_TAG_SIFTR, NULL) + != NULL) { + if (dir == PFIL_IN) + ss->nskip_in_dejavu++; + else + ss->nskip_out_dejavu++; + + return (1); + } else { + struct m_tag *tag = m_tag_alloc(PACKET_COOKIE_SIFTR, + PACKET_TAG_SIFTR, 0, M_NOWAIT); + if (tag == NULL) { + if (dir == PFIL_IN) + ss->nskip_in_malloc++; + else + ss->nskip_out_malloc++; + + return (1); + } + + m_tag_prepend(m, tag); + } + + return (0); +} + + +/* + * Look up an inpcb for a packet. Return the inpcb pointer if found, or NULL + * otherwise. + */ +static inline struct inpcb * +siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport, + uint16_t dport, int dir, struct siftr_stats *ss) +{ + struct inpcb *inp; + + /* We need the tcbinfo lock. */ + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); + + if (dir == PFIL_IN) + inp = (ipver == INP_IPV4 ? + in_pcblookup_hash(&V_tcbinfo, ip->ip_src, sport, ip->ip_dst, + dport, 0, m->m_pkthdr.rcvif) + : +#ifdef SIFTR_IPV6 + in6_pcblookup_hash(&V_tcbinfo, + &((struct ip6_hdr *)ip)->ip6_src, sport, + &((struct ip6_hdr *)ip)->ip6_dst, dport, 0, + m->m_pkthdr.rcvif) +#else + NULL +#endif + ); + + else + inp = (ipver == INP_IPV4 ? + in_pcblookup_hash(&V_tcbinfo, ip->ip_dst, dport, ip->ip_src, + sport, 0, m->m_pkthdr.rcvif) + : +#ifdef SIFTR_IPV6 + in6_pcblookup_hash(&V_tcbinfo, + &((struct ip6_hdr *)ip)->ip6_dst, dport, + &((struct ip6_hdr *)ip)->ip6_src, sport, 0, + m->m_pkthdr.rcvif) +#else + NULL +#endif + ); + + /* If we can't find the inpcb, bail. */ + if (inp == NULL) { + if (dir == PFIL_IN) + ss->nskip_in_inpcb++; + else + ss->nskip_out_inpcb++; + + INP_INFO_RUNLOCK(&V_tcbinfo); + } else { + /* Acquire the inpcb lock. */ + INP_UNLOCK_ASSERT(inp); + INP_RLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + } + + return (inp); +} + + +/* + * pfil hook that is called for each IPv4 packet making its way through the + * stack in either direction. + * The pfil subsystem holds a non-sleepable mutex somewhere when + * calling our hook function, so we can't sleep at all. + * It's very important to use the M_NOWAIT flag with all function calls + * that support it so that they won't sleep, otherwise you get a panic. + */ +static int +siftr_chkpkt(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + struct pkt_node *pkt_node; + struct ip *ip; + struct tcphdr *th; + struct tcpcb *tp; + struct siftr_stats *ss; + unsigned int ip_hl; + uint8_t inp_locally_locked; + + inp_locally_locked = 0; + ss = DPCPU_PTR(ss); + + /* + * m_pullup is not required here because ip_{input|output} + * already do the heavy lifting for us. + */ + + ip = mtod(*m, struct ip *); + + /* Only continue processing if the packet is TCP. */ + if (ip->ip_p != IPPROTO_TCP) + goto ret; + + /* + * If a kernel subsystem reinjects packets into the stack, our pfil + * hook will be called multiple times for the same packet. + * Make sure we only process unique packets. + */ + if (siftr_chkreinject(*m, dir, ss)) + goto ret; + + if (dir == PFIL_IN) + ss->n_in++; + else + ss->n_out++; + + /* + * Create a tcphdr struct starting at the correct offset + * in the IP packet. ip->ip_hl gives the ip header length + * in 4-byte words, so multiply it to get the size in bytes. + */ + ip_hl = (ip->ip_hl << 2); + th = (struct tcphdr *)((caddr_t)ip + ip_hl); + + /* + * If the pfil hooks don't provide a pointer to the + * inpcb, we need to find it ourselves and lock it. + */ + if (!inp) { + /* Find the corresponding inpcb for this pkt. */ + inp = siftr_findinpcb(INP_IPV4, ip, *m, th->th_sport, + th->th_dport, dir, ss); + + if (inp == NULL) + goto ret; + else + inp_locally_locked = 1; + } + + INP_LOCK_ASSERT(inp); + + pkt_node = malloc(sizeof(struct pkt_node), M_SIFTR_PKTNODE, + M_NOWAIT | M_ZERO); + + if (pkt_node == NULL) { + if (dir == PFIL_IN) + ss->nskip_in_malloc++; + else + ss->nskip_out_malloc++; + + goto inp_unlock; + } + + /* Find the TCP control block that corresponds with this packet */ + tp = intotcpcb(inp); + + /* + * If we can't find the TCP control block (happens occasionaly for a + * packet sent during the shutdown phase of a TCP connection), + * or we're in the timewait state, bail + */ + if (tp == NULL || inp->inp_flags & INP_TIMEWAIT) { + if (dir == PFIL_IN) + ss->nskip_in_tcpcb++; + else + ss->nskip_out_tcpcb++; + + free(pkt_node, M_SIFTR_PKTNODE); + goto inp_unlock; + } + + /* Fill in pkt_node data */ +#ifdef SIFTR_IPV6 + pkt_node->ip_laddr[3] = inp->inp_laddr.s_addr; + pkt_node->ip_faddr[3] = inp->inp_faddr.s_addr; +#else + *((uint32_t *)pkt_node->ip_laddr) = inp->inp_laddr.s_addr; + *((uint32_t *)pkt_node->ip_faddr) = inp->inp_faddr.s_addr; +#endif + pkt_node->tcp_localport = inp->inp_lport; + pkt_node->tcp_foreignport = inp->inp_fport; + pkt_node->snd_cwnd = tp->snd_cwnd; + pkt_node->snd_wnd = tp->snd_wnd; + pkt_node->rcv_wnd = tp->rcv_wnd; + pkt_node->snd_bwnd = tp->snd_bwnd; + pkt_node->snd_ssthresh = tp->snd_ssthresh; + pkt_node->snd_scale = tp->snd_scale; + pkt_node->rcv_scale = tp->rcv_scale; + pkt_node->conn_state = tp->t_state; + pkt_node->max_seg_size = tp->t_maxseg; + pkt_node->smoothed_rtt = tp->t_srtt; + pkt_node->sack_enabled = (tp->t_flags & TF_SACK_PERMIT) != 0; + pkt_node->flags = tp->t_flags; + pkt_node->rxt_length = tp->t_rxtcur; + pkt_node->snd_buf_hiwater = inp->inp_socket->so_snd.sb_hiwat; + pkt_node->snd_buf_cc = inp->inp_socket->so_snd.sb_cc; + pkt_node->rcv_buf_hiwater = inp->inp_socket->so_rcv.sb_hiwat; + pkt_node->rcv_buf_cc = inp->inp_socket->so_rcv.sb_cc; + pkt_node->sent_inflight_bytes = tp->snd_max - tp->snd_una; + + /* We've finished accessing the tcb so release the lock. */ + if (inp_locally_locked) + INP_RUNLOCK(inp); + + /* These are safe to access without the inp lock. */ + pkt_node->ipver = INP_IPV4; + pkt_node->direction = dir; + + /* + * Significantly more accurate than using getmicrotime(), but slower! + * Gives true microsecond resolution at the expense of a hit to + * maximum pps throughput processing when SIFTR is loaded and enabled. + */ + microtime(&(pkt_node->tval)); + + if (siftr_generate_hashes) { + if ((*m)->m_pkthdr.csum_flags & CSUM_TCP) { + /* + * For outbound packets, the TCP checksum isn't + * calculated yet. This is a problem for our packet + * hashing as the receiver will calc a different hash + * to ours if we don't include the correct TCP checksum + * in the bytes being hashed. To work around this + * problem, we manually calc the TCP checksum here in + * software. We unset the CSUM_TCP flag so the lower + * layers don't recalc it. + */ + (*m)->m_pkthdr.csum_flags &= ~CSUM_TCP; + + /* + * Calculate the TCP checksum in software and assign + * to correct TCP header field, which will follow the + * packet mbuf down the stack. The trick here is that + * tcp_output() sets th->th_sum to the checksum of the + * pseudo header for us already. Because of the nature + * of the checksumming algorithm, we can sum over the + * entire IP payload (i.e. TCP header and data), which + * will include the already calculated pseduo header + * checksum, thus giving us the complete TCP checksum. + * + * To put it in simple terms, if checksum(1,2,3,4)=10, + * then checksum(1,2,3,4,5) == checksum(10,5). + * This property is what allows us to "cheat" and + * checksum only the IP payload which has the TCP + * th_sum field populated with the pseudo header's + * checksum, and not need to futz around checksumming + * pseudo header bytes and TCP header/data in one hit. + * Refer to RFC 1071 for more info. + * + * NB: in_cksum_skip(struct mbuf *m, int len, int skip) + * in_cksum_skip 2nd argument is NOT the number of + * bytes to read from the mbuf at "skip" bytes offset + * from the start of the mbuf (very counter intuitive!). + * The number of bytes to read is calculated internally + * by the function as len-skip i.e. to sum over the IP + * payload (TCP header + data) bytes, it is INCORRECT + * to call the function like this: + * in_cksum_skip(at, ip->ip_len - offset, offset) + * Rather, it should be called like this: + * in_cksum_skip(at, ip->ip_len, offset) + * which means read "ip->ip_len - offset" bytes from + * the mbuf cluster "at" at offset "offset" bytes from + * the beginning of the "at" mbuf's data pointer. + */ + th->th_sum = in_cksum_skip(*m, ip->ip_len, ip_hl); + } + + /* + * XXX: Having to calculate the checksum in software and then + * hash over all bytes is really inefficient. Would be nice to + * find a way to create the hash and checksum in the same pass + * over the bytes. + */ + pkt_node->hash = hash_pkt(*m, ip_hl); + } + + mtx_lock(&siftr_pkt_queue_mtx); + STAILQ_INSERT_TAIL(&pkt_queue, pkt_node, nodes); + mtx_unlock(&siftr_pkt_queue_mtx); + goto ret; + +inp_unlock: + if (inp_locally_locked) + INP_RUNLOCK(inp); + +ret: + /* Returning 0 ensures pfil will not discard the pkt */ + return (0); +} + + +#ifdef SIFTR_IPV6 +static int +siftr_chkpkt6(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + struct pkt_node *pkt_node; + struct ip6_hdr *ip6; + struct tcphdr *th; + struct tcpcb *tp; + struct siftr_stats *ss; + unsigned int ip6_hl; + uint8_t inp_locally_locked; + + inp_locally_locked = 0; + ss = DPCPU_PTR(ss); + + /* + * m_pullup is not required here because ip6_{input|output} + * already do the heavy lifting for us. + */ + + ip6 = mtod(*m, struct ip6_hdr *); + + /* + * Only continue processing if the packet is TCP + * XXX: We should follow the next header fields + * as shown on Pg 6 RFC 2460, but right now we'll + * only check pkts that have no extension headers. + */ + if (ip6->ip6_nxt != IPPROTO_TCP) + goto ret6; + + /* + * If a kernel subsystem reinjects packets into the stack, our pfil + * hook will be called multiple times for the same packet. + * Make sure we only process unique packets. + */ + if (siftr_chkreinject(*m, dir, ss)) + goto ret6; + + if (dir == PFIL_IN) + ss->n_in++; + else + ss->n_out++; + + ip6_hl = sizeof(struct ip6_hdr); + + /* + * Create a tcphdr struct starting at the correct offset + * in the ipv6 packet. ip->ip_hl gives the ip header length + * in 4-byte words, so multiply it to get the size in bytes. + */ + th = (struct tcphdr *)((caddr_t)ip6 + ip6_hl); + + /* + * For inbound packets, the pfil hooks don't provide a pointer to the + * inpcb, so we need to find it ourselves and lock it. + */ + if (!inp) { + /* Find the corresponding inpcb for this pkt. */ + inp = siftr_findinpcb(INP_IPV6, (struct ip *)ip6, *m, + th->th_sport, th->th_dport, dir, ss); + + if (inp == NULL) + goto ret6; + else + inp_locally_locked = 1; + } + + pkt_node = malloc(sizeof(struct pkt_node), M_SIFTR_PKTNODE, + M_NOWAIT | M_ZERO); + + if (pkt_node == NULL) { + if (dir == PFIL_IN) + ss->nskip_in_malloc++; + else + ss->nskip_out_malloc++; + + goto inp_unlock6; + } + + /* Find the TCP control block that corresponds with this packet. */ + tp = intotcpcb(inp); + + /* + * If we can't find the TCP control block (happens occasionaly for a + * packet sent during the shutdown phase of a TCP connection), + * or we're in the timewait state, bail. + */ + if (tp == NULL || inp->inp_flags & INP_TIMEWAIT) { + if (dir == PFIL_IN) + ss->nskip_in_tcpcb++; + else + ss->nskip_out_tcpcb++; + + free(pkt_node, M_SIFTR_PKTNODE); + goto inp_unlock6; + } + + /* Fill in pkt_node data. */ + pkt_node->ip_laddr[0] = inp->in6p_laddr.s6_addr32[0]; + pkt_node->ip_laddr[1] = inp->in6p_laddr.s6_addr32[1]; + pkt_node->ip_laddr[2] = inp->in6p_laddr.s6_addr32[2]; + pkt_node->ip_laddr[3] = inp->in6p_laddr.s6_addr32[3]; + pkt_node->ip_faddr[0] = inp->in6p_faddr.s6_addr32[0]; + pkt_node->ip_faddr[1] = inp->in6p_faddr.s6_addr32[1]; + pkt_node->ip_faddr[2] = inp->in6p_faddr.s6_addr32[2]; + pkt_node->ip_faddr[3] = inp->in6p_faddr.s6_addr32[3]; + pkt_node->tcp_localport = inp->inp_lport; + pkt_node->tcp_foreignport = inp->inp_fport; + pkt_node->snd_cwnd = tp->snd_cwnd; + pkt_node->snd_wnd = tp->snd_wnd; + pkt_node->rcv_wnd = tp->rcv_wnd; + pkt_node->snd_bwnd = tp->snd_bwnd; + pkt_node->snd_ssthresh = tp->snd_ssthresh; + pkt_node->snd_scale = tp->snd_scale; + pkt_node->rcv_scale = tp->rcv_scale; + pkt_node->conn_state = tp->t_state; + pkt_node->max_seg_size = tp->t_maxseg; + pkt_node->smoothed_rtt = tp->t_srtt; + pkt_node->sack_enabled = (tp->t_flags & TF_SACK_PERMIT) != 0; + pkt_node->flags = tp->t_flags; + pkt_node->rxt_length = tp->t_rxtcur; + pkt_node->snd_buf_hiwater = inp->inp_socket->so_snd.sb_hiwat; + pkt_node->snd_buf_cc = inp->inp_socket->so_snd.sb_cc; + pkt_node->rcv_buf_hiwater = inp->inp_socket->so_rcv.sb_hiwat; + pkt_node->rcv_buf_cc = inp->inp_socket->so_rcv.sb_cc; + pkt_node->sent_inflight_bytes = tp->snd_max - tp->snd_una; + + /* We've finished accessing the tcb so release the lock. */ + if (inp_locally_locked) + INP_RUNLOCK(inp); + + /* These are safe to access without the inp lock. */ + pkt_node->ipver = INP_IPV6; + pkt_node->direction = dir; + + /* + * Significantly more accurate than using getmicrotime(), but slower! + * Gives true microsecond resolution at the expense of a hit to + * maximum pps throughput processing when SIFTR is loaded and enabled. + */ + microtime(&(pkt_node->tval)); + + /* XXX: Figure out how to do hash calcs for IPv6 */ + + mtx_lock(&siftr_pkt_queue_mtx); + STAILQ_INSERT_TAIL(&pkt_queue, pkt_node, nodes); + mtx_unlock(&siftr_pkt_queue_mtx); + goto ret6; + +inp_unlock6: + if (inp_locally_locked) + INP_RUNLOCK(inp); + +ret6: + /* Returning 0 ensures pfil will not discard the pkt. */ + return (0); +} +#endif /* #ifdef SIFTR_IPV6 */ + + +static int +siftr_pfil(int action) +{ + struct pfil_head *pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); +#ifdef SIFTR_IPV6 + struct pfil_head *pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); +#endif + + if (action == HOOK) { + pfil_add_hook(siftr_chkpkt, NULL, + PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet); +#ifdef SIFTR_IPV6 + pfil_add_hook(siftr_chkpkt6, NULL, + PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet6); +#endif + } else if (action == UNHOOK) { + pfil_remove_hook(siftr_chkpkt, NULL, + PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet); +#ifdef SIFTR_IPV6 + pfil_remove_hook(siftr_chkpkt6, NULL, + PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet6); +#endif + } + + return (0); +} + + +static int +siftr_sysctl_logfile_name_handler(SYSCTL_HANDLER_ARGS) +{ + struct alq *new_alq; + int error; + + if (req->newptr == NULL) + goto skip; + + /* If old filename and new filename are different. */ + if (strncmp(siftr_logfile, (char *)req->newptr, PATH_MAX)) { + + error = alq_open(&new_alq, req->newptr, curthread->td_ucred, + SIFTR_LOG_FILE_MODE, SIFTR_ALQ_BUFLEN, 0); + + /* Bail if unable to create new alq. */ + if (error) + return (1); + + /* + * If disabled, siftr_alq == NULL so we simply close + * the alq as we've proved it can be opened. + * If enabled, close the existing alq and switch the old + * for the new. + */ + if (siftr_alq == NULL) + alq_close(new_alq); + else { + alq_close(siftr_alq); + siftr_alq = new_alq; + } + } + +skip: + return (sysctl_handle_string(oidp, arg1, arg2, req)); +} + + +static int +siftr_manage_ops(uint8_t action) +{ + struct siftr_stats totalss; + struct timeval tval; + struct flow_hash_node *counter, *tmp_counter; + struct sbuf *s; + int i, key_index, ret, error; + uint32_t bytes_to_write, total_skipped_pkts; + uint16_t lport, fport; + uint8_t *key, ipver; + +#ifdef SIFTR_IPV6 + uint32_t laddr[4]; + uint32_t faddr[4]; +#else + uint8_t laddr[4]; + uint8_t faddr[4]; +#endif + + error = 0; + total_skipped_pkts = 0; + + /* Init an autosizing sbuf that initially holds 200 chars. */ + if ((s = sbuf_new(NULL, NULL, 200, SBUF_AUTOEXTEND)) == NULL) + return (-1); + + if (action == SIFTR_ENABLE) { + /* + * Create our alq + * XXX: We should abort if alq_open fails! + */ + alq_open(&siftr_alq, siftr_logfile, curthread->td_ucred, + SIFTR_LOG_FILE_MODE, SIFTR_ALQ_BUFLEN, 0); + + STAILQ_INIT(&pkt_queue); + + siftr_exit_pkt_manager_thread = 0; + + ret = kthread_add(&siftr_pkt_manager_thread, NULL, NULL, + &siftr_pkt_manager_thr, RFNOWAIT, 0, + "siftr_pkt_manager_thr"); + + siftr_pfil(HOOK); + + microtime(&tval); + + sbuf_printf(s, + "enable_time_secs=%jd\tenable_time_usecs=%06ld\t" + "siftrver=%s\thz=%u\ttcp_rtt_scale=%u\tsysname=%s\t" + "sysver=%u\tipmode=%u\n", + (intmax_t)tval.tv_sec, tval.tv_usec, MODVERSION_STR, hz, + TCP_RTT_SCALE, SYS_NAME, __FreeBSD_version, SIFTR_IPMODE); + + sbuf_finish(s); + alq_writen(siftr_alq, sbuf_data(s), sbuf_len(s), ALQ_WAITOK); + + } else if (action == SIFTR_DISABLE && siftr_pkt_manager_thr != NULL) { + /* + * Remove the pfil hook functions. All threads currently in + * the hook functions are allowed to exit before siftr_pfil() + * returns. + */ + siftr_pfil(UNHOOK); + + /* This will block until the pkt manager thread unlocks it. */ + mtx_lock(&siftr_pkt_mgr_mtx); + + /* Tell the pkt manager thread that it should exit now. */ + siftr_exit_pkt_manager_thread = 1; + + /* + * Wake the pkt_manager thread so it realises that + * siftr_exit_pkt_manager_thread == 1 and exits gracefully. + * The wakeup won't be delivered until we unlock + * siftr_pkt_mgr_mtx so this isn't racy. + */ + wakeup(&wait_for_pkt); + + /* Wait for the pkt_manager thread to exit. */ + mtx_sleep(siftr_pkt_manager_thr, &siftr_pkt_mgr_mtx, PWAIT, + "thrwait", 0); + + siftr_pkt_manager_thr = NULL; + mtx_unlock(&siftr_pkt_mgr_mtx); + + totalss.n_in = DPCPU_SUM(ss, n_in); + totalss.n_out = DPCPU_SUM(ss, n_out); + totalss.nskip_in_malloc = DPCPU_SUM(ss, nskip_in_malloc); + totalss.nskip_out_malloc = DPCPU_SUM(ss, nskip_out_malloc); + totalss.nskip_in_mtx = DPCPU_SUM(ss, nskip_in_mtx); + totalss.nskip_out_mtx = DPCPU_SUM(ss, nskip_out_mtx); + totalss.nskip_in_tcpcb = DPCPU_SUM(ss, nskip_in_tcpcb); + totalss.nskip_out_tcpcb = DPCPU_SUM(ss, nskip_out_tcpcb); + totalss.nskip_in_inpcb = DPCPU_SUM(ss, nskip_in_inpcb); + totalss.nskip_out_inpcb = DPCPU_SUM(ss, nskip_out_inpcb); + + total_skipped_pkts = totalss.nskip_in_malloc + + totalss.nskip_out_malloc + totalss.nskip_in_mtx + + totalss.nskip_out_mtx + totalss.nskip_in_tcpcb + + totalss.nskip_out_tcpcb + totalss.nskip_in_inpcb + + totalss.nskip_out_inpcb; + + microtime(&tval); + + sbuf_printf(s, + "disable_time_secs=%jd\tdisable_time_usecs=%06ld\t" + "num_inbound_tcp_pkts=%ju\tnum_outbound_tcp_pkts=%ju\t" + "total_tcp_pkts=%ju\tnum_inbound_skipped_pkts_malloc=%u\t" + "num_outbound_skipped_pkts_malloc=%u\t" + "num_inbound_skipped_pkts_mtx=%u\t" + "num_outbound_skipped_pkts_mtx=%u\t" + "num_inbound_skipped_pkts_tcpcb=%u\t" + "num_outbound_skipped_pkts_tcpcb=%u\t" + "num_inbound_skipped_pkts_inpcb=%u\t" + "num_outbound_skipped_pkts_inpcb=%u\t" + "total_skipped_tcp_pkts=%u\tflow_list=", + (intmax_t)tval.tv_sec, + tval.tv_usec, + (uintmax_t)totalss.n_in, + (uintmax_t)totalss.n_out, + (uintmax_t)(totalss.n_in + totalss.n_out), + totalss.nskip_in_malloc, + totalss.nskip_out_malloc, + totalss.nskip_in_mtx, + totalss.nskip_out_mtx, + totalss.nskip_in_tcpcb, + totalss.nskip_out_tcpcb, + totalss.nskip_in_inpcb, + totalss.nskip_out_inpcb, + total_skipped_pkts); + + /* + * Iterate over the flow hash, printing a summary of each + * flow seen and freeing any malloc'd memory. + * The hash consists of an array of LISTs (man 3 queue). + */ + for (i = 0; i < siftr_hashmask; i++) { + LIST_FOREACH_SAFE(counter, counter_hash + i, nodes, + tmp_counter) { + key = counter->key; + key_index = 1; + + ipver = key[0]; + + memcpy(laddr, key + key_index, sizeof(laddr)); + key_index += sizeof(laddr); + memcpy(&lport, key + key_index, sizeof(lport)); + key_index += sizeof(lport); + memcpy(faddr, key + key_index, sizeof(faddr)); + key_index += sizeof(faddr); + memcpy(&fport, key + key_index, sizeof(fport)); + +#ifdef SIFTR_IPV6 + laddr[3] = ntohl(laddr[3]); + faddr[3] = ntohl(faddr[3]); + + if (ipver == INP_IPV6) { + laddr[0] = ntohl(laddr[0]); + laddr[1] = ntohl(laddr[1]); + laddr[2] = ntohl(laddr[2]); + faddr[0] = ntohl(faddr[0]); + faddr[1] = ntohl(faddr[1]); + faddr[2] = ntohl(faddr[2]); + + sbuf_printf(s, + "%x:%x:%x:%x:%x:%x:%x:%x;%u-" + "%x:%x:%x:%x:%x:%x:%x:%x;%u,", + UPPER_SHORT(laddr[0]), + LOWER_SHORT(laddr[0]), + UPPER_SHORT(laddr[1]), + LOWER_SHORT(laddr[1]), + UPPER_SHORT(laddr[2]), + LOWER_SHORT(laddr[2]), + UPPER_SHORT(laddr[3]), + LOWER_SHORT(laddr[3]), + ntohs(lport), + UPPER_SHORT(faddr[0]), + LOWER_SHORT(faddr[0]), + UPPER_SHORT(faddr[1]), + LOWER_SHORT(faddr[1]), + UPPER_SHORT(faddr[2]), + LOWER_SHORT(faddr[2]), + UPPER_SHORT(faddr[3]), + LOWER_SHORT(faddr[3]), + ntohs(fport)); + } else { + laddr[0] = FIRST_OCTET(laddr[3]); + laddr[1] = SECOND_OCTET(laddr[3]); + laddr[2] = THIRD_OCTET(laddr[3]); + laddr[3] = FOURTH_OCTET(laddr[3]); + faddr[0] = FIRST_OCTET(faddr[3]); + faddr[1] = SECOND_OCTET(faddr[3]); + faddr[2] = THIRD_OCTET(faddr[3]); + faddr[3] = FOURTH_OCTET(faddr[3]); +#endif + sbuf_printf(s, + "%u.%u.%u.%u;%u-%u.%u.%u.%u;%u,", + laddr[0], + laddr[1], + laddr[2], + laddr[3], + ntohs(lport), + faddr[0], + faddr[1], + faddr[2], + faddr[3], + ntohs(fport)); +#ifdef SIFTR_IPV6 + } +#endif + + free(counter, M_SIFTR_HASHNODE); + } + + LIST_INIT(counter_hash + i); + } + + sbuf_printf(s, "\n"); + sbuf_finish(s); + + i = 0; + do { + bytes_to_write = min(SIFTR_ALQ_BUFLEN, sbuf_len(s)-i); + alq_writen(siftr_alq, sbuf_data(s)+i, bytes_to_write, ALQ_WAITOK); + i += bytes_to_write; + } while (i < sbuf_len(s)); + + alq_close(siftr_alq); + siftr_alq = NULL; + } + + sbuf_delete(s); + + /* + * XXX: Should be using ret to check if any functions fail + * and set error appropriately + */ + + return (error); +} + + +static int +siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS) +{ + if (req->newptr == NULL) + goto skip; + + /* If the value passed in isn't 0 or 1, return an error. */ + if (CAST_PTR_INT(req->newptr) != 0 && CAST_PTR_INT(req->newptr) != 1) + return (1); + + /* If we are changing state (0 to 1 or 1 to 0). */ + if (CAST_PTR_INT(req->newptr) != siftr_enabled ) + if (siftr_manage_ops(CAST_PTR_INT(req->newptr))) { + siftr_manage_ops(SIFTR_DISABLE); + return (1); + } + +skip: + return (sysctl_handle_int(oidp, arg1, arg2, req)); +} + + +static void +siftr_shutdown_handler(void *arg) +{ + siftr_manage_ops(SIFTR_DISABLE); +} + + +/* + * Module is being unloaded or machine is shutting down. Take care of cleanup. + */ +static int +deinit_siftr(void) +{ + /* Cleanup. */ + siftr_manage_ops(SIFTR_DISABLE); + hashdestroy(counter_hash, M_SIFTR, siftr_hashmask); + mtx_destroy(&siftr_pkt_queue_mtx); + mtx_destroy(&siftr_pkt_mgr_mtx); + + return (0); +} + + +/* + * Module has just been loaded into the kernel. + */ +static int +init_siftr(void) +{ + EVENTHANDLER_REGISTER(shutdown_pre_sync, siftr_shutdown_handler, NULL, + SHUTDOWN_PRI_FIRST); + + /* Initialise our flow counter hash table. */ + counter_hash = hashinit(SIFTR_EXPECTED_MAX_TCP_FLOWS, M_SIFTR, + &siftr_hashmask); + + mtx_init(&siftr_pkt_queue_mtx, "siftr_pkt_queue_mtx", NULL, MTX_DEF); + mtx_init(&siftr_pkt_mgr_mtx, "siftr_pkt_mgr_mtx", NULL, MTX_DEF); + + /* Print message to the user's current terminal. */ + uprintf("\nStatistical Information For TCP Research (SIFTR) %s\n" + " http://caia.swin.edu.au/urp/newtcp\n\n", + MODVERSION_STR); + + return (0); +} + + +/* + * This is the function that is called to load and unload the module. + * When the module is loaded, this function is called once with + * "what" == MOD_LOAD + * When the module is unloaded, this function is called twice with + * "what" = MOD_QUIESCE first, followed by "what" = MOD_UNLOAD second + * When the system is shut down e.g. CTRL-ALT-DEL or using the shutdown command, + * this function is called once with "what" = MOD_SHUTDOWN + * When the system is shut down, the handler isn't called until the very end + * of the shutdown sequence i.e. after the disks have been synced. + */ +static int +siftr_load_handler(module_t mod, int what, void *arg) +{ + int ret; + + switch (what) { + case MOD_LOAD: + ret = init_siftr(); + break; + + case MOD_QUIESCE: + case MOD_SHUTDOWN: + ret = deinit_siftr(); + break; + + case MOD_UNLOAD: + ret = 0; + break; + + default: + ret = EINVAL; + break; + } + + return (ret); +} + + +static moduledata_t siftr_mod = { + .name = "siftr", + .evhand = siftr_load_handler, +}; + +/* + * Param 1: name of the kernel module + * Param 2: moduledata_t struct containing info about the kernel module + * and the execution entry point for the module + * Param 3: From sysinit_sub_id enumeration in /usr/include/sys/kernel.h + * Defines the module initialisation order + * Param 4: From sysinit_elem_order enumeration in /usr/include/sys/kernel.h + * Defines the initialisation order of this kld relative to others + * within the same subsystem as defined by param 3 + */ +DECLARE_MODULE(siftr, siftr_mod, SI_SUB_SMP, SI_ORDER_ANY); +MODULE_DEPEND(siftr, alq, 1, 1, 1); +MODULE_VERSION(siftr, MODVERSION);