diff -r 1a8929bdc357 lib/libc/net/getifaddrs.c --- a/lib/libc/net/getifaddrs.c Thu Jan 24 06:03:22 2013 +0800 +++ b/lib/libc/net/getifaddrs.c Wed Feb 06 10:50:21 2013 +0800 @@ -76,7 +76,7 @@ #define HAVE_IFM_DATA #endif -#if _BSDI_VERSION >= 199802 +#if (_BSDI_VERSION >= 199802) || (__FreeBSD_version >= 901000) /* ifam_data is very specific to recent versions of bsdi */ #define HAVE_IFAM_DATA #endif diff -r 1a8929bdc357 sbin/ifconfig/af_inet.c --- a/sbin/ifconfig/af_inet.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sbin/ifconfig/af_inet.c Wed Feb 06 10:50:21 2013 +0800 @@ -84,8 +84,11 @@ if (ifa->ifa_flags & IFF_BROADCAST) { sin = (struct sockaddr_in *)ifa->ifa_broadaddr; if (sin != NULL && sin->sin_addr.s_addr != 0) - printf("broadcast %s", inet_ntoa(sin->sin_addr)); + printf("broadcast %s ", inet_ntoa(sin->sin_addr)); } + + print_vhid(ifa, " "); + putchar('\n'); } diff -r 1a8929bdc357 sbin/ifconfig/af_inet6.c --- a/sbin/ifconfig/af_inet6.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sbin/ifconfig/af_inet6.c Wed Feb 06 10:50:21 2013 +0800 @@ -307,6 +307,8 @@ printf("infty "); } + print_vhid(ifa, " "); + putchar('\n'); } diff -r 1a8929bdc357 sbin/ifconfig/ifcarp.c --- a/sbin/ifconfig/ifcarp.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sbin/ifconfig/ifcarp.c Wed Feb 06 10:50:21 2013 +0800 @@ -1,4 +1,4 @@ -/* $FreeBSD: release/9.1.0/sbin/ifconfig/ifcarp.c 232486 2012-03-04 10:37:26Z remko $ */ +/* $FreeBSD: head/sbin/ifconfig/ifcarp.c 228571 2011-12-16 12:16:56Z glebius $ */ /* from $OpenBSD: ifconfig.c,v 1.82 2003/10/19 05:43:35 mcbride Exp $ */ /* @@ -35,10 +35,11 @@ #include #include -#include #include +#include +#include +#include #include -#include #include #include @@ -52,150 +53,153 @@ static const char *carp_states[] = { CARP_STATES }; -void carp_status(int s); -void setcarp_advbase(const char *,int, int, const struct afswtch *rafp); -void setcarp_advskew(const char *, int, int, const struct afswtch *rafp); -void setcarp_passwd(const char *, int, int, const struct afswtch 
*rafp); -void setcarp_vhid(const char *, int, int, const struct afswtch *rafp); -void setcarp_state(const char *, int, int, const struct afswtch *rafp); +static void carp_status(int s); +static void setcarp_vhid(const char *, int, int, const struct afswtch *rafp); +static void setcarp_callback(int, void *); +static void setcarp_advbase(const char *,int, int, const struct afswtch *rafp); +static void setcarp_advskew(const char *, int, int, const struct afswtch *rafp); +static void setcarp_passwd(const char *, int, int, const struct afswtch *rafp); -void +static int carpr_vhid = -1; +static int carpr_advskew = -1; +static int carpr_advbase = -1; +static int carpr_state = -1; +static unsigned char const *carpr_key; + +static void carp_status(int s) { - const char *state; - struct carpreq carpr; + struct carpreq carpr[CARP_MAXVHID]; + int i; - memset((char *)&carpr, 0, sizeof(struct carpreq)); + bzero(carpr, sizeof(struct carpreq) * CARP_MAXVHID); + carpr[0].carpr_count = CARP_MAXVHID; ifr.ifr_data = (caddr_t)&carpr; if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) return; - if (carpr.carpr_vhid > 0) { - if (carpr.carpr_state > CARP_MAXSTATE) - state = ""; + for (i = 0; i < carpr[0].carpr_count; i++) { + printf("\tcarp: %s vhid %d advbase %d advskew %d", + carp_states[carpr[i].carpr_state], carpr[i].carpr_vhid, + carpr[i].carpr_advbase, carpr[i].carpr_advskew); + if (printkeys && carpr[i].carpr_key[0] != '\0') + printf(" key \"%s\"\n", carpr[i].carpr_key); else - state = carp_states[carpr.carpr_state]; + printf("\n"); + } +} - printf("\tcarp: %s vhid %d advbase %d advskew %d\n", - state, carpr.carpr_vhid, carpr.carpr_advbase, - carpr.carpr_advskew); +static void +setcarp_vhid(const char *val, int d, int s, const struct afswtch *afp) +{ + + carpr_vhid = atoi(val); + + if (carpr_vhid <= 0 || carpr_vhid > CARP_MAXVHID) + errx(1, "vhid must be greater than 0 and less than %u", + CARP_MAXVHID); + + switch (afp->af_af) { +#ifdef INET + case AF_INET: + { + struct in_aliasreq 
*ifra; + + ifra = (struct in_aliasreq *)afp->af_addreq; + ifra->ifra_vhid = carpr_vhid; + break; + } +#endif +#ifdef INET6 + case AF_INET6: + { + struct in6_aliasreq *ifra; + + ifra = (struct in6_aliasreq *)afp->af_addreq; + ifra->ifra_vhid = carpr_vhid; + break; + } +#endif + default: + errx(1, "%s doesn't support carp(4)", afp->af_name); } - return; - + callback_register(setcarp_callback, NULL); } -void -setcarp_passwd(const char *val, int d, int s, const struct afswtch *afp) +static void +setcarp_callback(int s, void *arg __unused) { struct carpreq carpr; - memset((char *)&carpr, 0, sizeof(struct carpreq)); + bzero(&carpr, sizeof(struct carpreq)); + carpr.carpr_vhid = carpr_vhid; + carpr.carpr_count = 1; ifr.ifr_data = (caddr_t)&carpr; - if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) + if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1 && errno != ENOENT) err(1, "SIOCGVH"); - memset(carpr.carpr_key, 0, sizeof(carpr.carpr_key)); - /* XXX Should hash the password into the key here, perhaps? */ - strlcpy(carpr.carpr_key, val, CARP_KEY_LEN); + if (carpr_key != NULL) + /* XXX Should hash the password into the key here? 
*/ + strlcpy(carpr.carpr_key, carpr_key, CARP_KEY_LEN); + if (carpr_advskew > -1) + carpr.carpr_advskew = carpr_advskew; + if (carpr_advbase > -1) + carpr.carpr_advbase = carpr_advbase; + if (carpr_state > -1) + carpr.carpr_state = carpr_state; if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) err(1, "SIOCSVH"); - - return; } -void -setcarp_vhid(const char *val, int d, int s, const struct afswtch *afp) +static void +setcarp_passwd(const char *val, int d, int s, const struct afswtch *afp) { - int vhid; - struct carpreq carpr; - vhid = atoi(val); + if (carpr_vhid == -1) + errx(1, "passwd requires vhid"); - if (vhid <= 0) - errx(1, "vhid must be greater than 0"); - - memset((char *)&carpr, 0, sizeof(struct carpreq)); - ifr.ifr_data = (caddr_t)&carpr; - - if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) - err(1, "SIOCGVH"); - - carpr.carpr_vhid = vhid; - - if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) - err(1, "SIOCSVH"); - - return; + carpr_key = val; } -void +static void setcarp_advskew(const char *val, int d, int s, const struct afswtch *afp) { - int advskew; - struct carpreq carpr; - advskew = atoi(val); + if (carpr_vhid == -1) + errx(1, "advskew requires vhid"); - memset((char *)&carpr, 0, sizeof(struct carpreq)); - ifr.ifr_data = (caddr_t)&carpr; - - if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) - err(1, "SIOCGVH"); - - carpr.carpr_advskew = advskew; - - if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) - err(1, "SIOCSVH"); - - return; + carpr_advskew = atoi(val); } -void +static void setcarp_advbase(const char *val, int d, int s, const struct afswtch *afp) { - int advbase; - struct carpreq carpr; - advbase = atoi(val); + if (carpr_vhid == -1) + errx(1, "advbase requires vhid"); - memset((char *)&carpr, 0, sizeof(struct carpreq)); - ifr.ifr_data = (caddr_t)&carpr; - - if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) - err(1, "SIOCGVH"); - - carpr.carpr_advbase = advbase; - - if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) - err(1, "SIOCSVH"); - - return; + carpr_advbase = atoi(val); } -void 
setcarp_state(const char *val, int d, int s, const struct afswtch *afp) +static void +setcarp_state(const char *val, int d, int s, const struct afswtch *afp) { - struct carpreq carpr; int i; - bzero((char *)&carpr, sizeof(struct carpreq)); - ifr.ifr_data = (caddr_t)&carpr; + if (carpr_vhid == -1) + errx(1, "state requires vhid"); - if (ioctl(s, SIOCGVH, (caddr_t)&ifr) == -1) - err(1, "SIOCGVH"); + for (i = 0; i <= CARP_MAXSTATE; i++) + if (strcasecmp(carp_states[i], val) == 0) { + carpr_state = i; + return; + } - for (i = 0; i <= CARP_MAXSTATE; i++) { - if (!strcasecmp(val, carp_states[i])) { - carpr.carpr_state = i; - break; - } - } - - if (ioctl(s, SIOCSVH, (caddr_t)&ifr) == -1) - err(1, "SIOCSVH"); + errx(1, "unknown state"); } static struct cmd carp_cmds[] = { diff -r 1a8929bdc357 sbin/ifconfig/ifconfig.8 --- a/sbin/ifconfig/ifconfig.8 Thu Jan 24 06:03:22 2013 +0800 +++ b/sbin/ifconfig/ifconfig.8 Wed Feb 06 10:50:21 2013 +0800 @@ -26,9 +26,9 @@ .\" SUCH DAMAGE. .\" .\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94 -.\" $FreeBSD: release/9.1.0/sbin/ifconfig/ifconfig.8 238247 2012-07-08 14:21:36Z bz $ +.\" $FreeBSD: head/sbin/ifconfig/ifconfig.8 228571 2011-12-16 12:16:56Z glebius $ .\" -.Dd May 27, 2012 +.Dd December 16, 2011 .Dt IFCONFIG 8 .Os .Sh NAME @@ -372,32 +372,16 @@ .It Cm name Ar name Set the interface name to .Ar name . -.It Cm rxcsum , txcsum , rxcsum6 , txcsum6 +.It Cm rxcsum , txcsum If the driver supports user-configurable checksum offloading, enable receive (or transmit) checksum offloading on the interface. -The feature can be turned on selectively per protocol family. -Use -.Cm rxcsum6 , txcsum6 -for -.Xr ip6 4 -or -.Cm rxcsum , txcsum -otherwise. Some drivers may not be able to enable these flags independently of each other, so setting one may also set the other. The driver will offload as much checksum work as it can reliably support, the exact level of offloading varies between drivers. 
-.It Fl rxcsum , txcsum , rxcsum6 , txcsum6 +.It Fl rxcsum , txcsum If the driver supports user-configurable checksum offloading, disable receive (or transmit) checksum offloading on the interface. -The feature can be turned off selectively per protocol family. -Use -.Fl rxcsum6 , txcsum6 -for -.Xr ip6 4 -or -.Fl rxcsum , txcsum -otherwise. These settings may not always be independent of each other. .It Cm tso If the driver supports @@ -416,22 +400,6 @@ .Xr ip 4 and .Xr ip6 4 . -.It Cm tso6 , tso4 -If the driver supports -.Xr tcp 4 -segmentation offloading for -.Xr ip6 4 -or -.Xr ip 4 -use one of these to selectively enabled it only for one protocol family. -.It Fl tso6 , tso4 -If the driver supports -.Xr tcp 4 -segmentation offloading for -.Xr ip6 4 -or -.Xr ip 4 -use one of these to selectively disable it only for one protocol family. .It Cm lro If the driver supports .Xr tcp 4 @@ -455,10 +423,10 @@ is a synonym for enabling all available WOL mechanisms. To disable WOL use .Fl wol . -.It Cm vlanmtu , vlanhwtag, vlanhwfilter, vlanhwcsum, vlanhwtso +.It Cm vlanmtu , vlanhwtag, vlanhwfilter, vlanhwtso If the driver offers user-configurable VLAN support, enable reception of extended frames, tag processing in hardware, -frame filtering in hardware, checksum offloading, or TSO on VLAN, +frame filtering in hardware, or TSO on VLAN, respectively. Note that this must be issued on a physical interface associated with .Xr vlan 4 , @@ -707,7 +675,7 @@ .It Cm -ifdisabled Clear a flag .Cm ifdisabled . -When this flag is cleared and +When this flag is cleared and .Cm auto_linklocal flag is enabled, automatic configuration of a link-local address is performed. @@ -738,7 +706,7 @@ .Ar mode is one of .Cm sta , -.Cm ahdemo +.Cm ahdemo (or .Cm adhoc-demo ), .Cm ibss , @@ -776,7 +744,7 @@ Mark a .Cm wds device as operating in ``legacy mode''. -Legacy +Legacy .Cm wds devices have a fixed peer relationship and do not, for example, roam if their peer stops communicating. 
@@ -792,9 +760,9 @@ track received beacons. To have beacons tracked in software use .Fl beacons . -For +For .Cm hostap -mode +mode .Fl beacons can also be used to indicate no beacons should be transmitted; this can be useful when creating a WDS configuration but @@ -929,7 +897,7 @@ .Ar interval parameter is specified in seconds. By default a background scan is considered every 300 seconds (5 minutes). -The +The .Ar interval may not be set to less than 15 seconds. .It Cm bintval Ar interval @@ -1030,19 +998,19 @@ .Cm t (Atheros Dynamic Turbo mode, or appended to ``st'' and ``dt''). The full set of channel widths following a '/' are: -.Cm 5 +.Cm 5 (5MHz aka quarter-rate channel), -.Cm 10 +.Cm 10 (10MHz aka half-rate channel), -.Cm 20 +.Cm 20 (20MHz mostly for use in specifying ht20), and -.Cm 40 +.Cm 40 (40MHz mostly for use in specifying ht40). In addition, a 40MHz HT channel specification may include the location of the extension channel by appending ``+'' or ``-'' for above and below, -respectively; e.g. ``2437:ht/40+'' specifies 40MHz wide HT operation +respectively; e.g. ``2437:ht/40+'' specifies 40MHz wide HT operation with the center channel at frequency 2437 and the extension channel above. .It Cm country Ar name Set the country code to use in calculating the regulatory constraints @@ -1068,7 +1036,7 @@ DFS embodies several facilities including detection of overlapping radar signals, dynamic transmit power control, and channel selection according to a least-congested criteria. -DFS support is mandatory for some 5GHz frequencies in certain +DFS support is mandatory for some 5Ghz frequencies in certain locales (e.g. ETSI). By default DFS is enabled according to the regulatory definitions specified in /etc/regdomain.xml and the current country code, regdomain, @@ -1120,6 +1088,38 @@ specifies the number of beacon intervals between DTIM and must be in the range 1 to 15. By default DTIM is 1 (i.e., DTIM occurs at each beacon). 
+.It Cm quiet +Enable the use of quiet IE. Hostap will use this to silence other +stations to reduce interference for radar detection when +operating on a 5GHz frequency and doth support is enabled. +Use +.Fl quiet +to disable this functionality. +.It Cm quiet_period Ar period +Set the QUIET +.Ar period +to the number of beacon intervals between the start of regularly +scheduled quiet intervals defined by the Quiet element. +.It Cm quiet_count Ar count +Set the QUIET +.Ar count +to the number of TBTTs until the beacon interval during which the +next quiet interval shall start. A value of 1 indicates the quiet +interval will start during the beacon interval starting at the next +TBTT. A value of 0 is reserved. +.It Cm quiet_offset Ar offset +Set the QUIET +.Ar offset +to the offset of the start of the quiet interval from the TBTT +specified by the Quiet count, expressed in TUs. +The value of the +.Ar offset +shall be less than one beacon interval. +.It Cm quiet_duration Ar dur +Set the QUIET +.Ar dur +to the duration of the Quiet interval, expressed in TUs. +The value should be less than one beacon interval. .It Cm dturbo Enable the use of Atheros Dynamic Turbo mode when communicating with another Dynamic Turbo-capable station. @@ -2021,7 +2021,7 @@ discover a path to us. .El By default -.Cm hwmprootmode +.Cm hwmprootmode is set to .Ar DISABLED . .It Cm hwmpmaxhops Ar cnt @@ -2106,7 +2106,7 @@ .It Cm maxaddr Ar size Set the size of the bridge address cache to .Ar size . -The default is 2000 entries. +The default is 100 entries. .It Cm timeout Ar seconds Set the timeout of address cache entries to .Ar seconds @@ -2114,7 +2114,7 @@ If .Ar seconds is zero, then address cache entries will not be expired. -The default is 1200 seconds. +The default is 240 seconds. .It Cm addr Display the addresses that have been learned by the bridge. .It Cm static Ar interface-name Ar address @@ -2309,21 +2309,6 @@ The default is failover.
The available options are failover, fec, lacp, loadbalance, roundrobin and none. -.It Cm lagghash Ar option Ns Oo , Ns Ar option Oc -Set the packet layers to hash for aggregation protocols which load balance. -The default is -.Dq l2,l3,l4 . -The options can be combined using commas. -.Pp -.Bl -tag -width ".Cm l2" -compact -.It Cm l2 -src/dst mac address and optional vlan number. -.It Cm l3 -src/dst address for IPv4 or IPv6. -.It Cm l4 -src/dst port for TCP/UDP/SCTP. -.El -.Pp .El .Pp The following parameters are specific to IP tunnel interfaces, @@ -2460,16 +2445,36 @@ argument is useless and hence deprecated. .El .Pp -The following parameters are specific to +The following parameters are used to configure the .Xr carp 4 -interfaces: +protocol on an interface: .Bl -tag -width indent +.It Cm vhid Ar n +Set the virtual host ID. +This is a required setting to initiate +.Xr carp 4 . +If the virtual host ID doesn't exist yet, it is created and attached to the +interface; otherwise the configuration of the existing vhid is adjusted. +If the +.Cm vhid +keyword is supplied along with an +.Dq inet6 +or +.Dq inet +address, then this address is configured to be run under control of the +specified vhid. +Whenever the last address that refers to a particular vhid is removed from an +interface, the vhid is automatically removed from the interface and destroyed. +Any other configuration parameters for the +.Xr carp 4 +protocol should be supplied along with the +.Cm vhid +keyword. +Acceptable values for vhid are 1 to 255. .It Cm advbase Ar seconds Specifies the base of the advertisement interval in seconds. The acceptable values are 1 to 255. The default value is 1. -.\" The default value is -.\" .Dv CARP_DFLTINTV . .It Cm advskew Ar interval Specifies the skew to add to the base advertisement interval to make one host advertise slower than another host. @@ -2479,17 +2484,8 @@ .It Cm pass Ar phrase Set the authentication key to .Ar phrase . -.It Cm vhid Ar n -Set the virtual host ID.
-This is a required setting. -Acceptable values are 1 to 255. -.It Cm state Ar state -Force the interface into state -.Ar state . -Valid states are INIT, BACKUP, and MASTER. Note that manually setting the state -to INIT is ignored by -.Xr carp 4 . -This state is set automatically when the underlying interface is down. +.It Cm state Ar MASTER|BACKUP +Forcibly change state of a given vhid. .El .Pp The @@ -2552,11 +2548,12 @@ .Fl k flag causes keying information for the interface, if available, to be printed. -For example, the values of 802.11 WEP keys will be printed, if accessible to -the current user. +For example, the values of 802.11 WEP keys and +.Xr carp 4 +passphrases will be printed, if accessible to the current user. This information is not printed by default, as it may be considered sensitive. -.Pp +.Pp If the network interface driver is not present in the kernel then .Nm will attempt to load it. @@ -2615,6 +2612,11 @@ .Fl alias : .Dl # ifconfig em0 inet6 2001:db8:bdbd::123/48 delete .Pp +Configure a single CARP redundant address on igb0, and then switch it +to be master: +.Dl # ifconfig igb0 vhid 1 10.0.0.1/24 pass foobar +.Dl # ifconfig igb0 vhid 1 state master +.Pp Configure the interface .Li xl0 , to use 100baseTX, full duplex Ethernet media options: diff -r 1a8929bdc357 sbin/ifconfig/ifconfig.c --- a/sbin/ifconfig/ifconfig.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sbin/ifconfig/ifconfig.c Wed Feb 06 10:50:21 2013 +0800 @@ -1079,6 +1079,21 @@ } void +print_vhid(const struct ifaddrs *ifa, const char *s) +{ + struct if_data *ifd; + + if (ifa->ifa_data == NULL) + return; + + ifd = ifa->ifa_data; + if (ifd->ifi_vhid == 0) + return; + + printf("vhid %d ", ifd->ifi_vhid); +} + +void ifmaybeload(const char *name) { #define MOD_PREFIX_LEN 3 /* "if_" */ diff -r 1a8929bdc357 sbin/ifconfig/ifconfig.h --- a/sbin/ifconfig/ifconfig.h Thu Jan 24 06:03:22 2013 +0800 +++ b/sbin/ifconfig/ifconfig.h Wed Feb 06 10:50:21 2013 +0800 @@ -148,3 +148,6 @@ * operations on ifmedia 
can avoid cmd line ordering confusion. */ struct ifmediareq *ifmedia_getstate(int s); + +void print_vhid(const struct ifaddrs *, const char *); + diff -r 1a8929bdc357 share/man/man4/carp.4 --- a/share/man/man4/carp.4 Thu Jan 24 06:03:22 2013 +0800 +++ b/share/man/man4/carp.4 Wed Feb 06 10:50:21 2013 +0800 @@ -1,6 +1,7 @@ .\" $OpenBSD: carp.4,v 1.16 2004/12/07 23:41:35 jmc Exp $ .\" .\" Copyright (c) 2003, Ryan McBride. All rights reserved. +.\" Copyright (c) 2011, Gleb Smirnoff .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions @@ -25,7 +26,7 @@ .\" .\" $FreeBSD: release/9.1.0/share/man/man4/carp.4 237216 2012-06-18 04:55:07Z eadler $ .\" -.Dd August 15, 2011 +.Dd December 16, 2011 .Dt CARP 4 .Os .Sh NAME @@ -34,33 +35,17 @@ .Sh SYNOPSIS .Cd "device carp" .Sh DESCRIPTION -The -.Nm -interface is a pseudo-device that implements and controls the -CARP protocol. -CARP allows multiple hosts on the same local network to share a set of IP addresses. +The CARP allows multiple hosts on the same local network to share a set of +IPv4 and/or IPv6 addresses. Its primary purpose is to ensure that these -addresses are always available, but in some configurations -.Nm -can also provide load balancing functionality. -.Pp -A -.Nm -interface can be created at runtime using the -.Nm ifconfig Li carp Ns Ar N Cm create -command or by configuring -it via -.Va cloned_interfaces -in the -.Pa /etc/rc.conf -file. +addresses are always available. .Pp To use .Nm , -the administrator needs to configure at minimum a common virtual host ID (VHID) -and virtual host IP address on each machine which is to take part in the virtual -group. -Additional parameters can also be set on a per-interface basis: +the administrator needs to configure at minimum a common virtual host ID +(vhid) and attach at least one IP address to this vhid on each machine which +is to take part in the virtual group. 
+Additional parameters can also be set on a per-vhid basis: .Cm advbase and .Cm advskew , @@ -93,9 +78,20 @@ .Dv SIOCSVH .Xr ioctl 2 . .Pp +CARP virtual hosts can be configured on multicast capable interfaces: Ethernet, +layer 2 VLAN, FDDI and Token Ring. +An arbitrary number of virtual host IDs can be configured on an interface. +An arbitrary number of IPv4 or IPv6 addresses can be attached to a particular +vhid. +It is important that all hosts participating in a vhid have the same list +of prefixes configured on the vhid, since all prefixes are included in the +cryptographic checksum supplied in each advertisement. +Multiple vhids running on one interface participate in master/backup +elections independently. +.Pp Additionally, there are a number of global parameters which can be set using .Xr sysctl 8 : -.Bl -tag -width ".Va net.inet.carp.arpbalance" +.Bl -tag -width ".Va net.inet.carp.preempt" .It Va net.inet.carp.allow Accept incoming .Nm @@ -125,9 +121,6 @@ .Nm packets. Default value is 1. -.It Va net.inet.carp.arpbalance -Balance local traffic using ARP (see below). -Disabled by default. .It Va net.inet.carp.suppress_preempt A read only value showing the status of preemption suppression. Preemption can be suppressed if link on an interface is down @@ -138,36 +131,36 @@ problems are detected. Every problem increments suppression counter. .El -.Sh ARP level load balancing -The -.Nm -has limited abilities for load balancing the incoming connections -between hosts in Ethernet network. -For load balancing operation, one needs several CARP interfaces that -are configured to the same IP address, but to a different VHIDs. -Once an ARP request is received, the CARP protocol will use a hashing -function against the source IP address in the ARP request to determine -which VHID should this request belong to. -If the corresponding CARP interface is in master state, the ARP request -will be replied, otherwise it will be ignored. 
-See the -.Sx EXAMPLES -section for a practical example of load balancing. -.Pp -The ARP load balancing has some limitations. -First, ARP balancing only works on the local network segment. -It cannot balance traffic that crosses a router, because the -router itself will always be balanced to the same virtual host. -Second, ARP load balancing can lead to asymmetric routing -of incoming and outgoing traffic, and thus combining it with -.Xr pfsync 4 -is dangerous, because this creates a race condition between -balanced routers and a host they are serving. -Imagine an incoming packet creating state on the first router, being -forwarded to its destination, and destination replying faster -than the state information is packed and synced with the second router. -If the reply would be load balanced to second router, it will be -dropped due to no state. +.\".Sh ARP level load balancing +.\"The +.\".Nm +.\"has limited abilities for load balancing the incoming connections +.\"between hosts in Ethernet network. +.\"For load balancing operation, one needs several CARP interfaces that +.\"are configured to the same IP address, but to a different vhids. +.\"Once an ARP request is received, the CARP protocol will use a hashing +.\"function against the source IP address in the ARP request to determine +.\"which vhid should this request belong to. +.\"If the corresponding CARP interface is in master state, the ARP request +.\"will be replied, otherwise it will be ignored. +.\"See the +.\".Sx EXAMPLES +.\"section for a practical example of load balancing. +.\".Pp +.\"The ARP load balancing has some limitations. +.\"First, ARP balancing only works on the local network segment. +.\"It cannot balance traffic that crosses a router, because the +.\"router itself will always be balanced to the same virtual host. 
+.\"Second, ARP load balancing can lead to asymmetric routing +.\"of incoming and outgoing traffic, and thus combining it with +.\".Xr pfsync 4 +.\"is dangerous, because this creates a race condition between +.\"balanced routers and a host they are serving. +.\"Imagine an incoming packet creating state on the first router, being +.\"forwarded to its destination, and destination replying faster +.\"than the state information is packed and synced with the second router. +.\"If the reply would be load balanced to second router, it will be +.\"dropped due to no state. .Sh STATE CHANGE NOTIFICATIONS Sometimes it is useful to get notified about .Nm @@ -175,13 +168,10 @@ This can be accomplished by using .Xr devd 8 hooks. -Master/slave events are signalled as -.Nm -interface -.Dv LINK_UP -or -.Dv LINK_DOWN -event. +Master/slave events are signalled under system +.Dv CARP . +Subsystem specifies the vhid and the name of the interface where the event occurred. +Type of the message displays the new state of the vhid. Please see .Xr devd.conf 5 and @@ -197,23 +187,19 @@ .Pp .Dl sysctl net.inet.carp.preempt=1 .Pp -Assume that host A is the preferred master and 192.168.1.x/24 is -configured on one physical interface and 192.168.2.y/24 on another. +Assume that host A is the preferred master and we are running the +192.168.1.0/24 prefix on em0 and 192.168.2.0/24 on em1.
This is the setup for host A: .Bd -literal -offset indent -ifconfig carp0 create -ifconfig carp0 vhid 1 pass mekmitasdigoat 192.168.1.1/24 -ifconfig carp1 create -ifconfig carp1 vhid 2 pass mekmitasdigoat 192.168.2.1/24 +ifconfig em0 vhid 1 pass mekmitasdigoat 192.168.1.1/24 +ifconfig em1 vhid 2 pass mekmitasdigoat 192.168.2.1/24 .Ed .Pp The setup for host B is identical, but it has a higher .Cm advskew : .Bd -literal -offset indent -ifconfig carp0 create -ifconfig carp0 vhid 1 advskew 100 pass mekmitasdigoat 192.168.1.1/24 -ifconfig carp1 create -ifconfig carp1 vhid 2 advskew 100 pass mekmitasdigoat 192.168.2.1/24 +ifconfig em0 vhid 1 advskew 100 pass mekmitasdigoat 192.168.1.1/24 +ifconfig em1 vhid 2 advskew 100 pass mekmitasdigoat 192.168.2.1/24 .Ed .Pp Because of the preempt option, when one of the physical interfaces of @@ -224,67 +210,60 @@ interfaces. This will cause host B to preempt on both interfaces instead of just the failed one. -.Pp -In order to set up an ARP balanced virtual host, it is necessary to configure -one virtual host for each physical host which would respond to ARP requests -and thus handle the traffic. -In the following example, two virtual hosts are configured on two hosts to -provide balancing and failover for the IP address 192.168.1.10. -.Pp -First the -.Nm -interfaces on host A are configured. -The -.Cm advskew -of 100 on the second virtual host means that its advertisements will be sent -out slightly less frequently. -.Bd -literal -offset indent -ifconfig carp0 create -ifconfig carp0 vhid 1 pass mekmitasdigoat 192.168.1.10/24 -ifconfig carp1 create -ifconfig carp1 vhid 2 advskew 100 pass mekmitasdigoat 192.168.1.10/24 -.Ed -.Pp -The configuration for host B is identical, except the -.Cm advskew -is on virtual host 1 rather than virtual host 2. 
-.Bd -literal -offset indent -ifconfig carp0 create -ifconfig carp0 vhid 1 advskew 100 pass mekmitasdigoat 192.168.1.10/24 -ifconfig carp1 create -ifconfig carp1 vhid 2 pass mekmitasdigoat 192.168.1.10/24 -.Ed -.Pp -Finally, the ARP balancing feature must be enabled on both hosts: -.Pp -.Dl sysctl net.inet.carp.arpbalance=1 -.Pp -When the hosts receive an ARP request for 192.168.1.10, the source IP address -of the request is used to compute which virtual host should answer the request. -The host which is master of the selected virtual host will reply to the -request, the other(s) will ignore it. -.Pp -This way, locally connected systems will receive different ARP replies and -subsequent IP traffic will be balanced among the hosts. -If one of the hosts fails, the other will take over the virtual MAC address, -and begin answering ARP requests on its behalf. +.\".Pp +.\"In order to set up an ARP balanced virtual host, it is necessary to configure +.\"one virtual host for each physical host which would respond to ARP requests +.\"and thus handle the traffic. +.\"In the following example, two virtual hosts are configured on two hosts to +.\"provide balancing and failover for the IP address 192.168.1.10. +.\".Pp +.\"First the +.\".Nm +.\"interfaces on host A are configured. +.\"The +.\".Cm advskew +.\"of 100 on the second virtual host means that its advertisements will be sent +.\"out slightly less frequently. +.\".Bd -literal -offset indent +.\"ifconfig carp0 create +.\"ifconfig carp0 vhid 1 pass mekmitasdigoat 192.168.1.10/24 +.\"ifconfig carp1 create +.\"ifconfig carp1 vhid 2 advskew 100 pass mekmitasdigoat 192.168.1.10/24 +.\".Ed +.\".Pp +.\"The configuration for host B is identical, except the +.\".Cm advskew +.\"is on virtual host 1 rather than virtual host 2. 
+.\".Bd -literal -offset indent +.\"ifconfig carp0 create +.\"ifconfig carp0 vhid 1 advskew 100 pass mekmitasdigoat 192.168.1.10/24 +.\"ifconfig carp1 create +.\"ifconfig carp1 vhid 2 pass mekmitasdigoat 192.168.1.10/24 +.\".Ed +.\".Pp +.\"Finally, the ARP balancing feature must be enabled on both hosts: +.\".Pp +.\".Dl sysctl net.inet.carp.arpbalance=1 +.\".Pp +.\"When the hosts receive an ARP request for 192.168.1.10, the source IP address +.\"of the request is used to compute which virtual host should answer the request. +.\"The host which is master of the selected virtual host will reply to the +.\"request, the other(s) will ignore it. +.\".Pp +.\"This way, locally connected systems will receive different ARP replies and +.\"subsequent IP traffic will be balanced among the hosts. +.\"If one of the hosts fails, the other will take over the virtual MAC address, +.\"and begin answering ARP requests on its behalf. .Pp Processing of .Nm -status change events can be set up by using the following devd.conf rules: +status change events can be set up by using the following devd.conf rule: .Bd -literal -offset indent notify 0 { - match "system" "IFNET"; - match "type" "LINK_UP"; - match "subsystem" "carp*"; - action "/root/carpcontrol.sh $type $subsystem"; -}; - -notify 0 { - match "system" "IFNET"; - match "type" "LINK_DOWN"; - match "subsystem" "carp*"; - action "/root/carpcontrol.sh $type $subsystem"; + match "system" "CARP"; + match "subsystem" "[0-9]+@"; + match "type" "(MASTER|BACKUP)"; + action "/root/carpcontrol.sh $subsystem $type"; }; .Ed .Sh SEE ALSO @@ -303,3 +282,8 @@ .Nm device was imported into .Fx 5.4 . +In +.Fx 10 +the +.Nm +was significantly rewritten, and is no longer a pseudo-interface. diff -r 1a8929bdc357 sys/net/if.c --- a/sys/net/if.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/net/if.c Wed Feb 06 10:50:21 2013 +0800 @@ -130,17 +130,19 @@ /* These are external hooks for CARP. 
*/ void (*carp_linkstate_p)(struct ifnet *ifp); #if defined(INET) || defined(INET6) -struct ifnet *(*carp_forus_p)(struct ifnet *ifp, u_char *dhost); +int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *sa, struct rtentry *rt); + struct sockaddr *sa); +int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); +int (*carp_attach_p)(struct ifaddr *, int); +void (*carp_detach_p)(struct ifaddr *); #endif #ifdef INET -int (*carp_iamatch_p)(struct ifnet *, struct in_ifaddr *, struct in_addr *, - u_int8_t **); +int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); -caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, +caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif @@ -2521,6 +2523,16 @@ error = if_getgroupmembers((struct ifgroupreq *)data); CURVNET_RESTORE(); return (error); +#if defined(INET) || defined(INET6) + case SIOCSVH: + case SIOCGVH: + if (carp_ioctl_p == NULL) + error = EPROTONOSUPPORT; + else + error = (*carp_ioctl_p)(ifr, cmd, td); + CURVNET_RESTORE(); + return (error); +#endif } ifp = ifunit_ref(ifr->ifr_name); diff -r 1a8929bdc357 sys/net/if.h --- a/sys/net/if.h Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/net/if.h Wed Feb 06 10:50:21 2013 +0800 @@ -85,7 +85,7 @@ u_char ifi_addrlen; /* media address length */ u_char ifi_hdrlen; /* media header length */ u_char ifi_link_state; /* current link state */ - u_char ifi_spare_char1; /* spare byte */ + u_char ifi_vhid; /* carp vhid */ u_char ifi_spare_char2; /* spare byte */ u_char ifi_datalen; /* length of this data struct */ u_long ifi_mtu; /* maximum transmission unit */ @@ -298,6 +298,8 @@ int ifam_flags; /* value of ifa_flags */ u_short ifam_index; /* index for associated ifp */ int ifam_metric; /* value of ifa_metric */ + struct if_data ifam_data;/* statistics and other data 
about if or + * address */ }; /* @@ -415,6 +417,7 @@ struct sockaddr ifra_addr; struct sockaddr ifra_broadaddr; struct sockaddr ifra_mask; + int ifra_vhid; }; struct ifmediareq { diff -r 1a8929bdc357 sys/net/if_ethersubr.c --- a/sys/net/if_ethersubr.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/net/if_ethersubr.c Wed Feb 06 10:50:21 2013 +0800 @@ -397,7 +397,7 @@ #if defined(INET) || defined(INET6) if (ifp->if_carp && - (error = (*carp_output_p)(ifp, m, dst, NULL))) + (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif diff -r 1a8929bdc357 sys/net/if_types.h --- a/sys/net/if_types.h Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/net/if_types.h Wed Feb 06 10:50:21 2013 +0800 @@ -250,6 +250,5 @@ #define IFT_ENC 0xf4 #define IFT_PFLOG 0xf6 #define IFT_PFSYNC 0xf7 -#define IFT_CARP 0xf8 /* Common Address Redundancy Protocol */ #define IFT_IPXIP 0xf9 /* IPX over IP tunneling; no longer used. */ #endif /* !_NET_IF_TYPES_H_ */ diff -r 1a8929bdc357 sys/net/if_var.h --- a/sys/net/if_var.h Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/net/if_var.h Wed Feb 06 10:50:21 2013 +0800 @@ -69,6 +69,7 @@ struct socket; struct ether_header; struct carp_if; +struct carp_softc; struct ifvlantrunk; struct route; struct vnet; @@ -735,6 +736,7 @@ struct sockaddr *ifa_netmask; /* used to determine subnet */ struct if_data if_data; /* not all members are meaningful */ struct ifnet *ifa_ifp; /* back-pointer to interface */ + struct carp_softc *ifa_carp; /* pointer to CARP data */ TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, struct rt_addrinfo *); diff -r 1a8929bdc357 sys/net/rtsock.c --- a/sys/net/rtsock.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/net/rtsock.c Wed Feb 06 10:50:21 2013 +0800 @@ -27,7 +27,7 @@ * SUCH DAMAGE. 
* * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 - * $FreeBSD: release/9.1.0/sys/net/rtsock.c 235055 2012-05-05 11:33:48Z melifaro $ + * $FreeBSD: head/sys/net/rtsock.c 228571 2011-12-16 12:16:56Z glebius $ */ #include "opt_compat.h" #include "opt_sctp.h" @@ -63,6 +63,7 @@ #include #include +#include #ifdef INET6 #include #endif @@ -83,7 +84,7 @@ uint8_t ifi_addrlen; uint8_t ifi_hdrlen; uint8_t ifi_link_state; - uint8_t ifi_spare_char1; + uint8_t ifi_vhid; uint8_t ifi_spare_char2; uint8_t ifi_datalen; uint32_t ifi_mtu; @@ -114,34 +115,7 @@ uint16_t ifm_index; struct if_data32 ifm_data; }; - -struct if_msghdrl32 { - uint16_t ifm_msglen; - uint8_t ifm_version; - uint8_t ifm_type; - int32_t ifm_addrs; - int32_t ifm_flags; - uint16_t ifm_index; - uint16_t _ifm_spare1; - uint16_t ifm_len; - uint16_t ifm_data_off; - struct if_data32 ifm_data; -}; - -struct ifa_msghdrl32 { - uint16_t ifam_msglen; - uint8_t ifam_version; - uint8_t ifam_type; - int32_t ifam_addrs; - int32_t ifam_flags; - uint16_t ifam_index; - uint16_t _ifam_spare1; - uint16_t ifam_len; - uint16_t ifam_data_off; - int32_t ifam_metric; - struct if_data32 ifam_data; -}; -#endif /* COMPAT_FREEBSD32 */ +#endif MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); @@ -149,6 +123,9 @@ static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; +/* These are external hooks for CARP. */ +int (*carp_get_vhid_p)(struct ifaddr *); + /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. @@ -170,7 +147,7 @@ #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) -SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, ""); +static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, ""); struct walkarg { int w_tmemsize; @@ -1037,9 +1014,6 @@ return (0); } -/* - * Used by the routing socket. 
- */ static struct mbuf * rt_msg1(int type, struct rt_addrinfo *rtinfo) { @@ -1107,9 +1081,6 @@ return (m); } -/* - * Used by the sysctl code and routing socket. - */ static int rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w) { @@ -1123,31 +1094,17 @@ case RTM_DELADDR: case RTM_NEWADDR: - if (w != NULL && w->w_op == NET_RT_IFLISTL) { -#ifdef COMPAT_FREEBSD32 - if (w->w_req->flags & SCTL_MASK32) - len = sizeof(struct ifa_msghdrl32); - else -#endif - len = sizeof(struct ifa_msghdrl); - } else - len = sizeof(struct ifa_msghdr); + len = sizeof(struct ifa_msghdr); break; case RTM_IFINFO: #ifdef COMPAT_FREEBSD32 if (w != NULL && w->w_req->flags & SCTL_MASK32) { - if (w->w_op == NET_RT_IFLISTL) - len = sizeof(struct if_msghdrl32); - else - len = sizeof(struct if_msghdr32); + len = sizeof(struct if_msghdr32); break; } #endif - if (w != NULL && w->w_op == NET_RT_IFLISTL) - len = sizeof(struct if_msghdrl); - else - len = sizeof(struct if_msghdr); + len = sizeof(struct if_msghdr); break; case RTM_NEWMADDR: @@ -1555,6 +1512,7 @@ CP(*src, *dst, ifi_addrlen); CP(*src, *dst, ifi_hdrlen); CP(*src, *dst, ifi_link_state); + CP(*src, *dst, ifi_vhid); dst->ifi_datalen = sizeof(struct if_data32); CP(*src, *dst, ifi_mtu); CP(*src, *dst, ifi_metric); @@ -1577,127 +1535,6 @@ #endif static int -sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info, - struct walkarg *w, int len) -{ - struct if_msghdrl *ifm; - -#ifdef COMPAT_FREEBSD32 - if (w->w_req->flags & SCTL_MASK32) { - struct if_msghdrl32 *ifm32; - - ifm32 = (struct if_msghdrl32 *)w->w_tmem; - ifm32->ifm_addrs = info->rti_addrs; - ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; - ifm32->ifm_index = ifp->if_index; - ifm32->_ifm_spare1 = 0; - ifm32->ifm_len = sizeof(*ifm32); - ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); - - copy_ifdata32(&ifp->if_data, &ifm32->ifm_data); - - return (SYSCTL_OUT(w->w_req, (caddr_t)ifm32, len)); - } -#endif - ifm = (struct if_msghdrl 
*)w->w_tmem; - ifm->ifm_addrs = info->rti_addrs; - ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; - ifm->ifm_index = ifp->if_index; - ifm->_ifm_spare1 = 0; - ifm->ifm_len = sizeof(*ifm); - ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); - - ifm->ifm_data = ifp->if_data; - - return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); -} - -static int -sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info, - struct walkarg *w, int len) -{ - struct if_msghdr *ifm; - -#ifdef COMPAT_FREEBSD32 - if (w->w_req->flags & SCTL_MASK32) { - struct if_msghdr32 *ifm32; - - ifm32 = (struct if_msghdr32 *)w->w_tmem; - ifm32->ifm_addrs = info->rti_addrs; - ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; - ifm32->ifm_index = ifp->if_index; - - copy_ifdata32(&ifp->if_data, &ifm32->ifm_data); - - return (SYSCTL_OUT(w->w_req, (caddr_t)ifm32, len)); - } -#endif - ifm = (struct if_msghdr *)w->w_tmem; - ifm->ifm_addrs = info->rti_addrs; - ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; - ifm->ifm_index = ifp->if_index; - - ifm->ifm_data = ifp->if_data; - - return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); -} - -static int -sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, - struct walkarg *w, int len) -{ - struct ifa_msghdrl *ifam; - -#ifdef COMPAT_FREEBSD32 - if (w->w_req->flags & SCTL_MASK32) { - struct ifa_msghdrl32 *ifam32; - - ifam32 = (struct ifa_msghdrl32 *)w->w_tmem; - ifam32->ifam_addrs = info->rti_addrs; - ifam32->ifam_flags = ifa->ifa_flags; - ifam32->ifam_index = ifa->ifa_ifp->if_index; - ifam32->_ifam_spare1 = 0; - ifam32->ifam_len = sizeof(*ifam32); - ifam32->ifam_data_off = - offsetof(struct ifa_msghdrl32, ifam_data); - ifam32->ifam_metric = ifa->ifa_metric; - - copy_ifdata32(&ifa->ifa_ifp->if_data, &ifam32->ifam_data); - - return (SYSCTL_OUT(w->w_req, (caddr_t)ifam32, len)); - } -#endif - - ifam = (struct ifa_msghdrl *)w->w_tmem; - ifam->ifam_addrs = info->rti_addrs; - ifam->ifam_flags = ifa->ifa_flags; - ifam->ifam_index = 
ifa->ifa_ifp->if_index; - ifam->_ifam_spare1 = 0; - ifam->ifam_len = sizeof(*ifam); - ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); - ifam->ifam_metric = ifa->ifa_metric; - - ifam->ifam_data = ifa->if_data; - - return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); -} - -static int -sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, - struct walkarg *w, int len) -{ - struct ifa_msghdr *ifam; - - ifam = (struct ifa_msghdr *)w->w_tmem; - ifam->ifam_addrs = info->rti_addrs; - ifam->ifam_flags = ifa->ifa_flags; - ifam->ifam_index = ifa->ifa_ifp->if_index; - ifam->ifam_metric = ifa->ifa_metric; - - return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); -} - -static int sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; @@ -1716,10 +1553,38 @@ len = rt_msg2(RTM_IFINFO, &info, NULL, w); info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { - if (w->w_op == NET_RT_IFLISTL) - error = sysctl_iflist_ifml(ifp, &info, w, len); - else - error = sysctl_iflist_ifm(ifp, &info, w, len); + struct if_msghdr *ifm; + +#ifdef COMPAT_FREEBSD32 + if (w->w_req->flags & SCTL_MASK32) { + struct if_msghdr32 *ifm32; + + ifm32 = (struct if_msghdr32 *)w->w_tmem; + ifm32->ifm_index = ifp->if_index; + ifm32->ifm_flags = ifp->if_flags | + ifp->if_drv_flags; + copy_ifdata32(&ifp->if_data, &ifm32->ifm_data); + if (carp_get_vhid_p != NULL) + ifm32->ifm_data.ifi_vhid = + (*carp_get_vhid_p)(ifa); + ifm32->ifm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)ifm32, + len); + goto sysctl_out; + } +#endif + ifm = (struct if_msghdr *)w->w_tmem; + ifm->ifm_index = ifp->if_index; + ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; + ifm->ifm_data = ifp->if_data; + if (carp_get_vhid_p != NULL) + ifm->ifm_data.ifi_vhid = + (*carp_get_vhid_p)(ifa); + ifm->ifm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)ifm, len); +#ifdef COMPAT_FREEBSD32 + sysctl_out: +#endif if (error) goto done; } @@ -1734,12 +1599,17 @@ info.rti_info[RTAX_BRD] = 
ifa->ifa_dstaddr; len = rt_msg2(RTM_NEWADDR, &info, NULL, w); if (w->w_req && w->w_tmem) { - if (w->w_op == NET_RT_IFLISTL) - error = sysctl_iflist_ifaml(ifa, &info, - w, len); - else - error = sysctl_iflist_ifam(ifa, &info, - w, len); + struct ifa_msghdr *ifam; + + ifam = (struct ifa_msghdr *)w->w_tmem; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_addrs = info.rti_addrs; + if (carp_get_vhid_p != NULL) + ifam->ifam_data.ifi_vhid = + (*carp_get_vhid_p)(ifa); + error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error) goto done; } @@ -1859,17 +1729,16 @@ for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(req->td->td_proc->p_fibnum, i); if (rnh != NULL) { - RADIX_NODE_HEAD_RLOCK(rnh); + RADIX_NODE_HEAD_LOCK(rnh); error = rnh->rnh_walktree(rnh, sysctl_dumpentry, &w); - RADIX_NODE_HEAD_RUNLOCK(rnh); + RADIX_NODE_HEAD_UNLOCK(rnh); } else if (af != 0) error = EAFNOSUPPORT; } break; case NET_RT_IFLIST: - case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); break; @@ -1882,7 +1751,7 @@ return (error); } -SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); +static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); /* * Definitions of protocols supported in the ROUTE domain. diff -r 1a8929bdc357 sys/netinet/if_ether.c --- a/sys/netinet/if_ether.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet/if_ether.c Wed Feb 06 10:50:21 2013 +0800 @@ -139,8 +139,6 @@ }; #ifdef AF_INET -void arp_ifscrub(struct ifnet *ifp, uint32_t addr); - /* * called by in_ifscrub to remove entry from the table when * the interface goes away @@ -516,7 +514,7 @@ int op, flags; int req_len; int bridged = 0, is_bridge = 0; - int carp_match = 0; + int carped; struct sockaddr_in sin; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; @@ -561,24 +559,14 @@ * For a bridge, we want to check the address irrespective * of the receive interface. 
(This will change slightly * when we have clusters of interfaces). - * If the interface does not match, but the recieving interface - * is part of carp, we call carp_iamatch to see if this is a - * request for the virtual host ip. - * XXX: This is really ugly! */ IN_IFADDR_RLOCK(); LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && - itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { - ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); - goto match; - } - if (ifp->if_carp != NULL && - (*carp_iamatch_p)(ifp, ia, &isaddr, &enaddr) && - itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { - carp_match = 1; + itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && + (ia->ia_ifa.ifa_carp == NULL || + (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); goto match; @@ -643,6 +631,7 @@ match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); + carped = (ia->ia_ifa.ifa_carp != NULL); myaddr = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) @@ -659,9 +648,9 @@ * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. 
*/ - if (!bridged && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { - log(LOG_ERR, - "arp: %*D is using my IP address %s on %s!\n", + if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr && + myaddr.s_addr != 0) { + log(LOG_ERR, "arp: %*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); itaddr = myaddr; @@ -682,7 +671,7 @@ IF_AFDATA_UNLOCK(ifp); if (la != NULL) { /* the following is not an error when doing bridging */ - if (!bridged && la->lle_tbl->llt_ifp != ifp && !carp_match) { + if (!bridged && la->lle_tbl->llt_ifp != ifp) { if (log_arp_wrong_iface) log(LOG_WARNING, "arp: %s is on %s " "but got reply from %*D on %s\n", @@ -879,6 +868,9 @@ { struct llentry *lle; + if (ifa->ifa_carp != NULL) + return; + if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) { arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp)); diff -r 1a8929bdc357 sys/netinet/if_ether.h --- a/sys/netinet/if_ether.h Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet/if_ether.h Wed Feb 06 10:50:21 2013 +0800 @@ -117,6 +117,7 @@ struct llentry **lle); void arp_ifinit(struct ifnet *, struct ifaddr *); void arp_ifinit2(struct ifnet *, struct ifaddr *, u_char *); +void arp_ifscrub(struct ifnet *, uint32_t); #include typedef void (*llevent_arp_update_fn)(void *, struct llentry *); diff -r 1a8929bdc357 sys/netinet/in.c --- a/sys/netinet/in.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet/in.c Wed Feb 06 10:50:21 2013 +0800 @@ -31,7 +31,7 @@ */ #include -__FBSDID("$FreeBSD: release/9.1.0/sys/netinet/in.c 233200 2012-03-19 20:49:16Z jhb $"); +__FBSDID("$FreeBSD: head/sys/netinet/in.c 228571 2011-12-16 12:16:56Z glebius $"); #include "opt_mpath.h" @@ -56,10 +56,12 @@ #include #include +#include #include #include #include #include +#include #include #include #include @@ -69,17 +71,15 @@ static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); -static int 
in_addprefix(struct in_ifaddr *, int); -static int in_scrubprefix(struct in_ifaddr *, u_int); static void in_socktrim(struct sockaddr_in *); -static int in_ifinit(struct ifnet *, - struct in_ifaddr *, struct sockaddr_in *, int); +static int in_ifinit(struct ifnet *, struct in_ifaddr *, + struct sockaddr_in *, int, int, int); static void in_purgemaddrs(struct ifnet *); -static VNET_DEFINE(int, sameprefixcarponly); -#define V_sameprefixcarponly VNET(sameprefixcarponly) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW, - &VNET_NAME(sameprefixcarponly), 0, +static VNET_DEFINE(int, nosameprefix); +#define V_nosameprefix VNET(nosameprefix) +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_RW, + &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); @@ -234,16 +234,43 @@ * in_lifaddr_ioctl() and ifp->if_ioctl(). */ switch (cmd) { - case SIOCAIFADDR: - case SIOCDIFADDR: case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: + case SIOCDIFADDR: + break; + case SIOCAIFADDR: + /* + * ifra_addr must be present and be of INET family. + * ifra_broadaddr and ifra_mask are optional. + */ + if (ifra->ifra_addr.sin_len != sizeof(struct sockaddr_in) || + ifra->ifra_addr.sin_family != AF_INET) + return (EINVAL); + if (ifra->ifra_broadaddr.sin_len != 0 && + (ifra->ifra_broadaddr.sin_len != + sizeof(struct sockaddr_in) || + ifra->ifra_broadaddr.sin_family != AF_INET)) + return (EINVAL); +#if 0 + /* + * ifconfig(8) historically doesn't set af_family for mask + * for unknown reason. 
+ */ + if (ifra->ifra_mask.sin_len != 0 && + (ifra->ifra_mask.sin_len != sizeof(struct sockaddr_in) || + ifra->ifra_mask.sin_family != AF_INET)) + return (EINVAL); +#endif + break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: + if (ifr->ifr_addr.sa_family != AF_INET || + ifr->ifr_addr.sa_len != sizeof(struct sockaddr_in)) + return (EINVAL); break; case SIOCALIFADDR: @@ -490,7 +517,7 @@ case SIOCSIFADDR: error = in_ifinit(ifp, ia, - (struct sockaddr_in *) &ifr->ifr_addr, 1); + (struct sockaddr_in *) &ifr->ifr_addr, 1, 0, 0); if (error != 0 && iaIsNew) break; if (error == 0) { @@ -506,7 +533,8 @@ goto out; case SIOCSIFNETMASK: - ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr; + ia->ia_sockmask.sin_addr = ((struct sockaddr_in *) + &ifr->ifr_addr)->sin_addr; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); goto out; @@ -514,14 +542,9 @@ maskIsNew = 0; hostIsNew = 1; error = 0; - if (ia->ia_addr.sin_family == AF_INET) { - if (ifra->ifra_addr.sin_len == 0) { - ifra->ifra_addr = ia->ia_addr; - hostIsNew = 0; - } else if (ifra->ifra_addr.sin_addr.s_addr == - ia->ia_addr.sin_addr.s_addr) - hostIsNew = 0; - } + if (ifra->ifra_addr.sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) + hostIsNew = 0; if (ifra->ifra_mask.sin_len) { /* * QL: XXX @@ -545,14 +568,14 @@ ia->ia_dstaddr = ifra->ifra_dstaddr; maskIsNew = 1; /* We lie; but the effect's the same */ } - if (ifra->ifra_addr.sin_family == AF_INET && - (hostIsNew || maskIsNew)) - error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); + if (hostIsNew || maskIsNew) + error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0, + maskIsNew, ifra->ifra_vhid); if (error != 0 && iaIsNew) break; if ((ifp->if_flags & IFF_BROADCAST) && - (ifra->ifra_broadaddr.sin_family == AF_INET)) + ifra->ifra_broadaddr.sin_len) ia->ia_broadaddr = ifra->ifra_broadaddr; if (error == 0) { ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); @@ -586,6 +609,9 @@ panic("in_control: unsupported ioctl"); } + if 
(ia->ia_ifa.ifa_carp) + (*carp_detach_p)(&ia->ia_ifa); + IF_ADDR_WLOCK(ifp); /* Re-check that ia is still part of the list. */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { @@ -608,31 +634,26 @@ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); - if (ia->ia_addr.sin_family == AF_INET) { - struct in_ifaddr *if_ia; - LIST_REMOVE(ia, ia_hash); - IN_IFADDR_WUNLOCK(); - /* - * If this is the last IPv4 address configured on this - * interface, leave the all-hosts group. - * No state-change report need be transmitted. - */ - if_ia = NULL; - IFP_TO_IA(ifp, if_ia); - if (if_ia == NULL) { - ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); - IN_MULTI_LOCK(); - if (ii->ii_allhosts) { - (void)in_leavegroup_locked(ii->ii_allhosts, - NULL); - ii->ii_allhosts = NULL; - } - IN_MULTI_UNLOCK(); - } else - ifa_free(&if_ia->ia_ifa); + LIST_REMOVE(ia, ia_hash); + IN_IFADDR_WUNLOCK(); + /* + * If this is the last IPv4 address configured on this + * interface, leave the all-hosts group. + * No state-change report need be transmitted. + */ + IFP_TO_IA(ifp, iap); + if (iap == NULL) { + ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); + IN_MULTI_LOCK(); + if (ii->ii_allhosts) { + (void)in_leavegroup_locked(ii->ii_allhosts, NULL); + ii->ii_allhosts = NULL; + } + IN_MULTI_UNLOCK(); } else - IN_IFADDR_WUNLOCK(); + ifa_free(&iap->ia_ifa); + ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ out: if (ia != NULL) @@ -704,7 +725,7 @@ if (iflr->flags & IFLR_PREFIX) return (EINVAL); - /* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR). */ + /* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR_IN6). 
*/ bzero(&ifra, sizeof(ifra)); bcopy(iflr->iflr_name, ifra.ifra_name, sizeof(ifra.ifra_name)); @@ -755,7 +776,7 @@ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET) + if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (match.s_addr == 0) break; @@ -791,7 +812,7 @@ } else { struct in_aliasreq ifra; - /* fill in_aliasreq and do ioctl(SIOCDIFADDR) */ + /* fill in_aliasreq and do ioctl(SIOCDIFADDR_IN6) */ bzero(&ifra, sizeof(ifra)); bcopy(iflr->iflr_name, ifra.ifra_name, sizeof(ifra.ifra_name)); @@ -804,7 +825,6 @@ } bcopy(&ia->ia_sockmask, &ifra.ifra_dstaddr, ia->ia_sockmask.sin_len); - ifa_free(ifa); return (in_control(so, SIOCDIFADDR, (caddr_t)&ifra, ifp, td)); @@ -831,60 +851,46 @@ */ static int in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, - int scrub) + int scrub, int masksupplied, int vhid) { register u_long i = ntohl(sin->sin_addr.s_addr); - struct sockaddr_in oldaddr; - int s = splimp(), flags = RTF_UP, error = 0; + int flags = RTF_UP, error = 0; - oldaddr = ia->ia_addr; - if (oldaddr.sin_family == AF_INET) + if (scrub) + in_scrubprefix(ia, LLE_STATIC); + + IN_IFADDR_WLOCK(); + if (ia->ia_addr.sin_family == AF_INET) LIST_REMOVE(ia, ia_hash); ia->ia_addr = *sin; - if (ia->ia_addr.sin_family == AF_INET) { - IN_IFADDR_WLOCK(); - LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), - ia, ia_hash); - IN_IFADDR_WUNLOCK(); + LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), + ia, ia_hash); + IN_IFADDR_WUNLOCK(); + + if (vhid > 0) { + if (carp_attach_p != NULL) + error = (*carp_attach_p)(&ia->ia_ifa, vhid); + else + error = EPROTONOSUPPORT; } + if (error) + return (error); + /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. 
*/ - if (ifp->if_ioctl != NULL) { - error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); - if (error) { - splx(s); + if (ifp->if_ioctl != NULL && + (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia)) != 0) /* LIST_REMOVE(ia, ia_hash) is done in in_control */ - ia->ia_addr = oldaddr; - IN_IFADDR_WLOCK(); - if (ia->ia_addr.sin_family == AF_INET) - LIST_INSERT_HEAD(INADDR_HASH( - ia->ia_addr.sin_addr.s_addr), ia, ia_hash); - else - /* - * If oldaddr family is not AF_INET (e.g. - * interface has been just created) in_control - * does not call LIST_REMOVE, and we end up - * with bogus ia entries in hash - */ - LIST_REMOVE(ia, ia_hash); - IN_IFADDR_WUNLOCK(); return (error); - } - } - splx(s); - if (scrub) { - ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; - in_ifscrub(ifp, ia, LLE_STATIC); - ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; - } + /* * Be compatible with network classes, if netmask isn't supplied, * guess it based on classes. */ - if (ia->ia_subnetmask == 0) { + if (!masksupplied) { if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) @@ -896,11 +902,6 @@ ia->ia_subnet = i & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); /* - * XXX: carp(4) does not have interface route - */ - if (ifp->if_type == IFT_CARP) - return (0); - /* * Add route for the network. 
*/ ia->ia_ifa.ifa_metric = ifp->if_metric; @@ -918,27 +919,25 @@ return (0); flags |= RTF_HOST; } - if ((error = in_addprefix(ia, flags)) != 0) + if (!vhid && (error = in_addprefix(ia, flags)) != 0) return (error); if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY) return (0); - if (ifp->if_flags & IFF_POINTOPOINT) { - if (ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) + if (ifp->if_flags & IFF_POINTOPOINT && + ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) return (0); - } - /* * add a loopback route to self */ - if (V_useloopback && !(ifp->if_flags & IFF_LOOPBACK)) { + if (V_useloopback && !vhid && !(ifp->if_flags & IFF_LOOPBACK)) { struct route ia_ro; bzero(&ia_ro, sizeof(ia_ro)); *((struct sockaddr_in *)(&ia_ro.ro_dst)) = ia->ia_addr; - rtalloc_ign_fib(&ia_ro, 0, RT_DEFAULT_FIB); + rtalloc_ign_fib(&ia_ro, 0, 0); if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) && (ia_ro.ro_rt->rt_ifp == V_loif)) { RT_LOCK(ia_ro.ro_rt); @@ -1005,7 +1004,7 @@ /* * Check if we have a route for the given prefix already or add one accordingly. */ -static int +int in_addprefix(struct in_ifaddr *target, int flags) { struct in_ifaddr *ia; @@ -1051,9 +1050,7 @@ } else break; #endif - if (V_sameprefixcarponly && - target->ia_ifp->if_type != IFT_CARP && - ia->ia_ifp->if_type != IFT_CARP) { + if (V_nosameprefix) { IN_IFADDR_RUNLOCK(); return (EEXIST); } else { @@ -1074,18 +1071,16 @@ return (error); } -extern void arp_ifscrub(struct ifnet *ifp, uint32_t addr); - /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. 
*/ -static int +int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct in_ifaddr *ia; - struct in_addr prefix, mask, p; + struct in_addr prefix, mask, p, m; int error = 0; struct sockaddr_in prefix0, mask0; @@ -1131,9 +1126,10 @@ arp_ifscrub(target->ia_ifp, IA_SIN(target)->sin_addr.s_addr); } - if (rtinitflags(target)) + if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; - else { + mask.s_addr = 0; + } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; @@ -1146,28 +1142,30 @@ IN_IFADDR_RLOCK(); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - if (rtinitflags(ia)) + if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; - else { + + if (prefix.s_addr != p.s_addr) + continue; + } else { p = ia->ia_addr.sin_addr; - p.s_addr &= ia->ia_sockmask.sin_addr.s_addr; + m = ia->ia_sockmask.sin_addr; + p.s_addr &= m.s_addr; + + if (prefix.s_addr != p.s_addr || + mask.s_addr != m.s_addr) + continue; } - if ((prefix.s_addr != p.s_addr) || - !(ia->ia_ifp->if_flags & IFF_UP)) + if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. - * - * XXX: a special case for carp(4) interface - this should - * be more generally specified as an interface that - * doesn't support such action. 
*/ - if ((ia->ia_flags & IFA_ROUTE) == 0 - && (ia->ia_ifp->if_type != IFT_CARP)) { + if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, @@ -1315,9 +1313,6 @@ IN_MULTI_UNLOCK(); } -#include -#include - struct in_llentry { struct llentry base; struct sockaddr_in l3_addr4; diff -r 1a8929bdc357 sys/netinet/in_var.h --- a/sys/netinet/in_var.h Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet/in_var.h Wed Feb 06 10:50:21 2013 +0800 @@ -77,6 +77,7 @@ struct sockaddr_in ifra_broadaddr; #define ifra_dstaddr ifra_broadaddr struct sockaddr_in ifra_mask; + int ifra_vhid; }; /* * Given a pointer to an in_ifaddr (ifaddr), @@ -444,6 +445,8 @@ int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); void in_rtqdrain(void); +int in_addprefix(struct in_ifaddr *, int); +int in_scrubprefix(struct in_ifaddr *, u_int); void ip_input(struct mbuf *); int in_ifadown(struct ifaddr *ifa, int); void in_ifscrub(struct ifnet *, struct in_ifaddr *, u_int); diff -r 1a8929bdc357 sys/netinet/ip_carp.c --- a/sys/netinet/ip_carp.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet/ip_carp.c Wed Feb 06 10:50:21 2013 +0800 @@ -1,6 +1,8 @@ -/* - * Copyright (c) 2002 Michael Shalayeff. All rights reserved. - * Copyright (c) 2003 Ryan McBride. All rights reserved. +/*- + * Copyright (c) 2002 Michael Shalayeff. + * Copyright (c) 2003 Ryan McBride. + * Copyright (c) 2011 Gleb Smirnoff + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -25,44 +27,36 @@ */ #include -__FBSDID("$FreeBSD: release/9.1.0/sys/netinet/ip_carp.c 233200 2012-03-19 20:49:16Z jhb $"); +__FBSDID("$FreeBSD: head/sys/netinet/ip_carp.c 228571 2011-12-16 12:16:56Z glebius $"); #include "opt_bpf.h" #include "opt_inet.h" #include "opt_inet6.h" -#include #include #include -#include +#include +#include #include #include #include #include #include -#include #include #include #include +#include +#include #include #include -#include -#include -#include -#include -#include - -#include - -#include #include #include +#include +#include +#include +#include #include -#include -#include -#include -#include #include #include @@ -71,12 +65,9 @@ #include #include #include - #include #endif - #ifdef INET -#include #include #include #endif @@ -85,65 +76,117 @@ #include #include #include +#include #include #include -#include #include #endif #include -#define CARP_IFNAME "carp" -static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces"); -SYSCTL_DECL(_net_inet_carp); +static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); struct carp_softc { - struct ifnet *sc_ifp; /* Interface clue */ - struct ifnet *sc_carpdev; /* Pointer to parent interface */ - struct in_ifaddr *sc_ia; /* primary iface address */ + struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ + struct ifaddr **sc_ifas; /* Our ifaddrs. */ + struct sockaddr_dl sc_addr; /* Our link level address. */ + struct callout sc_ad_tmo; /* Advertising timeout. */ #ifdef INET - struct ip_moptions sc_imo; + struct callout sc_md_tmo; /* Master down timeout. */ #endif #ifdef INET6 - struct in6_ifaddr *sc_ia6; /* primary iface address v6 */ - struct ip6_moptions sc_im6o; -#endif /* INET6 */ - TAILQ_ENTRY(carp_softc) sc_list; + struct callout sc_md6_tmo; /* XXX: Master down timeout. 
*/ +#endif + struct mtx sc_mtx; + int sc_vhid; + int sc_advskew; + int sc_advbase; + + int sc_naddrs; + int sc_naddrs6; + int sc_ifasiz; enum { INIT = 0, BACKUP, MASTER } sc_state; - - int sc_flags_backup; - int sc_suppress; - - int sc_sendad_errors; + int sc_suppress; + int sc_sendad_errors; #define CARP_SENDAD_MAX_ERRORS 3 - int sc_sendad_success; + int sc_sendad_success; #define CARP_SENDAD_MIN_SUCCESS 3 - int sc_vhid; - int sc_advskew; - int sc_naddrs; - int sc_naddrs6; - int sc_advbase; /* seconds */ - int sc_init_counter; - u_int64_t sc_counter; + int sc_init_counter; + uint64_t sc_counter; /* authentication */ -#define CARP_HMAC_PAD 64 +#define CARP_HMAC_PAD 64 unsigned char sc_key[CARP_KEY_LEN]; unsigned char sc_pad[CARP_HMAC_PAD]; SHA1_CTX sc_sha1; - struct callout sc_ad_tmo; /* advertisement timeout */ - struct callout sc_md_tmo; /* master down timeout */ - struct callout sc_md6_tmo; /* master down timeout */ - - LIST_ENTRY(carp_softc) sc_next; /* Interface clue */ + TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ + LIST_ENTRY(carp_softc) sc_next; /* On the global list. */ }; -#define SC2IFP(sc) ((sc)->sc_ifp) + +struct carp_if { +#ifdef INET + int cif_naddrs; +#endif +#ifdef INET6 + int cif_naddrs6; +#endif + TAILQ_HEAD(, carp_softc) cif_vrs; +#ifdef INET + struct ip_moptions cif_imo; +#endif +#ifdef INET6 + struct ip6_moptions cif_im6o; +#endif + struct ifnet *cif_ifp; + struct mtx cif_mtx; +}; + +#define CARP_INET 0 +#define CARP_INET6 1 +static int proto_reg[] = {-1, -1}; + +/* + * Brief design of carp(4). + * + * Any carp-capable ifnet may have a list of carp softcs hanging off + * its ifp->if_carp pointer. Each softc represents one unique virtual + * host id, or vhid. The softc has a back pointer to the ifnet. All + * softcs are joined in a global list, which has quite limited use. + * + * Any interface address that takes part in CARP negotiation has a + * pointer to the softc of its vhid, ifa->ifa_carp. 
That could be either + * AF_INET or AF_INET6 address. + * + * Although, one can get the softc's backpointer to ifnet and traverse + * through its ifp->if_addrhead queue to find all interface addresses + * involved in CARP, we keep a growable array of ifaddr pointers. This + * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that + * do calls into the network stack, thus avoiding LORs. + * + * Locking: + * + * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(), + * callout-driven events and ioctl()s. + * + * To traverse the list of softcs on an ifnet we use CIF_LOCK(), to + * traverse the global list we use the mutex carp_mtx. + * + * Known issues with locking: + * + * - There is no protection for races between two ioctl() requests, + * neither SIOCSVH, nor SIOCAIFADDR & SIOCAIFADDR_IN6. I think that all + * interface ioctl()s should be serialized right in net/if.c. + * - Sending ad, we put the pointer to the softc in an mtag, and no reference + * counting is done on the softc. + * - On module unload we may race (?) with packet processing thread + * dereferencing our function pointers. 
+ */ int carp_suppress_preempt = 0; -int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 }; /* XXX for now */ +int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, }; SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW, &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets"); @@ -151,8 +194,6 @@ &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode"); SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW, &carp_opts[CARPCTL_LOG], 0, "log bad carp packets"); -SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW, - &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses"); SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD, &carp_suppress_preempt, 0, "Preemption is suppressed"); @@ -161,36 +202,22 @@ &carpstats, carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)"); -struct carp_if { - TAILQ_HEAD(, carp_softc) vhif_vrs; - int vhif_nvrs; - - struct ifnet *vhif_ifp; - struct mtx vhif_mtx; -}; - -#define CARP_INET 0 -#define CARP_INET6 1 -static int proto_reg[] = {-1, -1}; - -/* Get carp_if from softc. Valid after carp_set_addr{,6}. 
*/ -#define SC2CIF(sc) ((struct carp_if *)(sc)->sc_carpdev->if_carp) - -/* lock per carp_if queue */ -#define CARP_LOCK_INIT(cif) mtx_init(&(cif)->vhif_mtx, "carp_if", \ +#define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \ NULL, MTX_DEF) -#define CARP_LOCK_DESTROY(cif) mtx_destroy(&(cif)->vhif_mtx) -#define CARP_LOCK_ASSERT(cif) mtx_assert(&(cif)->vhif_mtx, MA_OWNED) -#define CARP_LOCK(cif) mtx_lock(&(cif)->vhif_mtx) -#define CARP_UNLOCK(cif) mtx_unlock(&(cif)->vhif_mtx) - -#define CARP_SCLOCK(sc) mtx_lock(&SC2CIF(sc)->vhif_mtx) -#define CARP_SCUNLOCK(sc) mtx_unlock(&SC2CIF(sc)->vhif_mtx) -#define CARP_SCLOCK_ASSERT(sc) mtx_assert(&SC2CIF(sc)->vhif_mtx, MA_OWNED) +#define CARP_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx) +#define CARP_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) +#define CARP_LOCK(sc) mtx_lock(&(sc)->sc_mtx) +#define CARP_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) +#define CIF_LOCK_INIT(cif) mtx_init(&(cif)->cif_mtx, "carp_if", \ + NULL, MTX_DEF) +#define CIF_LOCK_DESTROY(cif) mtx_destroy(&(cif)->cif_mtx) +#define CIF_LOCK_ASSERT(cif) mtx_assert(&(cif)->cif_mtx, MA_OWNED) +#define CIF_LOCK(cif) mtx_lock(&(cif)->cif_mtx) +#define CIF_UNLOCK(cif) mtx_unlock(&(cif)->cif_mtx) #define CARP_LOG(...) do { \ if (carp_opts[CARPCTL_LOG] > 0) \ - log(LOG_INFO, __VA_ARGS__); \ + log(LOG_INFO, "carp: " __VA_ARGS__); \ } while (0) #define CARP_DEBUG(...) 
do { \ @@ -198,58 +225,43 @@ log(LOG_DEBUG, __VA_ARGS__); \ } while (0) -static void carp_hmac_prepare(struct carp_softc *); -static void carp_hmac_generate(struct carp_softc *, u_int32_t *, - unsigned char *); -static int carp_hmac_verify(struct carp_softc *, u_int32_t *, - unsigned char *); -static void carp_setroute(struct carp_softc *, int); +#define IFNET_FOREACH_IFA(ifp, ifa) \ + IF_ADDR_LOCK_ASSERT(ifp); \ + TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ + if ((ifa)->ifa_carp != NULL) + +#define CARP_FOREACH_IFA(sc, ifa) \ + CARP_LOCK_ASSERT(sc); \ + for (int _i = 0; \ + _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 && \ + ((ifa) = sc->sc_ifas[_i]) != NULL; \ + ++_i) + +#define IFNET_FOREACH_CARP(ifp, sc) \ + CIF_LOCK_ASSERT(ifp->if_carp); \ + TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list) + static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t); -static int carp_clone_create(struct if_clone *, int, caddr_t); -static void carp_clone_destroy(struct ifnet *); -static void carpdetach(struct carp_softc *, int); -static int carp_prepare_ad(struct mbuf *, struct carp_softc *, - struct carp_header *); -static void carp_send_ad_all(void); +static struct carp_softc + *carp_alloc(struct ifnet *); +static void carp_destroy(struct carp_softc *); +static struct carp_if + *carp_alloc_if(struct ifnet *); +static void carp_free_if(struct carp_if *); +static void carp_set_state(struct carp_softc *, int); +static void carp_sc_state(struct carp_softc *); +static void carp_setrun(struct carp_softc *, sa_family_t); +static void carp_master_down(void *); +static void carp_master_down_locked(struct carp_softc *); static void carp_send_ad(void *); static void carp_send_ad_locked(struct carp_softc *); -#ifdef INET -static void carp_send_arp(struct carp_softc *); -#endif -static void carp_master_down(void *); -static void carp_master_down_locked(struct carp_softc *); -static int carp_ioctl(struct ifnet *, u_long, caddr_t); -static int carp_looutput(struct 
ifnet *, struct mbuf *, struct sockaddr *, - struct route *); -static void carp_start(struct ifnet *); -static void carp_setrun(struct carp_softc *, sa_family_t); -static void carp_set_state(struct carp_softc *, int); -#ifdef INET -static int carp_addrcount(struct carp_if *, struct in_ifaddr *, int); -#endif -enum { CARP_COUNT_MASTER, CARP_COUNT_RUNNING }; +static void carp_addroute(struct carp_softc *); +static void carp_delroute(struct carp_softc *); -#ifdef INET -static void carp_multicast_cleanup(struct carp_softc *, int dofree); -static int carp_set_addr(struct carp_softc *, struct sockaddr_in *); -static int carp_del_addr(struct carp_softc *, struct sockaddr_in *); -#endif -static void carp_carpdev_state_locked(struct carp_if *); -static void carp_sc_state_locked(struct carp_softc *); -#ifdef INET6 -static void carp_send_na(struct carp_softc *); -static int carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *); -static int carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *); -static void carp_multicast6_cleanup(struct carp_softc *, int dofree); -#endif +static LIST_HEAD(, carp_softc) carp_list; +static struct mtx carp_mtx; -static LIST_HEAD(, carp_softc) carpif_list; -static struct mtx carp_mtx; -IFC_SIMPLE_DECLARE(carp, 0); - -static eventhandler_tag if_detach_event_tag; - -static __inline u_int16_t +static __inline uint16_t carp_cksum(struct mbuf *m, int len) { return (in_cksum(m, len)); @@ -258,8 +270,8 @@ static void carp_hmac_prepare(struct carp_softc *sc) { - u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; - u_int8_t vhid = sc->sc_vhid & 0xff; + uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; + uint8_t vhid = sc->sc_vhid & 0xff; struct ifaddr *ifa; int i, found; #ifdef INET @@ -269,18 +281,15 @@ struct in6_addr last6, cur6, in6; #endif - if (sc->sc_carpdev) - CARP_SCLOCK(sc); + CARP_LOCK_ASSERT(sc); - /* XXX: possible race here */ - - /* compute ipad from key */ + /* Compute ipad from key. 
*/ bzero(sc->sc_pad, sizeof(sc->sc_pad)); bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36; - /* precompute first part of inner hash */ + /* Precompute first part of inner hash. */ SHA1Init(&sc->sc_sha1); SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); @@ -292,8 +301,7 @@ found = 0; last = cur; cur.s_addr = 0xffffffff; - IF_ADDR_RLOCK(SC2IFP(sc)); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + CARP_FOREACH_IFA(sc, ifa) { in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; if (ifa->ifa_addr->sa_family == AF_INET && ntohl(in.s_addr) > ntohl(last.s_addr) && @@ -302,7 +310,6 @@ found++; } } - IF_ADDR_RUNLOCK(SC2IFP(sc)); if (found) SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur)); } while (found); @@ -313,8 +320,7 @@ found = 0; last6 = cur6; memset(&cur6, 0xff, sizeof(cur6)); - IF_ADDR_RLOCK(SC2IFP(sc)); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + CARP_FOREACH_IFA(sc, ifa) { in6 = ifatoia6(ifa)->ia_addr.sin6_addr; if (IN6_IS_SCOPE_EMBED(&in6)) in6.s6_addr16[1] = 0; @@ -325,7 +331,6 @@ found++; } } - IF_ADDR_RUNLOCK(SC2IFP(sc)); if (found) SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6)); } while (found); @@ -334,17 +339,16 @@ /* convert ipad to opad */ for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36 ^ 0x5c; - - if (sc->sc_carpdev) - CARP_SCUNLOCK(sc); } static void -carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2], +carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { SHA1_CTX sha1ctx; + CARP_LOCK_ASSERT(sc); + /* fetch first half of inner hash */ bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); @@ -359,203 +363,18 @@ } static int -carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2], +carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { unsigned char md2[20]; - CARP_SCLOCK_ASSERT(sc); + 
CARP_LOCK_ASSERT(sc); carp_hmac_generate(sc, counter, md2); return (bcmp(md, md2, sizeof(md2))); } -static void -carp_setroute(struct carp_softc *sc, int cmd) -{ - struct ifaddr *ifa; - int s; - - if (sc->sc_carpdev) - CARP_SCLOCK_ASSERT(sc); - - s = splnet(); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { -#ifdef INET - if (ifa->ifa_addr->sa_family == AF_INET && - sc->sc_carpdev != NULL) { - int count = carp_addrcount( - (struct carp_if *)sc->sc_carpdev->if_carp, - ifatoia(ifa), CARP_COUNT_MASTER); - - if ((cmd == RTM_ADD && count == 1) || - (cmd == RTM_DELETE && count == 0)) - rtinit(ifa, cmd, RTF_UP | RTF_HOST); - } -#endif - } - splx(s); -} - -static int -carp_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - - struct carp_softc *sc; - struct ifnet *ifp; - - sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); - ifp = SC2IFP(sc) = if_alloc(IFT_ETHER); - if (ifp == NULL) { - free(sc, M_CARP); - return (ENOSPC); - } - - sc->sc_flags_backup = 0; - sc->sc_suppress = 0; - sc->sc_advbase = CARP_DFLTINTV; - sc->sc_vhid = -1; /* required setting */ - sc->sc_advskew = 0; - sc->sc_init_counter = 1; - sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? 
*/ -#ifdef INET - sc->sc_imo.imo_membership = (struct in_multi **)malloc( - (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, - M_WAITOK); - sc->sc_imo.imo_mfilters = NULL; - sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; - sc->sc_imo.imo_multicast_vif = -1; -#endif -#ifdef INET6 - sc->sc_im6o.im6o_membership = (struct in6_multi **)malloc( - (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, - M_WAITOK); - sc->sc_im6o.im6o_mfilters = NULL; - sc->sc_im6o.im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; - sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL; -#endif - - callout_init(&sc->sc_ad_tmo, CALLOUT_MPSAFE); - callout_init(&sc->sc_md_tmo, CALLOUT_MPSAFE); - callout_init(&sc->sc_md6_tmo, CALLOUT_MPSAFE); - - ifp->if_softc = sc; - if_initname(ifp, CARP_IFNAME, unit); - ifp->if_mtu = ETHERMTU; - ifp->if_flags = IFF_LOOPBACK; - ifp->if_ioctl = carp_ioctl; - ifp->if_output = carp_looutput; - ifp->if_start = carp_start; - ifp->if_type = IFT_CARP; - ifp->if_snd.ifq_maxlen = ifqmaxlen; - ifp->if_hdrlen = 0; - if_attach(ifp); - bpfattach(SC2IFP(sc), DLT_NULL, sizeof(u_int32_t)); - mtx_lock(&carp_mtx); - LIST_INSERT_HEAD(&carpif_list, sc, sc_next); - mtx_unlock(&carp_mtx); - return (0); -} - -static void -carp_clone_destroy(struct ifnet *ifp) -{ - struct carp_softc *sc = ifp->if_softc; - - if (sc->sc_carpdev) - CARP_SCLOCK(sc); - carpdetach(sc, 1); /* Returns unlocked. */ - - mtx_lock(&carp_mtx); - LIST_REMOVE(sc, sc_next); - mtx_unlock(&carp_mtx); - bpfdetach(ifp); - if_detach(ifp); - if_free_type(ifp, IFT_ETHER); -#ifdef INET - free(sc->sc_imo.imo_membership, M_CARP); -#endif -#ifdef INET6 - free(sc->sc_im6o.im6o_membership, M_CARP); -#endif - free(sc, M_CARP); -} - -/* - * This function can be called on CARP interface destroy path, - * and in case of the removal of the underlying interface as - * well. 
We differentiate these two cases: in case of destruction - * of the underlying interface, we do not cleanup our multicast - * memberships, since they are already freed. But we purge pointers - * to multicast structures, since they are no longer valid, to - * avoid panic in future calls to carpdetach(). Also, we do not - * release the lock on return, because the function will be - * called once more, for another CARP instance on the same - * interface. - */ -static void -carpdetach(struct carp_softc *sc, int unlock) -{ - struct carp_if *cif; - - callout_stop(&sc->sc_ad_tmo); - callout_stop(&sc->sc_md_tmo); - callout_stop(&sc->sc_md6_tmo); - - if (sc->sc_suppress) - carp_suppress_preempt--; - sc->sc_suppress = 0; - - if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) - carp_suppress_preempt--; - sc->sc_sendad_errors = 0; - - carp_set_state(sc, INIT); - SC2IFP(sc)->if_flags &= ~IFF_UP; - carp_setrun(sc, 0); -#ifdef INET - carp_multicast_cleanup(sc, unlock); -#endif -#ifdef INET6 - carp_multicast6_cleanup(sc, unlock); -#endif - - if (sc->sc_carpdev != NULL) { - cif = (struct carp_if *)sc->sc_carpdev->if_carp; - CARP_LOCK_ASSERT(cif); - TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); - if (!--cif->vhif_nvrs) { - ifpromisc(sc->sc_carpdev, 0); - sc->sc_carpdev->if_carp = NULL; - CARP_LOCK_DESTROY(cif); - free(cif, M_CARP); - } else if (unlock) - CARP_UNLOCK(cif); - sc->sc_carpdev = NULL; - } -} - -/* Detach an interface from the carp. */ -static void -carp_ifdetach(void *arg __unused, struct ifnet *ifp) -{ - struct carp_if *cif = (struct carp_if *)ifp->if_carp; - struct carp_softc *sc, *nextsc; - - if (cif == NULL) - return; - - /* - * XXX: At the end of for() cycle the lock will be destroyed. - */ - CARP_LOCK(cif); - for (sc = TAILQ_FIRST(&cif->vhif_vrs); sc; sc = nextsc) { - nextsc = TAILQ_NEXT(sc, sc_list); - carpdetach(sc, 0); - } -} - /* * process input packet. 
* we have rearranged checks order compared to the rfc, @@ -576,20 +395,10 @@ return; } - /* check if received on a valid carp interface */ - if (m->m_pkthdr.rcvif->if_carp == NULL) { - CARPSTATS_INC(carps_badif); - CARP_DEBUG("carp_input: packet received on non-carp " - "interface: %s\n", - m->m_pkthdr.rcvif->if_xname); - m_freem(m); - return; - } - /* verify that the IP TTL is 255. */ if (ip->ip_ttl != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); - CARP_DEBUG("carp_input: received ttl %d != 255 on %s\n", + CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip->ip_ttl, m->m_pkthdr.rcvif->if_xname); m_freem(m); @@ -600,9 +409,8 @@ if (m->m_pkthdr.len < iplen + sizeof(*ch)) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("carp_input: received len %zd < " - "sizeof(struct carp_header) on %s\n", - m->m_len - sizeof(struct ip), + CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) " + "on %s\n", __func__, m->m_len - sizeof(struct ip), m->m_pkthdr.rcvif->if_xname); m_freem(m); return; @@ -611,7 +419,7 @@ if (iplen + sizeof(*ch) < m->m_len) { if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { CARPSTATS_INC(carps_hdrops); - CARP_DEBUG("carp_input: pullup failed\n"); + CARP_DEBUG("%s: pullup failed\n", __func__); return; } ip = mtod(m, struct ip *); @@ -625,7 +433,7 @@ len = iplen + sizeof(*ch); if (len > m->m_pkthdr.len) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("carp_input: packet too short %d on %s\n", + CARP_DEBUG("%s: packet too short %d on %s\n", __func__, m->m_pkthdr.len, m->m_pkthdr.rcvif->if_xname); m_freem(m); @@ -643,7 +451,7 @@ m->m_data += iplen; if (carp_cksum(m, len - iplen)) { CARPSTATS_INC(carps_badsum); - CARP_DEBUG("carp_input: checksum failed on %s\n", + CARP_DEBUG("%s: checksum failed on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return; @@ -673,9 +481,8 @@ /* check if received on a valid carp interface */ if (m->m_pkthdr.rcvif->if_carp == NULL) { CARPSTATS_INC(carps_badif); - CARP_DEBUG("carp6_input: packet received on 
non-carp " - "interface: %s\n", - m->m_pkthdr.rcvif->if_xname); + CARP_DEBUG("%s: packet received on non-carp interface: %s\n", + __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } @@ -683,9 +490,8 @@ /* verify that the IP TTL is 255 */ if (ip6->ip6_hlim != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); - CARP_DEBUG("carp6_input: received ttl %d != 255 on %s\n", - ip6->ip6_hlim, - m->m_pkthdr.rcvif->if_xname); + CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, + ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } @@ -695,7 +501,7 @@ IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch)); if (ch == NULL) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("carp6_input: packet size %u too small\n", len); + CARP_DEBUG("%s: packet size %u too small\n", __func__, len); return (IPPROTO_DONE); } @@ -704,7 +510,7 @@ m->m_data += *offp; if (carp_cksum(m, sizeof(*ch))) { CARPSTATS_INC(carps_badsum); - CARP_DEBUG("carp6_input: checksum failed, on %s\n", + CARP_DEBUG("%s: checksum failed, on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); @@ -720,62 +526,46 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) { struct ifnet *ifp = m->m_pkthdr.rcvif; + struct ifaddr *ifa; struct carp_softc *sc; - u_int64_t tmp_counter; + uint64_t tmp_counter; struct timeval sc_tv, ch_tv; /* verify that the VHID is valid on the receiving interface */ - CARP_LOCK(ifp->if_carp); - TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list) - if (sc->sc_vhid == ch->carp_vhid) + IF_ADDR_LOCK(ifp); + IFNET_FOREACH_IFA(ifp, ifa) + if (ifa->ifa_addr->sa_family == af && + ifa->ifa_carp->sc_vhid == ch->carp_vhid) { + ifa_ref(ifa); break; + } + IF_ADDR_UNLOCK(ifp); - if (!sc || !((SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) { + if (ifa == NULL) { CARPSTATS_INC(carps_badvhid); - CARP_UNLOCK(ifp->if_carp); m_freem(m); return; } - 
getmicrotime(&SC2IFP(sc)->if_lastchange); - SC2IFP(sc)->if_ipackets++; - SC2IFP(sc)->if_ibytes += m->m_pkthdr.len; - - if (bpf_peers_present(SC2IFP(sc)->if_bpf)) { - uint32_t af1 = af; -#ifdef INET - struct ip *ip = mtod(m, struct ip *); - - /* BPF wants net byte order */ - if (af == AF_INET) { - ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2)); - ip->ip_off = htons(ip->ip_off); - } -#endif - bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m); - } - /* verify the CARP version. */ if (ch->carp_version != CARP_VERSION) { CARPSTATS_INC(carps_badver); - SC2IFP(sc)->if_ierrors++; - CARP_UNLOCK(ifp->if_carp); - CARP_DEBUG("%s; invalid version %d\n", - SC2IFP(sc)->if_xname, + CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname, ch->carp_version); + ifa_free(ifa); m_freem(m); return; } - /* verify the hash */ + sc = ifa->ifa_carp; + CARP_LOCK(sc); + ifa_free(ifa); + if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { CARPSTATS_INC(carps_badauth); - SC2IFP(sc)->if_ierrors++; - CARP_UNLOCK(ifp->if_carp); - CARP_DEBUG("%s: incorrect hash\n", SC2IFP(sc)->if_xname); - m_freem(m); - return; + CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__, + sc->sc_vhid, ifp->if_xname); + goto out; } tmp_counter = ntohl(ch->carp_counter[0]); @@ -806,12 +596,13 @@ if (timevalcmp(&sc_tv, &ch_tv, >) || timevalcmp(&sc_tv, &ch_tv, ==)) { callout_stop(&sc->sc_ad_tmo); - CARP_LOG("%s: MASTER -> BACKUP " - "(more frequent advertisement received)\n", - SC2IFP(sc)->if_xname); + CARP_LOG("VHID %u@%s: MASTER -> BACKUP " + "(more frequent advertisement received)\n", + sc->sc_vhid, + sc->sc_carpdev->if_xname); carp_set_state(sc, BACKUP); carp_setrun(sc, 0); - carp_setroute(sc, RTM_DELETE); + carp_delroute(sc); } break; case BACKUP: @@ -821,9 +612,10 @@ */ if (carp_opts[CARPCTL_PREEMPT] && timevalcmp(&sc_tv, &ch_tv, <)) { - CARP_LOG("%s: BACKUP -> MASTER " + CARP_LOG("VHID %u@%s: BACKUP -> MASTER " "(preempting a slower master)\n", - SC2IFP(sc)->if_xname); + sc->sc_vhid, + 
sc->sc_carpdev->if_xname); carp_master_down_locked(sc); break; } @@ -835,9 +627,10 @@ */ sc_tv.tv_sec = sc->sc_advbase * 3; if (timevalcmp(&sc_tv, &ch_tv, <)) { - CARP_LOG("%s: BACKUP -> MASTER " + CARP_LOG("VHID %u@%s: BACKUP -> MASTER " "(master timed out)\n", - SC2IFP(sc)->if_xname); + sc->sc_vhid, + sc->sc_carpdev->if_xname); carp_master_down_locked(sc); break; } @@ -850,17 +643,15 @@ break; } - CARP_UNLOCK(ifp->if_carp); - +out: + CARP_UNLOCK(sc); m_freem(m); - return; } static int carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) { struct m_tag *mtag; - struct ifnet *ifp = SC2IFP(sc); if (sc->sc_init_counter) { /* this could also be seconds since unix epoch */ @@ -876,35 +667,38 @@ carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); /* Tag packet for carp_output */ - mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT); - if (mtag == NULL) { + if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), + M_NOWAIT)) == NULL) { m_freem(m); - SC2IFP(sc)->if_oerrors++; + CARPSTATS_INC(carps_onomem); return (ENOMEM); } - bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *)); + bcopy(&sc, (caddr_t)(mtag + 1), sizeof(struct carp_softc *)); m_tag_prepend(m, mtag); return (0); } static void -carp_send_ad_all(void) +carp_send_ad_all(struct carp_softc *badsc) { struct carp_softc *sc; + /* + * Avoid LOR and recursive call to carp_send_ad_locked(). 
+ */ + CARP_UNLOCK(badsc); + mtx_lock(&carp_mtx); - LIST_FOREACH(sc, &carpif_list, sc_next) { - if (sc->sc_carpdev == NULL) - continue; - CARP_SCLOCK(sc); - if ((SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING) && - sc->sc_state == MASTER) + LIST_FOREACH(sc, &carp_list, sc_next) + if (sc != badsc && sc->sc_state == MASTER) { + CARP_LOCK(sc); carp_send_ad_locked(sc); - CARP_SCUNLOCK(sc); - } + CARP_UNLOCK(sc); + } mtx_unlock(&carp_mtx); + + CARP_LOCK(badsc); } static void @@ -912,9 +706,9 @@ { struct carp_softc *sc = v; - CARP_SCLOCK(sc); + CARP_LOCK_ASSERT(sc); carp_send_ad_locked(sc); - CARP_SCUNLOCK(sc); + CARP_UNLOCK(sc); } static void @@ -922,48 +716,42 @@ { struct carp_header ch; struct timeval tv; + struct sockaddr sa; + struct ifaddr *ifa; struct carp_header *ch_ptr; struct mbuf *m; - int len, advbase, advskew; + int len, advskew; - CARP_SCLOCK_ASSERT(sc); + CARP_LOCK_ASSERT(sc); - /* bow out if we've lost our UPness or RUNNINGuiness */ - if (!((SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) { - advbase = 255; - advskew = 255; - } else { - advbase = sc->sc_advbase; - if (!carp_suppress_preempt || sc->sc_advskew > 240) - advskew = sc->sc_advskew; - else - advskew = 240; - tv.tv_sec = advbase; - tv.tv_usec = advskew * 1000000 / 256; - } + if (!carp_suppress_preempt || sc->sc_advskew > 240) + advskew = sc->sc_advskew; + else + advskew = 240; + tv.tv_sec = sc->sc_advbase; + tv.tv_usec = advskew * 1000000 / 256; ch.carp_version = CARP_VERSION; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; - ch.carp_advbase = advbase; + ch.carp_advbase = sc->sc_advbase; ch.carp_advskew = advskew; ch.carp_authlen = 7; /* XXX DEFINE */ ch.carp_pad1 = 0; /* must be zero */ ch.carp_cksum = 0; + /* XXXGL: OpenBSD picks first ifaddr with needed family. 
*/ + #ifdef INET - if (sc->sc_ia) { + if (sc->sc_naddrs) { struct ip *ip; - MGETHDR(m, M_DONTWAIT, MT_HEADER); + MGETHDR(m, M_NOWAIT, MT_HEADER); if (m == NULL) { - SC2IFP(sc)->if_oerrors++; CARPSTATS_INC(carps_onomem); /* XXX maybe less ? */ - if (advbase != 255 || advskew != 255) - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), - carp_send_ad, sc); + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); return; } len = sizeof(*ip) + sizeof(ch); @@ -982,7 +770,16 @@ ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; - ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr; + + bzero(&sa, sizeof(sa)); + sa.sa_family = AF_INET; + ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev); + if (ifa != NULL) { + ip->ip_src.s_addr = + ifatoia(ifa)->ia_addr.sin_addr.s_addr; + ifa_free(ifa); + } else + ip->ip_src.s_addr = 0; ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); ch_ptr = (struct carp_header *)(&ip[1]); @@ -994,22 +791,16 @@ ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip)); m->m_data -= sizeof(*ip); - getmicrotime(&SC2IFP(sc)->if_lastchange); - SC2IFP(sc)->if_opackets++; - SC2IFP(sc)->if_obytes += len; CARPSTATS_INC(carps_opackets); - if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) { - SC2IFP(sc)->if_oerrors++; + if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, + &sc->sc_carpdev->if_carp->cif_imo, NULL)) { if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { carp_suppress_preempt++; - if (carp_suppress_preempt == 1) { - CARP_SCUNLOCK(sc); - carp_send_ad_all(); - CARP_SCLOCK(sc); - } + if (carp_suppress_preempt == 1) + carp_send_ad_all(sc); } sc->sc_sendad_success = 0; } else { @@ -1025,17 +816,15 @@ } #endif /* INET */ #ifdef INET6 - if (sc->sc_ia6) { + if (sc->sc_naddrs6) { struct ip6_hdr *ip6; - MGETHDR(m, M_DONTWAIT, MT_HEADER); + MGETHDR(m, M_NOWAIT, MT_HEADER); if (m == NULL) { - SC2IFP(sc)->if_oerrors++; CARPSTATS_INC(carps_onomem); /* XXX maybe less ? 
*/ - if (advbase != 255 || advskew != 255) - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), - carp_send_ad, sc); + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); return; } len = sizeof(*ip6) + sizeof(ch); @@ -1049,14 +838,23 @@ ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; - bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src, - sizeof(struct in6_addr)); - /* set the multicast destination */ + bzero(&sa, sizeof(sa)); + /* set the source address */ + sa.sa_family = AF_INET6; + ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev); + if (ifa != NULL) { + bcopy(IFA_IN6(ifa), &ip6->ip6_src, + sizeof(struct in6_addr)); + ifa_free(ifa); + } else + /* This should never happen with IPv6. */ + bzero(&ip6->ip6_src, sizeof(struct in6_addr)); + + /* Set the multicast destination. */ ip6->ip6_dst.s6_addr16[0] = htons(0xff02); ip6->ip6_dst.s6_addr8[15] = 0x12; if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { - SC2IFP(sc)->if_oerrors++; m_freem(m); CARP_DEBUG("%s: in6_setscope failed\n", __func__); return; @@ -1071,22 +869,16 @@ ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6)); m->m_data -= sizeof(*ip6); - getmicrotime(&SC2IFP(sc)->if_lastchange); - SC2IFP(sc)->if_opackets++; - SC2IFP(sc)->if_obytes += len; CARPSTATS_INC(carps_opackets6); - if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) { - SC2IFP(sc)->if_oerrors++; + if (ip6_output(m, NULL, NULL, 0, + &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)) { if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { carp_suppress_preempt++; - if (carp_suppress_preempt == 1) { - CARP_SCUNLOCK(sc); - carp_send_ad_all(); - CARP_SCLOCK(sc); - } + if (carp_suppress_preempt == 1) + carp_send_ad_all(sc); } sc->sc_sendad_success = 0; } else { @@ -1102,10 +894,55 @@ } #endif /* INET6 */ - if (advbase != 255 || advskew != 255) - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), - carp_send_ad, sc); + 
callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); +} +static void +carp_addroute(struct carp_softc *sc) +{ + struct ifaddr *ifa; + + CARP_FOREACH_IFA(sc, ifa) + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + in_addprefix(ifatoia(ifa), RTF_UP); + ifa_add_loopback_route(ifa, + (struct sockaddr *)&ifatoia(ifa)->ia_addr); + break; +#endif +#ifdef INET6 + case AF_INET6: + ifa_add_loopback_route(ifa, + (struct sockaddr *)&ifatoia6(ifa)->ia_addr); + in6_ifaddloop(ifa); + break; +#endif + } +} + +static void +carp_delroute(struct carp_softc *sc) +{ + struct ifaddr *ifa; + + CARP_FOREACH_IFA(sc, ifa) + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + ifa_del_loopback_route(ifa, + (struct sockaddr *)&ifatoia(ifa)->ia_addr); + in_scrubprefix(ifatoia(ifa), LLE_STATIC); + break; +#endif +#ifdef INET6 + case AF_INET6: + ifa_del_loopback_route(ifa, + (struct sockaddr *)&ifatoia6(ifa)->ia_addr); + in6_ifremloop(ifa); + break; +#endif + } } #ifdef INET @@ -1119,16 +956,22 @@ { struct ifaddr *ifa; - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + CARP_FOREACH_IFA(sc, ifa) + if (ifa->ifa_addr->sa_family == AF_INET) + arp_ifinit2(sc->sc_carpdev, ifa, LLADDR(&sc->sc_addr)); +} - if (ifa->ifa_addr->sa_family != AF_INET) - continue; +int +carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) +{ + struct carp_softc *sc = ifa->ifa_carp; -/* arprequest(sc->sc_carpdev, &in, &in, IF_LLADDR(sc->sc_ifp)); */ - arp_ifinit2(sc->sc_carpdev, ifa, IF_LLADDR(sc->sc_ifp)); + if (sc->sc_state == MASTER) { + *enaddr = LLADDR(&sc->sc_addr); + return (1); + } - DELAY(1000); /* XXX */ - } + return (0); } #endif @@ -1136,226 +979,95 @@ static void carp_send_na(struct carp_softc *sc) { + static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; struct ifaddr *ifa; struct in6_addr *in6; - static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { - + CARP_FOREACH_IFA(sc, ifa) { if 
(ifa->ifa_addr->sa_family != AF_INET6) continue; - in6 = &ifatoia6(ifa)->ia_addr.sin6_addr; + in6 = IFA_IN6(ifa); nd6_na_output(sc->sc_carpdev, &mcast, in6, ND_NA_FLAG_OVERRIDE, 1, NULL); DELAY(1000); /* XXX */ } } -#endif /* INET6 */ -#ifdef INET -static int -carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type) -{ - struct carp_softc *vh; - struct ifaddr *ifa; - int count = 0; - - CARP_LOCK_ASSERT(cif); - - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - if ((type == CARP_COUNT_RUNNING && - (SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) || - (type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) { - IF_ADDR_RLOCK(SC2IFP(vh)); - TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, - ifa_list) { - if (ifa->ifa_addr->sa_family == AF_INET && - ia->ia_addr.sin_addr.s_addr == - ifatoia(ifa)->ia_addr.sin_addr.s_addr) - count++; - } - IF_ADDR_RUNLOCK(SC2IFP(vh)); - } - } - return (count); -} - -int -carp_iamatch(struct ifnet *ifp, struct in_ifaddr *ia, - struct in_addr *isaddr, u_int8_t **enaddr) -{ - struct carp_if *cif; - struct carp_softc *vh; - int index, count = 0; - struct ifaddr *ifa; - - cif = ifp->if_carp; - CARP_LOCK(cif); - - if (carp_opts[CARPCTL_ARPBALANCE]) { - /* - * XXX proof of concept implementation. - * We use the source ip to decide which virtual host should - * handle the request. If we're master of that virtual host, - * then we respond, otherwise, just drop the arp packet on - * the floor. 
- */ - count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING); - if (count == 0) { - /* should never reach this */ - CARP_UNLOCK(cif); - return (0); - } - - /* this should be a hash, like pf_hash() */ - index = ntohl(isaddr->s_addr) % count; - count = 0; - - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - if ((SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) { - IF_ADDR_RLOCK(SC2IFP(vh)); - TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, - ifa_list) { - if (ifa->ifa_addr->sa_family == - AF_INET && - ia->ia_addr.sin_addr.s_addr == - ifatoia(ifa)->ia_addr.sin_addr.s_addr) { - if (count == index) { - if (vh->sc_state == - MASTER) { - *enaddr = IF_LLADDR(vh->sc_ifp); - IF_ADDR_RUNLOCK(SC2IFP(vh)); - CARP_UNLOCK(cif); - return (1); - } else { - IF_ADDR_RUNLOCK(SC2IFP(vh)); - CARP_UNLOCK(cif); - return (0); - } - } - count++; - } - } - IF_ADDR_RUNLOCK(SC2IFP(vh)); - } - } - } else { - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - if ((SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && - ia->ia_ifp == SC2IFP(vh) && - vh->sc_state == MASTER) { - *enaddr = IF_LLADDR(vh->sc_ifp); - CARP_UNLOCK(cif); - return (1); - } - } - } - CARP_UNLOCK(cif); - return (0); -} -#endif - -#ifdef INET6 struct ifaddr * carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) { - struct carp_if *cif; - struct carp_softc *vh; struct ifaddr *ifa; - cif = ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - IF_ADDR_RLOCK(SC2IFP(vh)); - TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) { - if (IN6_ARE_ADDR_EQUAL(taddr, - &ifatoia6(ifa)->ia_addr.sin6_addr) && - (SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && - vh->sc_state == MASTER) { - ifa_ref(ifa); - IF_ADDR_RUNLOCK(SC2IFP(vh)); - CARP_UNLOCK(cif); - return (ifa); - } + IF_ADDR_LOCK(ifp); + IFNET_FOREACH_IFA(ifp, ifa) + if (ifa->ifa_addr->sa_family == AF_INET6 && + ifa->ifa_carp->sc_state == MASTER && + 
IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { + ifa_ref(ifa); + IF_ADDR_UNLOCK(ifp); + return (ifa); } - IF_ADDR_RUNLOCK(SC2IFP(vh)); - } - CARP_UNLOCK(cif); - + IF_ADDR_UNLOCK(ifp); + return (NULL); } caddr_t carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { - struct m_tag *mtag; - struct carp_if *cif; - struct carp_softc *sc; struct ifaddr *ifa; - cif = ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) { - IF_ADDR_RLOCK(SC2IFP(sc)); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { - if (IN6_ARE_ADDR_EQUAL(taddr, - &ifatoia6(ifa)->ia_addr.sin6_addr) && - (SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING)) { - struct ifnet *ifp = SC2IFP(sc); - mtag = m_tag_get(PACKET_TAG_CARP, - sizeof(struct ifnet *), M_NOWAIT); - if (mtag == NULL) { - /* better a bit than nothing */ - IF_ADDR_RUNLOCK(SC2IFP(sc)); - CARP_UNLOCK(cif); - return (IF_LLADDR(sc->sc_ifp)); - } - bcopy(&ifp, (caddr_t)(mtag + 1), - sizeof(struct ifnet *)); - m_tag_prepend(m, mtag); + IF_ADDR_LOCK(ifp); + IFNET_FOREACH_IFA(ifp, ifa) + if (ifa->ifa_addr->sa_family == AF_INET6 && + IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { + struct carp_softc *sc = ifa->ifa_carp; + struct m_tag *mtag; - IF_ADDR_RUNLOCK(SC2IFP(sc)); - CARP_UNLOCK(cif); - return (IF_LLADDR(sc->sc_ifp)); - } + IF_ADDR_UNLOCK(ifp); + + mtag = m_tag_get(PACKET_TAG_CARP, + sizeof(struct ifnet *), M_NOWAIT); + if (mtag == NULL) + /* Better a bit than nothing. 
*/ + return (LLADDR(&sc->sc_addr)); + + bcopy(&ifp, (caddr_t)(mtag + 1), + sizeof(struct ifnet *)); + m_tag_prepend(m, mtag); + + return (LLADDR(&sc->sc_addr)); } - IF_ADDR_RUNLOCK(SC2IFP(sc)); - } - CARP_UNLOCK(cif); + IF_ADDR_UNLOCK(ifp); return (NULL); } -#endif +#endif /* INET6 */ -struct ifnet * +int carp_forus(struct ifnet *ifp, u_char *dhost) { - struct carp_if *cif; - struct carp_softc *vh; - u_int8_t *ena = dhost; + struct carp_softc *sc; + uint8_t *ena = dhost; if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) - return (NULL); + return (0); - cif = ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) - if ((SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && - vh->sc_state == MASTER && - !bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) { - CARP_UNLOCK(cif); - return (SC2IFP(vh)); + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) { + CARP_LOCK(sc); + if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr), + ETHER_ADDR_LEN)) { + CARP_UNLOCK(sc); + CIF_UNLOCK(ifp->if_carp); + return (1); } + CARP_UNLOCK(sc); + } + CIF_UNLOCK(ifp->if_carp); - CARP_UNLOCK(cif); - return (NULL); + return (0); } static void @@ -1363,24 +1075,25 @@ { struct carp_softc *sc = v; - CARP_SCLOCK(sc); - carp_master_down_locked(sc); - CARP_SCUNLOCK(sc); + CARP_LOCK_ASSERT(sc); + + if (sc->sc_state == BACKUP) { + CARP_LOG("VHID %u@%s: BACKUP -> MASTER (preempting)\n", + sc->sc_vhid, + sc->sc_carpdev->if_xname); + carp_master_down_locked(sc); + } + + CARP_UNLOCK(sc); } static void carp_master_down_locked(struct carp_softc *sc) { - if (sc->sc_carpdev) - CARP_SCLOCK_ASSERT(sc); + + CARP_LOCK_ASSERT(sc); switch (sc->sc_state) { - case INIT: - printf("%s: master_down event in INIT state\n", - SC2IFP(sc)->if_xname); - break; - case MASTER: - break; case BACKUP: carp_set_state(sc, MASTER); carp_send_ad_locked(sc); @@ -1389,9 +1102,18 @@ #endif #ifdef INET6 carp_send_na(sc); -#endif /* INET6 */ +#endif 
carp_setrun(sc, 0); - carp_setroute(sc, RTM_ADD); + carp_addroute(sc); + break; + case INIT: + case MASTER: +#ifdef INVARIANTS + panic("carp: VHID %u@%s: master_down event in %s state\n", + sc->sc_vhid, + sc->sc_carpdev->if_xname, + sc->sc_state ? "MASTER" : "INIT"); +#endif break; } } @@ -1405,28 +1127,19 @@ { struct timeval tv; - if (sc->sc_carpdev == NULL) { - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - carp_set_state(sc, INIT); + CARP_LOCK_ASSERT(sc); + + if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || + sc->sc_carpdev->if_link_state != LINK_STATE_UP || + (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)) return; - } else - CARP_SCLOCK_ASSERT(sc); - - if (SC2IFP(sc)->if_flags & IFF_UP && - sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6) && - sc->sc_carpdev->if_link_state == LINK_STATE_UP) - SC2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; - else { - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - carp_setroute(sc, RTM_DELETE); - return; - } switch (sc->sc_state) { case INIT: - CARP_LOG("%s: INIT -> BACKUP\n", SC2IFP(sc)->if_xname); + CARP_LOG("VHID %u@%s: INIT -> BACKUP\n", + sc->sc_vhid, + sc->sc_carpdev->if_xname); carp_set_state(sc, BACKUP); - carp_setroute(sc, RTM_DELETE); carp_setrun(sc, 0); break; case BACKUP: @@ -1439,20 +1152,24 @@ callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); break; -#endif /* INET */ +#endif #ifdef INET6 case AF_INET6: callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); break; -#endif /* INET6 */ +#endif default: +#ifdef INET if (sc->sc_naddrs) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); +#endif +#ifdef INET6 if (sc->sc_naddrs6) callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); +#endif break; } break; @@ -1465,691 +1182,154 @@ } } +/* + * Setup multicast structures. 
+ */ +static int +carp_multicast_setup(struct carp_softc *sc, sa_family_t sa) +{ + struct ifnet *ifp = sc->sc_carpdev; + struct carp_if *cif = ifp->if_carp; + int error = 0; + + switch (sa) { #ifdef INET -static void -carp_multicast_cleanup(struct carp_softc *sc, int dofree) -{ - struct ip_moptions *imo = &sc->sc_imo; - u_int16_t n = imo->imo_num_memberships; + case AF_INET: + { + struct ip_moptions *imo = &cif->cif_imo; + struct in_addr addr; - /* Clean up our own multicast memberships */ - while (n-- > 0) { - if (imo->imo_membership[n] != NULL) { - if (dofree) - in_delmulti(imo->imo_membership[n]); - imo->imo_membership[n] = NULL; - } - } - KASSERT(imo->imo_mfilters == NULL, - ("%s: imo_mfilters != NULL", __func__)); - imo->imo_num_memberships = 0; - imo->imo_multicast_ifp = NULL; -} -#endif + if (imo->imo_membership) + return (0); -#ifdef INET6 -static void -carp_multicast6_cleanup(struct carp_softc *sc, int dofree) -{ - struct ip6_moptions *im6o = &sc->sc_im6o; - u_int16_t n = im6o->im6o_num_memberships; + imo->imo_membership = (struct in_multi **)malloc( + (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, + M_WAITOK); + imo->imo_mfilters = NULL; + imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; + imo->imo_multicast_vif = -1; - while (n-- > 0) { - if (im6o->im6o_membership[n] != NULL) { - if (dofree) - in6_mc_leave(im6o->im6o_membership[n], NULL); - im6o->im6o_membership[n] = NULL; - } - } - KASSERT(im6o->im6o_mfilters == NULL, - ("%s: im6o_mfilters != NULL", __func__)); - im6o->im6o_num_memberships = 0; - im6o->im6o_multicast_ifp = NULL; -} -#endif - -#ifdef INET -static int -carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin) -{ - struct ifnet *ifp; - struct carp_if *cif; - struct in_ifaddr *ia, *ia_if; - struct ip_moptions *imo = &sc->sc_imo; - struct in_addr addr; - u_long iaddr = htonl(sin->sin_addr.s_addr); - int own, error; - - if (sin->sin_addr.s_addr == 0) { - if (!(SC2IFP(sc)->if_flags & IFF_UP)) - carp_set_state(sc, INIT); - if 
(sc->sc_naddrs) - SC2IFP(sc)->if_flags |= IFF_UP; - if (sc->sc_carpdev) - CARP_SCLOCK(sc); - carp_setrun(sc, 0); - if (sc->sc_carpdev) - CARP_SCUNLOCK(sc); - return (0); - } - - /* we have to do it by hands to check we won't match on us */ - ia_if = NULL; own = 0; - IN_IFADDR_RLOCK(); - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - /* and, yeah, we need a multicast-capable iface too */ - if (ia->ia_ifp != SC2IFP(sc) && - (ia->ia_ifp->if_flags & IFF_MULTICAST) && - (iaddr & ia->ia_subnetmask) == ia->ia_subnet) { - if (!ia_if) - ia_if = ia; - if (sin->sin_addr.s_addr == - ia->ia_addr.sin_addr.s_addr) - own++; - } - } - - if (!ia_if) { - IN_IFADDR_RUNLOCK(); - return (EADDRNOTAVAIL); - } - - ia = ia_if; - ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); - - ifp = ia->ia_ifp; - - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || - (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp)) { - ifa_free(&ia->ia_ifa); - return (EADDRNOTAVAIL); - } - - if (imo->imo_num_memberships == 0) { addr.s_addr = htonl(INADDR_CARP_GROUP); - if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) == - NULL) { - ifa_free(&ia->ia_ifa); - return (ENOBUFS); + if ((error = in_joingroup(ifp, &addr, NULL, + &imo->imo_membership[0])) != 0) { + free(imo->imo_membership, M_CARP); + break; } imo->imo_num_memberships++; imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; - } - - if (!ifp->if_carp) { - - cif = malloc(sizeof(*cif), M_CARP, - M_WAITOK|M_ZERO); - if (!cif) { - error = ENOBUFS; - goto cleanup; - } - if ((error = ifpromisc(ifp, 1))) { - free(cif, M_CARP); - goto cleanup; - } - - CARP_LOCK_INIT(cif); - CARP_LOCK(cif); - cif->vhif_ifp = ifp; - TAILQ_INIT(&cif->vhif_vrs); - ifp->if_carp = cif; - - } else { - struct carp_softc *vr; - - cif = (struct carp_if *)ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) - if (vr != sc && vr->sc_vhid == sc->sc_vhid) { - CARP_UNLOCK(cif); - error = EEXIST; - goto cleanup; - } 
- } - sc->sc_ia = ia; - sc->sc_carpdev = ifp; - - { /* XXX prevent endless loop if already in queue */ - struct carp_softc *vr, *after = NULL; - int myself = 0; - cif = (struct carp_if *)ifp->if_carp; - - /* XXX: cif should not change, right? So we still hold the lock */ - CARP_LOCK_ASSERT(cif); - - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { - if (vr == sc) - myself = 1; - if (vr->sc_vhid < sc->sc_vhid) - after = vr; - } - - if (!myself) { - /* We're trying to keep things in order */ - if (after == NULL) { - TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); - } else { - TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); - } - cif->vhif_nvrs++; - } - } - - sc->sc_naddrs++; - SC2IFP(sc)->if_flags |= IFF_UP; - if (own) - sc->sc_advskew = 0; - carp_sc_state_locked(sc); - carp_setrun(sc, 0); - - CARP_UNLOCK(cif); - ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. */ - - return (0); - -cleanup: - in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); - ifa_free(&ia->ia_ifa); - return (error); -} - -static int -carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin) -{ - int error = 0; - - if (!--sc->sc_naddrs) { - struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp; - struct ip_moptions *imo = &sc->sc_imo; - - CARP_LOCK(cif); - callout_stop(&sc->sc_ad_tmo); - SC2IFP(sc)->if_flags &= ~IFF_UP; - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - sc->sc_vhid = -1; - in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); - imo->imo_multicast_ifp = NULL; - TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); - if (!--cif->vhif_nvrs) { - sc->sc_carpdev->if_carp = NULL; - CARP_LOCK_DESTROY(cif); - free(cif, M_CARP); - } else { - CARP_UNLOCK(cif); - } - } - - return (error); -} + break; + } #endif - #ifdef INET6 -static int -carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) -{ - struct ifnet *ifp; - struct carp_if *cif; - struct in6_ifaddr *ia, *ia_if; - struct ip6_moptions *im6o = &sc->sc_im6o; - struct in6_addr in6; - int 
own, error; - - error = 0; - - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - if (!(SC2IFP(sc)->if_flags & IFF_UP)) - carp_set_state(sc, INIT); - if (sc->sc_naddrs6) - SC2IFP(sc)->if_flags |= IFF_UP; - if (sc->sc_carpdev) - CARP_SCLOCK(sc); - carp_setrun(sc, 0); - if (sc->sc_carpdev) - CARP_SCUNLOCK(sc); - return (0); - } - - /* we have to do it by hands to check we won't match on us */ - ia_if = NULL; own = 0; - IN6_IFADDR_RLOCK(); - TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { - int i; - - for (i = 0; i < 4; i++) { - if ((sin6->sin6_addr.s6_addr32[i] & - ia->ia_prefixmask.sin6_addr.s6_addr32[i]) != - (ia->ia_addr.sin6_addr.s6_addr32[i] & - ia->ia_prefixmask.sin6_addr.s6_addr32[i])) - break; - } - /* and, yeah, we need a multicast-capable iface too */ - if (ia->ia_ifp != SC2IFP(sc) && - (ia->ia_ifp->if_flags & IFF_MULTICAST) && - (i == 4)) { - if (!ia_if) - ia_if = ia; - if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, - &ia->ia_addr.sin6_addr)) - own++; - } - } - - if (!ia_if) { - IN6_IFADDR_RUNLOCK(); - return (EADDRNOTAVAIL); - } - ia = ia_if; - ifa_ref(&ia->ia_ifa); - IN6_IFADDR_RUNLOCK(); - ifp = ia->ia_ifp; - - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || - (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp)) { - ifa_free(&ia->ia_ifa); - return (EADDRNOTAVAIL); - } - - if (!sc->sc_naddrs6) { + case AF_INET6: + { + struct ip6_moptions *im6o = &cif->cif_im6o; + struct in6_addr in6; struct in6_multi *in6m; + if (im6o->im6o_membership) + return (0); + + im6o->im6o_membership = (struct in6_multi **)malloc( + (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, + M_ZERO|M_WAITOK); + im6o->im6o_mfilters = NULL; + im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; + im6o->im6o_multicast_hlim = CARP_DFLTTL; im6o->im6o_multicast_ifp = ifp; - /* join CARP multicast address */ + /* Join IPv6 CARP multicast group. 
*/ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr8[15] = 0x12; - if (in6_setscope(&in6, ifp, NULL) != 0) - goto cleanup; + if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { + free(im6o->im6o_membership, M_CARP); + break; + } in6m = NULL; - error = in6_mc_join(ifp, &in6, NULL, &in6m, 0); - if (error) - goto cleanup; + if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { + free(im6o->im6o_membership, M_CARP); + break; + } im6o->im6o_membership[0] = in6m; im6o->im6o_num_memberships++; - /* join solicited multicast address */ + /* Join solicited multicast address. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr32[1] = 0; in6.s6_addr32[2] = htonl(1); - in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3]; + in6.s6_addr32[3] = 0; in6.s6_addr8[12] = 0xff; - if (in6_setscope(&in6, ifp, NULL) != 0) - goto cleanup; + if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { + in6_mc_leave(im6o->im6o_membership[0], NULL); + free(im6o->im6o_membership, M_CARP); + break; + } in6m = NULL; - error = in6_mc_join(ifp, &in6, NULL, &in6m, 0); - if (error) - goto cleanup; + if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { + in6_mc_leave(im6o->im6o_membership[0], NULL); + free(im6o->im6o_membership, M_CARP); + break; + } im6o->im6o_membership[1] = in6m; im6o->im6o_num_memberships++; + break; + } +#endif } - if (!ifp->if_carp) { - cif = malloc(sizeof(*cif), M_CARP, - M_WAITOK|M_ZERO); - if (!cif) { - error = ENOBUFS; - goto cleanup; - } - if ((error = ifpromisc(ifp, 1))) { - free(cif, M_CARP); - goto cleanup; - } - - CARP_LOCK_INIT(cif); - CARP_LOCK(cif); - cif->vhif_ifp = ifp; - TAILQ_INIT(&cif->vhif_vrs); - ifp->if_carp = cif; - - } else { - struct carp_softc *vr; - - cif = (struct carp_if *)ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) - if (vr != sc && vr->sc_vhid == sc->sc_vhid) { - CARP_UNLOCK(cif); - error = EINVAL; - goto cleanup; - } - } - sc->sc_ia6 = ia; - sc->sc_carpdev = ifp; - - { 
/* XXX prevent endless loop if already in queue */ - struct carp_softc *vr, *after = NULL; - int myself = 0; - cif = (struct carp_if *)ifp->if_carp; - CARP_LOCK_ASSERT(cif); - - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { - if (vr == sc) - myself = 1; - if (vr->sc_vhid < sc->sc_vhid) - after = vr; - } - - if (!myself) { - /* We're trying to keep things in order */ - if (after == NULL) { - TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); - } else { - TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); - } - cif->vhif_nvrs++; - } - } - - sc->sc_naddrs6++; - SC2IFP(sc)->if_flags |= IFF_UP; - if (own) - sc->sc_advskew = 0; - carp_sc_state_locked(sc); - carp_setrun(sc, 0); - - CARP_UNLOCK(cif); - ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. */ - - return (0); - -cleanup: - if (!sc->sc_naddrs6) - carp_multicast6_cleanup(sc, 1); - ifa_free(&ia->ia_ifa); - return (error); -} - -static int -carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) -{ - int error = 0; - - if (!--sc->sc_naddrs6) { - struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp; - - CARP_LOCK(cif); - callout_stop(&sc->sc_ad_tmo); - SC2IFP(sc)->if_flags &= ~IFF_UP; - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - sc->sc_vhid = -1; - carp_multicast6_cleanup(sc, 1); - TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); - if (!--cif->vhif_nvrs) { - CARP_LOCK_DESTROY(cif); - sc->sc_carpdev->if_carp = NULL; - free(cif, M_CARP); - } else - CARP_UNLOCK(cif); - } - - return (error); -} -#endif /* INET6 */ - -static int -carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr) -{ - struct carp_softc *sc = ifp->if_softc, *vr; - struct carpreq carpr; - struct ifaddr *ifa; - struct ifreq *ifr; - struct ifaliasreq *ifra; - int locked = 0, error = 0; - - ifa = (struct ifaddr *)addr; - ifra = (struct ifaliasreq *)addr; - ifr = (struct ifreq *)addr; - - switch (cmd) { - case SIOCSIFADDR: - switch (ifa->ifa_addr->sa_family) { -#ifdef INET - case AF_INET: - SC2IFP(sc)->if_flags |= 
IFF_UP; - bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, - sizeof(struct sockaddr)); - error = carp_set_addr(sc, satosin(ifa->ifa_addr)); - break; -#endif /* INET */ -#ifdef INET6 - case AF_INET6: - SC2IFP(sc)->if_flags |= IFF_UP; - error = carp_set_addr6(sc, satosin6(ifa->ifa_addr)); - break; -#endif /* INET6 */ - default: - error = EAFNOSUPPORT; - break; - } - break; - - case SIOCAIFADDR: - switch (ifa->ifa_addr->sa_family) { -#ifdef INET - case AF_INET: - SC2IFP(sc)->if_flags |= IFF_UP; - bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, - sizeof(struct sockaddr)); - error = carp_set_addr(sc, satosin(&ifra->ifra_addr)); - break; -#endif /* INET */ -#ifdef INET6 - case AF_INET6: - SC2IFP(sc)->if_flags |= IFF_UP; - error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr)); - break; -#endif /* INET6 */ - default: - error = EAFNOSUPPORT; - break; - } - break; - - case SIOCDIFADDR: - switch (ifa->ifa_addr->sa_family) { -#ifdef INET - case AF_INET: - error = carp_del_addr(sc, satosin(&ifra->ifra_addr)); - break; -#endif /* INET */ -#ifdef INET6 - case AF_INET6: - error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr)); - break; -#endif /* INET6 */ - default: - error = EAFNOSUPPORT; - break; - } - break; - - case SIOCSIFFLAGS: - if (sc->sc_carpdev) { - locked = 1; - CARP_SCLOCK(sc); - } - if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) { - callout_stop(&sc->sc_ad_tmo); - callout_stop(&sc->sc_md_tmo); - callout_stop(&sc->sc_md6_tmo); - if (sc->sc_state == MASTER) - carp_send_ad_locked(sc); - carp_set_state(sc, INIT); - carp_setrun(sc, 0); - } else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) { - SC2IFP(sc)->if_flags |= IFF_UP; - carp_setrun(sc, 0); - } - break; - - case SIOCSVH: - error = priv_check(curthread, PRIV_NETINET_CARP); - if (error) - break; - if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr))) - break; - error = 1; - if (sc->sc_carpdev) { - locked = 1; - CARP_SCLOCK(sc); - } - if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) { - switch 
(carpr.carpr_state) { - case BACKUP: - callout_stop(&sc->sc_ad_tmo); - carp_set_state(sc, BACKUP); - carp_setrun(sc, 0); - carp_setroute(sc, RTM_DELETE); - break; - case MASTER: - carp_master_down_locked(sc); - break; - default: - break; - } - } - if (carpr.carpr_vhid > 0) { - if (carpr.carpr_vhid > 255) { - error = EINVAL; - break; - } - if (sc->sc_carpdev) { - struct carp_if *cif; - cif = (struct carp_if *)sc->sc_carpdev->if_carp; - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) - if (vr != sc && - vr->sc_vhid == carpr.carpr_vhid) { - error = EEXIST; - break; - } - if (error == EEXIST) - break; - } - sc->sc_vhid = carpr.carpr_vhid; - IF_LLADDR(sc->sc_ifp)[0] = 0; - IF_LLADDR(sc->sc_ifp)[1] = 0; - IF_LLADDR(sc->sc_ifp)[2] = 0x5e; - IF_LLADDR(sc->sc_ifp)[3] = 0; - IF_LLADDR(sc->sc_ifp)[4] = 1; - IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid; - error--; - } - if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) { - if (carpr.carpr_advskew >= 255) { - error = EINVAL; - break; - } - if (carpr.carpr_advbase > 255) { - error = EINVAL; - break; - } - sc->sc_advbase = carpr.carpr_advbase; - sc->sc_advskew = carpr.carpr_advskew; - error--; - } - bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); - if (error > 0) - error = EINVAL; - else { - error = 0; - carp_setrun(sc, 0); - } - break; - - case SIOCGVH: - /* XXX: lockless read */ - bzero(&carpr, sizeof(carpr)); - carpr.carpr_state = sc->sc_state; - carpr.carpr_vhid = sc->sc_vhid; - carpr.carpr_advbase = sc->sc_advbase; - carpr.carpr_advskew = sc->sc_advskew; - error = priv_check(curthread, PRIV_NETINET_CARP); - if (error == 0) - bcopy(sc->sc_key, carpr.carpr_key, - sizeof(carpr.carpr_key)); - error = copyout(&carpr, ifr->ifr_data, sizeof(carpr)); - break; - - default: - error = EINVAL; - } - - if (locked) - CARP_SCUNLOCK(sc); - - carp_hmac_prepare(sc); - return (error); } /* - * XXX: this is looutput. We should eventually use it from there. 
- */ -static int -carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, - struct route *ro) -{ - u_int32_t af; - struct rtentry *rt = NULL; - - M_ASSERTPKTHDR(m); /* check if we have the packet header */ - - if (ro != NULL) - rt = ro->ro_rt; - if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - m_freem(m); - return (rt->rt_flags & RTF_BLACKHOLE ? 0 : - rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); - } - - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; - - /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) { - bcopy(dst->sa_data, &af, sizeof(af)); - dst->sa_family = af; - } - -#if 1 /* XXX */ - switch (dst->sa_family) { - case AF_INET: - case AF_INET6: - case AF_IPX: - case AF_APPLETALK: - break; - default: - printf("carp_looutput: af=%d unexpected\n", dst->sa_family); - m_freem(m); - return (EAFNOSUPPORT); - } -#endif - return(if_simloop(ifp, m, dst->sa_family, 0)); -} - -/* - * Start output on carp interface. This function should never be called. + * Free multicast structures. 
*/ static void -carp_start(struct ifnet *ifp) +carp_multicast_cleanup(struct carp_softc *sc, sa_family_t sa) { -#ifdef DEBUG - printf("%s: start called\n", ifp->if_xname); + struct ifnet *ifp = sc->sc_carpdev; + struct carp_if *cif = ifp->if_carp; + + switch (sa) { +#ifdef INET + case AF_INET: + if (sc->sc_naddrs == 0) { + struct ip_moptions *imo = &cif->cif_imo; + + in_leavegroup(imo->imo_membership[0], NULL); + KASSERT(imo->imo_mfilters == NULL, + ("%s: imo_mfilters != NULL", __func__)); + free(imo->imo_membership, M_CARP); + imo->imo_membership = NULL; + + } + break; #endif +#ifdef INET6 + case AF_INET6: + if (sc->sc_naddrs6 == 0) { + struct ip6_moptions *im6o = &cif->cif_im6o; + + in6_mc_leave(im6o->im6o_membership[0], NULL); + in6_mc_leave(im6o->im6o_membership[1], NULL); + KASSERT(im6o->im6o_mfilters == NULL, + ("%s: im6o_mfilters != NULL", __func__)); + free(im6o->im6o_membership, M_CARP); + im6o->im6o_membership = NULL; + } + break; +#endif + } } int -carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, - struct rtentry *rt) +carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa) { struct m_tag *mtag; struct carp_softc *sc; - struct ifnet *carp_ifp; if (!sa) return (0); @@ -2158,11 +1338,11 @@ #ifdef INET case AF_INET: break; -#endif /* INET */ +#endif #ifdef INET6 case AF_INET6: break; -#endif /* INET6 */ +#endif default: return (0); } @@ -2171,10 +1351,9 @@ if (mtag == NULL) return (0); - bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *)); - sc = carp_ifp->if_softc; + bcopy(mtag + 1, &sc, sizeof(struct carp_softc *)); - /* Set the source MAC address to Virtual Router MAC Address */ + /* Set the source MAC address to the Virtual Router MAC Address. 
*/ switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: { @@ -2213,95 +1392,543 @@ } break; default: - printf("%s: carp is not supported for this interface type\n", - ifp->if_xname); + printf("%s: carp is not supported for the %d interface type\n", + ifp->if_xname, ifp->if_type); return (EOPNOTSUPP); } return (0); } +static struct carp_softc* +carp_alloc(struct ifnet *ifp) +{ + struct carp_softc *sc; + struct carp_if *cif; + + if ((cif = ifp->if_carp) == NULL) { + cif = carp_alloc_if(ifp); + if (cif == NULL) + return (NULL); + } + + sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); + + sc->sc_advbase = CARP_DFLTINTV; + sc->sc_vhid = -1; /* required setting */ + sc->sc_init_counter = 1; + sc->sc_state = INIT; + + sc->sc_ifasiz = sizeof(struct ifaddr *); + sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); + sc->sc_carpdev = ifp; + + CARP_LOCK_INIT(sc); +#ifdef INET + callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); +#endif +#ifdef INET6 + callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); +#endif + callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); + + CIF_LOCK(cif); + TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); + CIF_UNLOCK(cif); + + mtx_lock(&carp_mtx); + LIST_INSERT_HEAD(&carp_list, sc, sc_next); + mtx_unlock(&carp_mtx); + + return (sc); +} + +static int +carp_grow_ifas(struct carp_softc *sc) +{ + struct ifaddr **new; + + CARP_LOCK_ASSERT(sc); + + new = malloc(sc->sc_ifasiz * 2, M_CARP, M_NOWAIT|M_ZERO); + if (new == NULL) + return (ENOMEM); + bcopy(sc->sc_ifas, new, sc->sc_ifasiz); + free(sc->sc_ifas, M_CARP); + sc->sc_ifas = new; + sc->sc_ifasiz *= 2; + + return (0); +} + +static void +carp_destroy(struct carp_softc *sc) +{ + struct ifnet *ifp = sc->sc_carpdev; + struct carp_if *cif = ifp->if_carp; + + CIF_LOCK(cif); + TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list); + if (TAILQ_EMPTY(&cif->cif_vrs)) + carp_free_if(cif); + else + CIF_UNLOCK(cif); + + mtx_lock(&carp_mtx); + 
LIST_REMOVE(sc, sc_next); + mtx_unlock(&carp_mtx); + + CARP_LOCK(sc); + callout_drain(&sc->sc_ad_tmo); +#ifdef INET + callout_drain(&sc->sc_md_tmo); +#endif +#ifdef INET6 + callout_drain(&sc->sc_md6_tmo); +#endif + CARP_LOCK_DESTROY(sc); + + free(sc->sc_ifas, M_CARP); + free(sc, M_CARP); +} + +static struct carp_if* +carp_alloc_if(struct ifnet *ifp) +{ + struct carp_if *cif; + + cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); + + if (ifpromisc(ifp, 1) != 0) + goto cleanup; + + CIF_LOCK_INIT(cif); + cif->cif_ifp = ifp; + TAILQ_INIT(&cif->cif_vrs); + + IF_ADDR_LOCK(ifp); + ifp->if_carp = cif; + if_ref(ifp); + IF_ADDR_UNLOCK(ifp); + + return (cif); + +cleanup: + free(cif, M_CARP); + + return (NULL); +} + +static void +carp_free_if(struct carp_if *cif) +{ + struct ifnet *ifp = cif->cif_ifp; + + CIF_LOCK_ASSERT(cif); + KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty", + __func__)); + + IF_ADDR_LOCK(ifp); + ifp->if_carp = NULL; + if_rele(ifp); + IF_ADDR_UNLOCK(ifp); + + CIF_LOCK_DESTROY(cif); + + ifpromisc(ifp, 0); + + free(cif, M_CARP); +} + +static void +carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv) +{ + + CARP_LOCK(sc); + carpr->carpr_state = sc->sc_state; + carpr->carpr_vhid = sc->sc_vhid; + carpr->carpr_advbase = sc->sc_advbase; + carpr->carpr_advskew = sc->sc_advskew; + if (priv) + bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key)); + else + bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); + CARP_UNLOCK(sc); +} + +int +carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) +{ + struct carpreq carpr; + struct ifnet *ifp; + struct carp_softc *sc = NULL; + int error = 0, locked = 0; + + if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr))) + return (error); + + ifp = ifunit_ref(ifr->ifr_name); + if (ifp == NULL) + return (ENXIO); + + switch (ifp->if_type) { + case IFT_ETHER: + case IFT_L2VLAN: + case IFT_FDDI: + case IFT_ISO88025: + break; + default: + error = EOPNOTSUPP; + goto out; + } + + if 
((ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + goto out; + } + + switch (cmd) { + case SIOCSVH: + if ((error = priv_check(td, PRIV_NETINET_CARP))) + break; + if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID || + carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) { + error = EINVAL; + break; + } + + if (ifp->if_carp) { + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + if (sc->sc_vhid == carpr.carpr_vhid) + break; + CIF_UNLOCK(ifp->if_carp); + } + if (sc == NULL) { + sc = carp_alloc(ifp); + if (sc == NULL) { + error = EINVAL; /* XXX: ifpromisc failed */ + break; + } + + CARP_LOCK(sc); + sc->sc_vhid = carpr.carpr_vhid; + LLADDR(&sc->sc_addr)[0] = 0; + LLADDR(&sc->sc_addr)[1] = 0; + LLADDR(&sc->sc_addr)[2] = 0x5e; + LLADDR(&sc->sc_addr)[3] = 0; + LLADDR(&sc->sc_addr)[4] = 1; + LLADDR(&sc->sc_addr)[5] = sc->sc_vhid; + } else + CARP_LOCK(sc); + locked = 1; + if (carpr.carpr_advbase > 0) { + if (carpr.carpr_advbase > 255 || + carpr.carpr_advbase < CARP_DFLTINTV) { + error = EINVAL; + break; + } + sc->sc_advbase = carpr.carpr_advbase; + } + if (carpr.carpr_advskew > 0) { + if (carpr.carpr_advskew >= 255) { + error = EINVAL; + break; + } + sc->sc_advskew = carpr.carpr_advskew; + } + if (carpr.carpr_key[0] != '\0') { + bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); + carp_hmac_prepare(sc); + } + if (sc->sc_state != INIT && + carpr.carpr_state != sc->sc_state) { + switch (carpr.carpr_state) { + case BACKUP: + callout_stop(&sc->sc_ad_tmo); + carp_set_state(sc, BACKUP); + carp_setrun(sc, 0); + carp_delroute(sc); + break; + case MASTER: + carp_master_down_locked(sc); + break; + default: + break; + } + } + break; + + case SIOCGVH: + { + int priveleged; + + if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) { + error = EINVAL; + break; + } + if (carpr.carpr_count < 1) { + error = EMSGSIZE; + break; + } + if (ifp->if_carp == NULL) { + error = ENOENT; + break; + } + + priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0); 
+ if (carpr.carpr_vhid != 0) { + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + if (sc->sc_vhid == carpr.carpr_vhid) + break; + CIF_UNLOCK(ifp->if_carp); + if (sc == NULL) { + error = ENOENT; + break; + } + carp_carprcp(&carpr, sc, priveleged); + error = copyout(&carpr, ifr->ifr_data, sizeof(carpr)); + } else { + int i, count; + + count = 0; + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + count++; + + if (count > carpr.carpr_count) { + CIF_UNLOCK(ifp->if_carp); + error = EMSGSIZE; + break; + } + + i = 0; + IFNET_FOREACH_CARP(ifp, sc) { + carp_carprcp(&carpr, sc, priveleged); + carpr.carpr_count = count; + error = copyout(&carpr, ifr->ifr_data + + (i * sizeof(carpr)), sizeof(carpr)); + if (error) { + CIF_UNLOCK(ifp->if_carp); + break; + } + i++; + } + CIF_UNLOCK(ifp->if_carp); + } + break; + } + default: + error = EINVAL; + } + +out: + if (locked) + CARP_UNLOCK(sc); + if_rele(ifp); + + return (error); +} + +static int +carp_get_vhid(struct ifaddr *ifa) +{ + + if (ifa == NULL || ifa->ifa_carp == NULL) + return (0); + + return (ifa->ifa_carp->sc_vhid); +} + +int +carp_attach(struct ifaddr *ifa, int vhid) +{ + struct ifnet *ifp = ifa->ifa_ifp; + struct carp_softc *sc; + int index, error; + + if (ifp->if_carp == NULL) + return (ENOPROTOOPT); + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: +#endif +#ifdef INET6 + case AF_INET6: +#endif + break; + default: + return (EPROTOTYPE); + } + + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + if (sc->sc_vhid == vhid) + break; + CIF_UNLOCK(ifp->if_carp); + if (sc == NULL) + return (ENOENT); + + if (ifa->ifa_carp) { + if (ifa->ifa_carp->sc_vhid != vhid) + carp_detach(ifa); + else + return (0); + } + + error = carp_multicast_setup(sc, ifa->ifa_addr->sa_family); + if (error) + return (error); + + CARP_LOCK(sc); + index = sc->sc_naddrs + sc->sc_naddrs6 + 1; + if (index > sc->sc_ifasiz / sizeof(struct ifaddr *)) + if ((error = carp_grow_ifas(sc)) != 0) { + carp_multicast_cleanup(sc, + 
ifa->ifa_addr->sa_family); + CARP_UNLOCK(sc); + return (error); + } + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + sc->sc_naddrs++; + break; +#endif +#ifdef INET6 + case AF_INET6: + sc->sc_naddrs6++; + break; +#endif + } + + ifa_ref(ifa); + sc->sc_ifas[index - 1] = ifa; + ifa->ifa_carp = sc; + + carp_hmac_prepare(sc); + carp_sc_state(sc); + + CARP_UNLOCK(sc); + + return (0); +} + +void +carp_detach(struct ifaddr *ifa) +{ + struct carp_softc *sc = ifa->ifa_carp; + int i, index; + + KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa)); + + CARP_LOCK(sc); + + /* Shift array. */ + index = sc->sc_naddrs + sc->sc_naddrs6; + for (i = 0; i < index; i++) + if (sc->sc_ifas[i] == ifa) + break; + KASSERT(i < index, ("%s: %p no backref", __func__, ifa)); + for (; i < index - 1; i++) + sc->sc_ifas[i] = sc->sc_ifas[i+1]; + sc->sc_ifas[index - 1] = NULL; + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + sc->sc_naddrs--; + break; +#endif +#ifdef INET6 + case AF_INET6: + sc->sc_naddrs6--; + break; +#endif + } + + carp_multicast_cleanup(sc, ifa->ifa_addr->sa_family); + + ifa->ifa_carp = NULL; + ifa_free(ifa); + + carp_hmac_prepare(sc); + carp_sc_state(sc); + + if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) { + CARP_UNLOCK(sc); + carp_destroy(sc); + } else + CARP_UNLOCK(sc); +} + static void carp_set_state(struct carp_softc *sc, int state) { - int link_state; - if (sc->sc_carpdev) - CARP_SCLOCK_ASSERT(sc); + CARP_LOCK_ASSERT(sc); - if (sc->sc_state == state) - return; + if (sc->sc_state != state) { + const char *carp_states[] = { CARP_STATES }; + char subsys[IFNAMSIZ+5]; - sc->sc_state = state; - switch (state) { - case BACKUP: - link_state = LINK_STATE_DOWN; - break; - case MASTER: - link_state = LINK_STATE_UP; - break; - default: - link_state = LINK_STATE_UNKNOWN; - break; + sc->sc_state = state; + + snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid, + sc->sc_carpdev->if_xname); + devctl_notify("CARP", subsys, carp_states[state], 
NULL); } - if_link_state_change(SC2IFP(sc), link_state); -} - -void -carp_carpdev_state(struct ifnet *ifp) -{ - struct carp_if *cif; - - cif = ifp->if_carp; - CARP_LOCK(cif); - carp_carpdev_state_locked(cif); - CARP_UNLOCK(cif); } static void -carp_carpdev_state_locked(struct carp_if *cif) +carp_linkstate(struct ifnet *ifp) { struct carp_softc *sc; - TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) - carp_sc_state_locked(sc); + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) { + CARP_LOCK(sc); + carp_sc_state(sc); + CARP_UNLOCK(sc); + } + CIF_UNLOCK(ifp->if_carp); } static void -carp_sc_state_locked(struct carp_softc *sc) +carp_sc_state(struct carp_softc *sc) { - CARP_SCLOCK_ASSERT(sc); + + CARP_LOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || !(sc->sc_carpdev->if_flags & IFF_UP)) { - sc->sc_flags_backup = SC2IFP(sc)->if_flags; - SC2IFP(sc)->if_flags &= ~IFF_UP; - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&sc->sc_ad_tmo); +#ifdef INET callout_stop(&sc->sc_md_tmo); +#endif +#ifdef INET6 callout_stop(&sc->sc_md6_tmo); +#endif carp_set_state(sc, INIT); carp_setrun(sc, 0); if (!sc->sc_suppress) { carp_suppress_preempt++; - if (carp_suppress_preempt == 1) { - CARP_SCUNLOCK(sc); - carp_send_ad_all(); - CARP_SCLOCK(sc); - } + if (carp_suppress_preempt == 1) + carp_send_ad_all(sc); } sc->sc_suppress = 1; } else { - SC2IFP(sc)->if_flags |= sc->sc_flags_backup; carp_set_state(sc, INIT); carp_setrun(sc, 0); if (sc->sc_suppress) carp_suppress_preempt--; sc->sc_suppress = 0; } +} - return; -} #ifdef INET extern struct domain inetdomain; @@ -2335,10 +1962,6 @@ carp_mod_cleanup(void) { - if (if_detach_event_tag == NULL) - return; - EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); - if_clone_detach(&carp_cloner); #ifdef INET if (proto_reg[CARP_INET] == 0) { (void)ipproto_unregister(IPPROTO_CARP); @@ -2356,6 +1979,10 @@ carp_iamatch6_p = NULL; carp_macmatch6_p = NULL; #endif + carp_ioctl_p = NULL; + carp_attach_p = NULL; + 
carp_detach_p = NULL; + carp_get_vhid_p = NULL; carp_linkstate_p = NULL; carp_forus_p = NULL; carp_output_p = NULL; @@ -2367,22 +1994,21 @@ { int err; - if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, - carp_ifdetach, NULL, EVENTHANDLER_PRI_ANY); - if (if_detach_event_tag == NULL) - return (ENOMEM); mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); - LIST_INIT(&carpif_list); - if_clone_attach(&carp_cloner); - carp_linkstate_p = carp_carpdev_state; + LIST_INIT(&carp_list); + carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output; + carp_linkstate_p = carp_linkstate; + carp_ioctl_p = carp_ioctl; + carp_attach_p = carp_attach; + carp_detach_p = carp_detach; #ifdef INET6 carp_iamatch6_p = carp_iamatch6; carp_macmatch6_p = carp_macmatch6; proto_reg[CARP_INET6] = pf_proto_register(PF_INET6, (struct protosw *)&in6_carp_protosw); - if (proto_reg[CARP_INET6] != 0) { + if (proto_reg[CARP_INET6]) { printf("carp: error %d attaching to PF_INET6\n", proto_reg[CARP_INET6]); carp_mod_cleanup(); @@ -2398,7 +2024,7 @@ #ifdef INET carp_iamatch_p = carp_iamatch; proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw); - if (proto_reg[CARP_INET] != 0) { + if (proto_reg[CARP_INET]) { printf("carp: error %d attaching to PF_INET\n", proto_reg[CARP_INET]); carp_mod_cleanup(); @@ -2411,7 +2037,7 @@ return (err); } #endif - return 0; + return (0); } static int @@ -2422,17 +2048,13 @@ return carp_mod_load(); /* NOTREACHED */ case MOD_UNLOAD: - /* - * XXX: For now, disallow module unloading by default due to - * a race condition where a thread may dereference one of the - * function pointer hooks after the module has been - * unloaded, during processing of a packet, causing a panic. 
- */ -#ifdef CARPMOD_CAN_UNLOAD - carp_mod_cleanup(); -#else - return (EBUSY); -#endif + mtx_lock(&carp_mtx); + if (LIST_EMPTY(&carp_list)) + carp_mod_cleanup(); + else { + mtx_unlock(&carp_mtx); + return (EBUSY); + } break; default: diff -r 1a8929bdc357 sys/netinet/ip_carp.h --- a/sys/netinet/ip_carp.h Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet/ip_carp.h Wed Feb 06 10:50:21 2013 +0800 @@ -126,10 +126,12 @@ * Configuration structure for SIOCSVH SIOCGVH */ struct carpreq { + int carpr_count; + int carpr_vhid; +#define CARP_MAXVHID 255 int carpr_state; #define CARP_STATES "INIT", "BACKUP", "MASTER" #define CARP_MAXSTATE 2 - int carpr_vhid; int carpr_advskew; int carpr_advbase; unsigned char carpr_key[CARP_KEY_LEN]; @@ -144,8 +146,7 @@ #define CARPCTL_PREEMPT 2 /* high-pri backup preemption mode */ #define CARPCTL_LOG 3 /* log bad packets */ #define CARPCTL_STATS 4 /* statistics (read-only) */ -#define CARPCTL_ARPBALANCE 5 /* balance arp responses */ -#define CARPCTL_MAXID 6 +#define CARPCTL_MAXID 5 #define CARPCTL_NAMES { \ { 0, 0 }, \ @@ -153,33 +154,37 @@ { "preempt", CTLTYPE_INT }, \ { "log", CTLTYPE_INT }, \ { "stats", CTLTYPE_STRUCT }, \ - { "arpbalance", CTLTYPE_INT }, \ } #ifdef _KERNEL -void carp_carpdev_state(struct ifnet *); -void carp_input (struct mbuf *, int); -int carp6_input (struct mbuf **, int *, int); -int carp_output (struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *); -int carp_iamatch (struct ifnet *, struct in_ifaddr *, struct in_addr *, - u_int8_t **); +int carp_ioctl(struct ifreq *, u_long, struct thread *); +int carp_attach(struct ifaddr *, int); +void carp_detach(struct ifaddr *); +void carp_carpdev_state(struct ifnet *); +void carp_input (struct mbuf *, int); +int carp6_input (struct mbuf **, int *, int); +int carp_output (struct ifnet *, struct mbuf *, struct sockaddr *); +int carp_iamatch(struct ifaddr *, uint8_t **); struct ifaddr *carp_iamatch6(struct ifnet *, struct in6_addr *); caddr_t carp_macmatch6(struct 
ifnet *, struct mbuf *, const struct in6_addr *); -struct ifnet *carp_forus (struct ifnet *, u_char *); +int carp_forus(struct ifnet *, u_char *); /* These are external networking stack hooks for CARP */ /* net/if.c */ +extern int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); +extern int (*carp_attach_p)(struct ifaddr *, int); +extern void (*carp_detach_p)(struct ifaddr *); extern void (*carp_linkstate_p)(struct ifnet *); /* net/if_bridge.c net/if_ethersubr.c */ -extern struct ifnet *(*carp_forus_p)(struct ifnet *, u_char *); +extern int (*carp_forus_p)(struct ifnet *, u_char *); /* net/if_ethersubr.c */ extern int (*carp_output_p)(struct ifnet *, struct mbuf *, - struct sockaddr *, struct rtentry *); + struct sockaddr *); +/* net/rtsock.c */ +extern int (*carp_get_vhid_p)(struct ifaddr *); #ifdef INET /* netinet/if_ether.c */ -extern int (*carp_iamatch_p)(struct ifnet *, struct in_ifaddr *, - struct in_addr *, u_int8_t **); +extern int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 /* netinet6/nd6_nbr.c */ diff -r 1a8929bdc357 sys/netinet6/in6.c --- a/sys/netinet6/in6.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet6/in6.c Wed Feb 06 10:50:21 2013 +0800 @@ -95,6 +95,7 @@ #include #include #include +#include #include #include @@ -270,6 +271,7 @@ struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; + int carp_attached = 0; int error; switch (cmd) { @@ -655,6 +657,18 @@ break; } + if (ifra->ifra_vhid > 0) { + if (carp_attach_p != NULL) + error = (*carp_attach_p)(&ia->ia_ifa, + ifra->ifra_vhid); + else + error = EPROTONOSUPPORT; + if (error) + goto out; + else + carp_attached = 1; + } + /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but @@ -698,9 +712,14 @@ * nd6_prelist_add will install the corresponding * interface route. 
*/ - if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) + if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { + if (carp_attached) + (*carp_detach_p)(&ia->ia_ifa); goto out; + } if (pr == NULL) { + if (carp_attached) + (*carp_detach_p)(&ia->ia_ifa); log(LOG_ERR, "nd6_prelist_add succeeded but " "no prefix\n"); error = EINVAL; @@ -1307,6 +1326,9 @@ struct sockaddr_in6 sin6; int error; + if (ifa0->ifa_carp) + (*carp_detach_p)(ifa0); + /* * Leave from multicast groups we have joined for the interface. */ diff -r 1a8929bdc357 sys/netinet6/in6_ifattach.c --- a/sys/netinet6/in6_ifattach.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet6/in6_ifattach.c Wed Feb 06 10:50:21 2013 +0800 @@ -701,7 +701,6 @@ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: - case IFT_CARP: return; } diff -r 1a8929bdc357 sys/netinet6/in6_var.h --- a/sys/netinet6/in6_var.h Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet6/in6_var.h Wed Feb 06 10:50:21 2013 +0800 @@ -287,6 +287,7 @@ struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; + int ifra_vhid; }; /* prefix type macro */ diff -r 1a8929bdc357 sys/netinet6/nd6.c --- a/sys/netinet6/nd6.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet6/nd6.c Wed Feb 06 10:50:21 2013 +0800 @@ -2158,9 +2158,6 @@ #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif -#ifdef IFT_CARP - case IFT_CARP: -#endif case IFT_INFINIBAND: case IFT_GIF: /* XXX need more cases? */ case IFT_PPP: diff -r 1a8929bdc357 sys/netinet6/nd6_nbr.c --- a/sys/netinet6/nd6_nbr.c Thu Jan 24 06:03:22 2013 +0800 +++ b/sys/netinet6/nd6_nbr.c Wed Feb 06 10:50:21 2013 +0800 @@ -227,7 +227,7 @@ /* (1) and (3) check. */ if (ifp->if_carp) ifa = (*carp_iamatch6_p)(ifp, &taddr6); - if (ifa == NULL) + else ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* (2) check. 
*/ @@ -696,7 +696,14 @@ lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } - ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); + /* + * This effectively disables the DAD check on a non-master CARP + * address. + */ + if (ifp->if_carp) + ifa = (*carp_iamatch6_p)(ifp, &taddr6); + else + ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* * Target address matches one of my interface address. @@ -1156,9 +1163,6 @@ #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif -#ifdef IFT_CARP - case IFT_CARP: -#endif case IFT_INFINIBAND: case IFT_BRIDGE: case IFT_ISO88025: