Rework and update the source address selection (SAS) algorithm to better conform to RFC 6724. Add handle_pktinfo() and handle_nexthop() functions to handle Advanced Socket API options. Remove the selectroute(), in6_selectif(), in6_selectroute(), and in6_selectroute_fib() functions. --- sys/netinet6/in6_src.c (svn+ssh://svn.freebsd.org/base/head) (revision 261548) +++ sys/netinet6/in6_src.c (working copy) @@ -128,14 +128,14 @@ static VNET_DEFINE(struct in6_addrpolicy, defaulta VNET_DEFINE(int, ip6_prefer_tempaddr) = 0; -static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *, - struct ip6_moptions *, struct route_in6 *, struct ifnet **, - struct rtentry **, int, u_int); -static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *, - struct ip6_moptions *, struct route_in6 *ro, struct ifnet **, - struct ifnet *, u_int); +static int cached_rtlookup(const struct sockaddr_in6 *dst, + struct route_in6 *ro, u_int fibnum); +static int handle_nexthop(struct ip6po_nhinfo *nh, u_int fibnum, + struct ifnet **ifpp); +static int handle_pktinfo(const struct in6_pktinfo* pi, struct ifnet **ifpp, + struct in6_addr *srcp); -static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *); +static int lookup_policy_label(const struct in6_addr *, uint32_t); static void init_policy_queue(void); static int add_addrsel_policyent(struct in6_addrpolicy *); @@ -144,6 +144,7 @@ static int walk_addrsel_policy(int (*)(struct in6_ void *); static int dump_addrsel_policyent(struct in6_addrpolicy *, void *); static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); +static int in6_srcaddrscope(const struct in6_addr *); /* * Return an IPv6 address, which is the most appropriate for a given @@ -151,730 +152,544 @@ static struct in6_addrpolicy *match_addrsel_policy * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. 
*/ -#define REPLACE(r) do {\ - IP6STAT_INC(ip6s_sources_rule[(r)]); \ - rule = (r); \ - /* { \ - char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ - printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ - } */ \ - goto replace; \ -} while(0) -#define NEXT(r) do {\ - /* { \ - char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ - printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ - } */ \ - goto next; /* XXX: we can't use 'continue' here */ \ -} while(0) -#define BREAK(r) do { \ - IP6STAT_INC(ip6s_sources_rule[(r)]); \ - rule = (r); \ - goto out; /* XXX: we can't use 'break' here */ \ -} while(0) +struct srcaddr_choice { + struct in6_ifaddr *ia; + int scope; + int label; + int prefixlen; + int rule; +}; +struct dstaddr_props { + struct ifnet *ifp; + struct in6_addr *addr; + int scope; + int label; + int prefixlen; +}; -int -in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, - struct inpcb *inp, struct route_in6 *ro, struct ucred *cred, - struct ifnet **ifpp, struct in6_addr *srcp) +#define REPLACE(r) { rule = r; goto replace; } +#define NEXT(r) { rule = r; goto next; } + +static int +srcaddrcmp(struct srcaddr_choice *c, struct in6_ifaddr *ia, + struct dstaddr_props *dst, struct ucred *cred, + struct ip6_pktopts *opts) { - struct in6_addr dst, tmp; - struct ifnet *ifp = NULL, *oifp = NULL; - struct in6_ifaddr *ia = NULL, *ia_best = NULL; - struct in6_pktinfo *pi = NULL; - int dst_scope = -1, best_scope = -1, best_matchlen = -1; - struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; - u_int32_t odstzone; - int prefer_tempaddr; - int error, rule; - struct ip6_moptions *mopts; + int srcscope, rule, label, prefer_tempaddr, prefixlen; - KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__)); - - dst 
= dstsock->sin6_addr; /* make a copy for local operation */ - if (ifpp) { - /* - * Save a possibly passed in ifp for in6_selectsrc. Only - * neighbor discovery code should use this feature, where - * we may know the interface but not the FIB number holding - * the connected subnet in case someone deleted it from the - * default FIB and we need to check the interface. - */ - if (*ifpp != NULL) - oifp = *ifpp; - *ifpp = NULL; - } - - if (inp != NULL) { - INP_LOCK_ASSERT(inp); - mopts = inp->in6p_moptions; - } else { - mopts = NULL; - } - + /* Avoid unusable addresses */ + if ((ia->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_DETACHED)) || + (ia->ia_ifp->if_flags & IFF_UP) == 0) + return (-1); /* - * If the source address is explicitly specified by the caller, - * check if the requested source address is indeed a unicast address - * assigned to the node, and can be used as the packet's source - * address. If everything is okay, use the address as source. + * In any case, multicast addresses and the unspecified address + * MUST NOT be included in a candidate set. */ - if (opts && (pi = opts->ip6po_pktinfo) && - !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) { - struct sockaddr_in6 srcsock; - struct in6_ifaddr *ia6; - - /* get the outgoing interface */ - if ((error = in6_selectif(dstsock, opts, mopts, ro, &ifp, oifp, - (inp != NULL) ? inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB)) - != 0) - return (error); - - /* - * determine the appropriate zone id of the source based on - * the zone of the destination and the outgoing interface. - * If the specified address is ambiguous wrt the scope zone, - * the interface must be specified; otherwise, ifa_ifwithaddr() - * will fail matching the address. 
- */ - bzero(&srcsock, sizeof(srcsock)); - srcsock.sin6_family = AF_INET6; - srcsock.sin6_len = sizeof(srcsock); - srcsock.sin6_addr = pi->ipi6_addr; - if (ifp) { - error = in6_setscope(&srcsock.sin6_addr, ifp, NULL); - if (error) - return (error); - } - if (cred != NULL && (error = prison_local_ip6(cred, - &srcsock.sin6_addr, (inp != NULL && - (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) - return (error); - - ia6 = (struct in6_ifaddr *)ifa_ifwithaddr( - (struct sockaddr *)&srcsock); - if (ia6 == NULL || - (ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY))) { - if (ia6 != NULL) - ifa_free(&ia6->ia_ifa); - return (EADDRNOTAVAIL); - } - pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */ - if (ifpp) - *ifpp = ifp; - bcopy(&ia6->ia_addr.sin6_addr, srcp, sizeof(*srcp)); - ifa_free(&ia6->ia_ifa); - return (0); + if (IN6_IS_ADDR_MULTICAST(IA6_IN6(ia)) || + IN6_IS_ADDR_UNSPECIFIED(IA6_IN6(ia))) + return (-1); + if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) + return (-1); + /* If jailed, only take addresses of the jail into account. */ + if (cred != NULL && prison_check_ip6(cred, IA6_SIN6(ia)) != 0) + return (-1); + /* Source address can not break the destination zone */ + srcscope = in6_srcaddrscope(IA6_IN6(ia)); + if (ia->ia_ifp != dst->ifp && + in6_getscopezone(ia->ia_ifp, srcscope) != + in6_getscopezone(dst->ifp, dst->scope)) + return (-1); + label = ADDR_LABEL_NOTAPP; + prefixlen = -1; + /* Rule 1: Prefer same address. */ + if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), dst->addr)) + REPLACE(1); + /* Rule 2: Prefer appropriate scope. */ + if (c->ia == NULL) { + dst->label = lookup_policy_label(dst->addr, + in6_getscopezone(dst->ifp, dst->scope)); + REPLACE(0); } - - /* - * Otherwise, if the socket has already bound the source, just use it. 
- */ - if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { - if (cred != NULL && - (error = prison_local_ip6(cred, &inp->in6p_laddr, - ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) - return (error); - bcopy(&inp->in6p_laddr, srcp, sizeof(*srcp)); - return (0); + if (IN6_ARE_SCOPE_CMP(c->scope, srcscope) < 0) { + if (IN6_ARE_SCOPE_CMP(c->scope, dst->scope) < 0) + REPLACE(2); + NEXT(2); + } else if (IN6_ARE_SCOPE_CMP(srcscope, c->scope) < 0) { + if (IN6_ARE_SCOPE_CMP(srcscope, dst->scope) < 0) + NEXT(2); + REPLACE(2); } - + /* Rule 3: Avoid deprecated addresses. */ + if (!IFA6_IS_DEPRECATED(c->ia) && IFA6_IS_DEPRECATED(ia)) + NEXT(3); + if (IFA6_IS_DEPRECATED(c->ia) && !IFA6_IS_DEPRECATED(ia)) + REPLACE(3); /* - * Bypass source address selection and use the primary jail IP - * if requested. + * Rule 4: Prefer home addresses. + * XXX: This is a TODO. */ - if (cred != NULL && !prison_saddrsel_ip6(cred, srcp)) - return (0); - + /* Rule 5: Prefer outgoing interface. */ + if (c->ia->ia_ifp == dst->ifp && ia->ia_ifp != dst->ifp) + NEXT(5); + if (c->ia->ia_ifp != dst->ifp && ia->ia_ifp == dst->ifp) + REPLACE(5); /* - * If the address is not specified, choose the best one based on - * the outgoing interface and the destination address. + * Rule 5.5: Prefer addresses in a prefix advertised by + * the next-hop. + * XXX: not yet. */ - /* get the outgoing interface */ - if ((error = in6_selectif(dstsock, opts, mopts, ro, &ifp, oifp, - (inp != NULL) ? 
inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB)) != 0) - return (error); - -#ifdef DIAGNOSTIC - if (ifp == NULL) /* this should not happen */ - panic("in6_selectsrc: NULL ifp"); -#endif - error = in6_setscope(&dst, ifp, &odstzone); - if (error) - return (error); - - rule = 0; - IN6_IFADDR_RLOCK(); - TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { - int new_scope = -1, new_matchlen = -1; - struct in6_addrpolicy *new_policy = NULL; - u_int32_t srczone, osrczone, dstzone; - struct in6_addr src; - struct ifnet *ifp1 = ia->ia_ifp; - - /* - * We'll never take an address that breaks the scope zone - * of the destination. We also skip an address if its zone - * does not contain the outgoing interface. - * XXX: we should probably use sin6_scope_id here. - */ - if (in6_setscope(&dst, ifp1, &dstzone) || - odstzone != dstzone) { - continue; - } - src = ia->ia_addr.sin6_addr; - if (in6_setscope(&src, ifp, &osrczone) || - in6_setscope(&src, ifp1, &srczone) || - osrczone != srczone) { - continue; - } - - /* avoid unusable addresses */ - if ((ia->ia6_flags & - (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) { - continue; - } - if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) - continue; - - /* If jailed only take addresses of the jail into account. 
*/ - if (cred != NULL && - prison_check_ip6(cred, &ia->ia_addr.sin6_addr) != 0) - continue; - - /* Rule 1: Prefer same address */ - if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) { - ia_best = ia; - BREAK(1); /* there should be no better candidate */ - } - - if (ia_best == NULL) - REPLACE(0); - - /* Rule 2: Prefer appropriate scope */ - if (dst_scope < 0) - dst_scope = in6_addrscope(&dst); - new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); - if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { - if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) - REPLACE(2); - NEXT(2); - } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { - if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) - NEXT(2); - REPLACE(2); - } - - /* - * Rule 3: Avoid deprecated addresses. Note that the case of - * !ip6_use_deprecated is already rejected above. - */ - if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) - NEXT(3); - if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) - REPLACE(3); - - /* Rule 4: Prefer home addresses */ - /* - * XXX: This is a TODO. We should probably merge the MIP6 - * case above. - */ - - /* Rule 5: Prefer outgoing interface */ - if (!(ND_IFINFO(ifp)->flags & ND6_IFF_NO_PREFER_IFACE)) { - if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp) - NEXT(5); - if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp) - REPLACE(5); - } - - /* - * Rule 6: Prefer matching label - * Note that best_policy should be non-NULL here. - */ - if (dst_policy == NULL) - dst_policy = lookup_addrsel_policy(dstsock); - if (dst_policy->label != ADDR_LABEL_NOTAPP) { - new_policy = lookup_addrsel_policy(&ia->ia_addr); - if (dst_policy->label == best_policy->label && - dst_policy->label != new_policy->label) - NEXT(6); - if (dst_policy->label != best_policy->label && - dst_policy->label == new_policy->label) - REPLACE(6); - } - - /* - * Rule 7: Prefer public addresses. 
- * We allow users to reverse the logic by configuring - * a sysctl variable, so that privacy conscious users can - * always prefer temporary addresses. - */ - if (opts == NULL || - opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { - prefer_tempaddr = V_ip6_prefer_tempaddr; - } else if (opts->ip6po_prefer_tempaddr == - IP6PO_TEMPADDR_NOTPREFER) { - prefer_tempaddr = 0; - } else - prefer_tempaddr = 1; - if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) && - (ia->ia6_flags & IN6_IFF_TEMPORARY)) { - if (prefer_tempaddr) - REPLACE(7); - else - NEXT(7); - } - if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) && - !(ia->ia6_flags & IN6_IFF_TEMPORARY)) { - if (prefer_tempaddr) - NEXT(7); - else - REPLACE(7); - } - - /* - * Rule 8: prefer addresses on alive interfaces. - * This is a KAME specific rule. - */ - if ((ia_best->ia_ifp->if_flags & IFF_UP) && - !(ia->ia_ifp->if_flags & IFF_UP)) - NEXT(8); - if (!(ia_best->ia_ifp->if_flags & IFF_UP) && - (ia->ia_ifp->if_flags & IFF_UP)) - REPLACE(8); - - /* - * Rule 14: Use longest matching prefix. - * Note: in the address selection draft, this rule is - * documented as "Rule 8". However, since it is also - * documented that this rule can be overridden, we assign - * a large number so that it is easy to assign smaller numbers - * to more preferred rules. - */ - new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst); - if (best_matchlen < new_matchlen) - REPLACE(14); - if (new_matchlen < best_matchlen) - NEXT(14); - - /* Rule 15 is reserved. */ - - /* - * Last resort: just keep the current candidate. - * Or, do we need more rules? - */ - continue; - - replace: - ia_best = ia; - best_scope = (new_scope >= 0 ? new_scope : - in6_addrscope(&ia_best->ia_addr.sin6_addr)); - best_policy = (new_policy ? new_policy : - lookup_addrsel_policy(&ia_best->ia_addr)); - best_matchlen = (new_matchlen >= 0 ? 
new_matchlen : - in6_matchlen(&ia_best->ia_addr.sin6_addr, - &dst)); - - next: - continue; - - out: - break; + /* Rule 6: Prefer matching label. */ + if (dst->label != ADDR_LABEL_NOTAPP) { + c->label = lookup_policy_label(IA6_IN6(c->ia), + in6_getscopezone(c->ia->ia_ifp, c->scope)); + label = lookup_policy_label(IA6_IN6(ia), + in6_getscopezone(ia->ia_ifp, srcscope)); + if (c->label == dst->label && label != dst->label) + NEXT(6); + if (label == dst->label && c->label != dst->label) + REPLACE(6); } - - if ((ia = ia_best) == NULL) { - IN6_IFADDR_RUNLOCK(); - IP6STAT_INC(ip6s_sources_none); - return (EADDRNOTAVAIL); + /* Rule 7: Prefer temporary addresses. */ + if (opts == NULL || + opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) + prefer_tempaddr = V_ip6_prefer_tempaddr; + else if (opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_NOTPREFER) + prefer_tempaddr = 0; + else + prefer_tempaddr = 1; + if ((c->ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 && + (ia->ia6_flags & IN6_IFF_TEMPORARY) == 0) { + if (prefer_tempaddr) + NEXT(7); + REPLACE(7); } + if ((c->ia->ia6_flags & IN6_IFF_TEMPORARY) == 0 && + (ia->ia6_flags & IN6_IFF_TEMPORARY) != 0) { + if (prefer_tempaddr) + REPLACE(7); + NEXT(7); + } + /* Rule 8: Use longest matching prefix. 
*/ + if (c->prefixlen < 0) + c->prefixlen = in6_matchlen(IA6_IN6(c->ia), dst->addr); + prefixlen = in6_matchlen(IA6_IN6(ia), dst->addr); + if (c->prefixlen > prefixlen) + NEXT(8); + if (prefixlen > c->prefixlen) + REPLACE(8); + return (-1); +replace: + /* debug output */ + c->ia = ia; + c->label = label; + c->scope = srcscope; + c->rule = rule; + c->prefixlen = prefixlen; + /* Update statistic */ + IP6STAT_INC(ip6s_sources_rule[rule]); + return (rule); +next: + /* debug output */ + return (rule); +} +#undef REPLACE +#undef NEXT +static int +cached_rtlookup(const struct sockaddr_in6 *dst, struct route_in6 *ro, + u_int fibnum) +{ + /* - * At this point at least one of the addresses belonged to the jail - * but it could still be, that we want to further restrict it, e.g. - * theoratically IN6_IS_ADDR_LOOPBACK. - * It must not be IN6_IS_ADDR_UNSPECIFIED anymore. - * prison_local_ip6() will fix an IN6_IS_ADDR_LOOPBACK but should - * let all others previously selected pass. - * Use tmp to not change ::1 on lo0 to the primary jail address. + * Use a cached route if it exists and is valid, else try to allocate + * a new one. Note that we should check the address family of the + * cached destination, in case of sharing the cache with IPv4. 
*/ - tmp = ia->ia_addr.sin6_addr; - if (cred != NULL && prison_local_ip6(cred, &tmp, (inp != NULL && - (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) { - IN6_IFADDR_RUNLOCK(); - IP6STAT_INC(ip6s_sources_none); - return (EADDRNOTAVAIL); + KASSERT(ro != NULL, ("%s: ro is NULL", __func__)); + if (ro->ro_rt != NULL && ( + (ro->ro_rt->rt_flags & RTF_UP) == 0 || + ro->ro_dst.sin6_family != AF_INET6 || + !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &dst->sin6_addr))) { + RO_RTFREE(ro); } - - if (ifpp) - *ifpp = ifp; - - bcopy(&tmp, srcp, sizeof(*srcp)); - if (ia->ia_ifp == ifp) - IP6STAT_INC(ip6s_sources_sameif[best_scope]); - else - IP6STAT_INC(ip6s_sources_otherif[best_scope]); - if (dst_scope == best_scope) - IP6STAT_INC(ip6s_sources_samescope[best_scope]); - else - IP6STAT_INC(ip6s_sources_otherscope[best_scope]); - if (IFA6_IS_DEPRECATED(ia)) - IP6STAT_INC(ip6s_sources_deprecated[best_scope]); - IN6_IFADDR_RUNLOCK(); + if (ro->ro_rt == NULL) { + /* No route yet, so try to acquire one */ + memcpy(&ro->ro_dst, dst, sizeof(*dst)); + in6_rtalloc(ro, fibnum); + } + if (ro->ro_rt == NULL) + return (EHOSTUNREACH); return (0); } /* - * clone - meaningful only for bsdi and freebsd + * pi - options configured via IPV6_PKTINFO; + * + * These parameters are returned back to caller: + * ifpp - determined outgoing interface; + * srcp - determined source address; */ static int -selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, - struct ip6_moptions *mopts, struct route_in6 *ro, - struct ifnet **retifp, struct rtentry **retrt, int norouteok, u_int fibnum) +handle_pktinfo(const struct in6_pktinfo* pi, struct ifnet **ifpp, + struct in6_addr *srcp) { - int error = 0; - struct ifnet *ifp = NULL; - struct rtentry *rt = NULL; - struct sockaddr_in6 *sin6_next; - struct in6_pktinfo *pi = NULL; - struct in6_addr *dst = &dstsock->sin6_addr; -#if 0 - char ip6buf[INET6_ADDRSTRLEN]; + struct ifnet *ifp; - if (dstsock->sin6_addr.s6_addr32[0] == 0 && - 
dstsock->sin6_addr.s6_addr32[1] == 0 && - !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) { - printf("in6_selectroute: strange destination %s\n", - ip6_sprintf(ip6buf, &dstsock->sin6_addr)); - } else { - printf("in6_selectroute: destination = %s%%%d\n", - ip6_sprintf(ip6buf, &dstsock->sin6_addr), - dstsock->sin6_scope_id); /* for debug */ - } -#endif - - /* If the caller specify the outgoing interface explicitly, use it. */ - if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) { - /* XXX boundary check is assumed to be already done. */ + ifp = NULL; + if (pi->ipi6_ifindex != 0) { ifp = ifnet_byindex(pi->ipi6_ifindex); - if (ifp != NULL && - (norouteok || retrt == NULL || - IN6_IS_ADDR_MULTICAST(dst))) { - /* - * we do not have to check or get the route for - * multicast. - */ - goto done; - } else - goto getroute; + if (ifp == NULL) + return (ENXIO); + if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) + return (ENETDOWN); } + if (ifp != NULL) + *ifpp = ifp; + if (IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) + return (0); + *srcp = pi->ipi6_addr; + return (0); +} - /* - * If the destination address is a multicast address and the outgoing - * interface for the address is specified by the caller, use it. - */ - if (IN6_IS_ADDR_MULTICAST(dst) && - mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) { - goto done; /* we do not need a route for multicast. */ - } +/* + * nh - next hop destination and route; + * fibnum - FIB number. + * ifpp - pointer to outgoing interface. + * + * NOTE: we can keep this route, it will be freed in the socket + * option handling code (see ip6_output.c). 
+ */ +static int +handle_nexthop(struct ip6po_nhinfo *nh, u_int fibnum, struct ifnet **ifpp) +{ + struct sockaddr_in6 *sa; + struct route_in6 *ro; + struct ifnet *ifp, *oifp; - getroute: + sa = (struct sockaddr_in6 *)nh->ip6po_nhi_nexthop; + ro = &nh->ip6po_nhi_route; + if (sa->sin6_family != AF_INET6) + return (EAFNOSUPPORT); /* - * If the next hop address for the packet is specified by the caller, - * use it as the gateway. + * If *ifpp is not NULL, this means that outgoing interface + * was determined in the PKTINFO handling code. */ - if (opts && opts->ip6po_nexthop) { - struct route_in6 *ron; - struct llentry *la; - - sin6_next = satosin6(opts->ip6po_nexthop); - - /* at this moment, we only support AF_INET6 next hops */ - if (sin6_next->sin6_family != AF_INET6) { - error = EAFNOSUPPORT; /* or should we proceed? */ - goto done; - } - + oifp = *ifpp; + if (IN6_IS_ADDR_LINKLOCAL(&sa->sin6_addr)) /* - * If the next hop is an IPv6 address, then the node identified - * by that address must be a neighbor of the sending host. + * Next hop is LLA, thus it should be neighbor. + * Determine outgoing interface by zone index. */ - ron = &opts->ip6po_nextroute; + ifp = in6_getlinkifnet(sa->sin6_scope_id); + else { + if (cached_rtlookup(sa, ro, fibnum) != 0) + return (EHOSTUNREACH); /* - * XXX what do we do here? - * PLZ to be fixing + * The node identified by that address must be a + * neighbor of the sending host. */ + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + return (EHOSTUNREACH); + ifp = ro->ro_rt->rt_ifp; + } + /* + * When the outgoing interface is specified by IPV6_PKTINFO + * as well, the next hop specified by this option must be + * reachable via the specified interface. + */ + if (ifp == NULL || (oifp != NULL && oifp != ifp)) + return (EHOSTUNREACH); + *ifpp = ifp; + return (0); +} - if (ron->ro_rt == NULL) { - in6_rtalloc(ron, fibnum); /* multi path case? */ - if (ron->ro_rt == NULL) { - /* XXX-BZ WT.? 
*/ - if (ron->ro_rt) { - RTFREE(ron->ro_rt); - ron->ro_rt = NULL; - } - error = EHOSTUNREACH; - goto done; - } - } +static int +check_addrs(const struct sockaddr_in6 *src, const struct sockaddr_in6 *dst, + struct ifnet *ifp) +{ + struct in6_ifaddr *ia; - rt = ron->ro_rt; - ifp = rt->rt_ifp; - IF_AFDATA_RLOCK(ifp); - la = lla_lookup(LLTABLE6(ifp), 0, (struct sockaddr *)sin6_next); - IF_AFDATA_RUNLOCK(ifp); - if (la != NULL) - LLE_RUNLOCK(la); - else { - error = EHOSTUNREACH; - goto done; - } -#if 0 - if ((ron->ro_rt && - (ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) != - (RTF_UP | RTF_LLINFO)) || - !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr, - &sin6_next->sin6_addr)) { - if (ron->ro_rt) { - RTFREE(ron->ro_rt); - ron->ro_rt = NULL; - } - *satosin6(&ron->ro_dst) = *sin6_next; - } - if (ron->ro_rt == NULL) { - in6_rtalloc(ron, fibnum); /* multi path case? */ - if (ron->ro_rt == NULL || - !(ron->ro_rt->rt_flags & RTF_LLINFO)) { - if (ron->ro_rt) { - RTFREE(ron->ro_rt); - ron->ro_rt = NULL; - } - error = EHOSTUNREACH; - goto done; - } - } -#endif - - /* - * When cloning is required, try to allocate a route to the - * destination so that the caller can store path MTU - * information. - */ - goto done; + /* + * Check that source address is available on the interface. + */ + ia = in6ifa_ifpwithaddr(ifp, &src->sin6_addr); + if (ia == NULL || ( + ia->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY))) { + if (ia != NULL) + ifa_free(&ia->ia_ifa); + return (EADDRNOTAVAIL); } - + ifa_free(&ia->ia_ifa); /* - * Use a cached route if it exists and is valid, else try to allocate - * a new one. Note that we should check the address family of the - * cached destination, in case of sharing the cache with IPv4. + * Check that source address does not break the destination + * zone. 
*/ - if (ro) { - if (ro->ro_rt && - (!(ro->ro_rt->rt_flags & RTF_UP) || - ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 || - !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, - dst))) { - RTFREE(ro->ro_rt); - ro->ro_rt = (struct rtentry *)NULL; - } - if (ro->ro_rt == (struct rtentry *)NULL) { - struct sockaddr_in6 *sa6; + if (dst->sin6_scope_id != 0 && + dst->sin6_scope_id != in6_getscopezone(ifp, + in6_srcaddrscope(&dst->sin6_addr))) + return (EHOSTUNREACH); + return (0); +} - /* No route yet, so try to acquire one */ - bzero(&ro->ro_dst, sizeof(struct sockaddr_in6)); - sa6 = (struct sockaddr_in6 *)&ro->ro_dst; - *sa6 = *dstsock; - sa6->sin6_scope_id = 0; +int +in6_selectsrc(struct sockaddr_in6 *dst, struct ip6_pktopts *opts, + struct inpcb *inp, struct route_in6 *ro, struct ucred *cred, + struct ifnet **ifpp, struct in6_addr *srcp) +{ + struct route_in6 ro6; + struct dstaddr_props dstprops; + struct srcaddr_choice best; + struct sockaddr_in6 srcsock; + struct ip6_moptions *mopts; + struct in6_ifaddr *ia; + struct ifaddr *ifa; + struct ifnet *ifp, *oifp; + u_int fibnum; + int error; -#ifdef RADIX_MPATH - rtalloc_mpath_fib((struct route *)ro, - ntohl(sa6->sin6_addr.s6_addr32[3]), fibnum); -#else - ro->ro_rt = in6_rtalloc1((struct sockaddr *) - &ro->ro_dst, 0, 0UL, fibnum); - if (ro->ro_rt) - RT_UNLOCK(ro->ro_rt); -#endif + KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__)); + KASSERT(ifpp != NULL, ("%s: ifpp is NULL", __func__)); + KASSERT(sa6_checkzone(dst) == 0, ("%s: invalid zone information", + __func__)); + + ifp = NULL; + /* + * XXX: Save a possibly passed in ifp for in6_selectsrc. Only + * neighbor discovery code should use this feature, where + * we may know the interface but not the FIB number holding + * the connected subnet in case someone deleted it from the + * default FIB and we need to check the interface. 
+ */ + oifp = *ifpp; + if (inp != NULL) { + INP_LOCK_ASSERT(inp); + mopts = inp->in6p_moptions; + fibnum = inp->inp_inc.inc_fibnum; + /* Use "sticky" options if opts isn't specified. */ + if (opts == NULL) + opts = inp->in6p_outputopts; + } else { + mopts = NULL; + fibnum = RT_DEFAULT_FIB; + } + if (ro == NULL) { + ro = &ro6; + bzero(ro, sizeof(*ro)); + } + srcsock = sa6_any; + if (opts != NULL && opts->ip6po_pktinfo != NULL) { + error = handle_pktinfo(opts->ip6po_pktinfo, &ifp, + &srcsock.sin6_addr); + if (error != 0) + return (error); + if (ifp != NULL) { + /* + * When the outgoing interface is specified by + * IPV6_PKTINFO as well, the next hop specified by + * this option must be reachable via the specified + * interface. + * We ignore next hop for multicast destinations. + */ + if (!IN6_IS_ADDR_MULTICAST(&dst->sin6_addr) && + opts->ip6po_nexthop != NULL) { + error = handle_nexthop(&opts->ip6po_nhinfo, + fibnum, &ifp); + if (error != 0) + return (error); + } } - + } + if (ifp != NULL) + goto oif_found; + if (IN6_IS_ADDR_MULTICAST(&dst->sin6_addr)) { /* - * do not care about the result if we have the nexthop - * explicitly specified. + * If the destination address is a multicast address and + * the IPV6_MULTICAST_IF socket option is specified for the + * socket, the interface is used. */ - if (opts && opts->ip6po_nexthop) - goto done; - - if (ro->ro_rt) { + if (mopts && mopts->im6o_multicast_ifp) { + ifp = mopts->im6o_multicast_ifp; + } else if (IN6_IS_ADDR_MC_LINKLOCAL(&dst->sin6_addr) || + IN6_IS_ADDR_MC_INTFACELOCAL(&dst->sin6_addr)) { + /* + * Destination multicast address is in the link-local + * or interface-local scope. Use its sin6_scope_id to + * determine outgoing interface. + */ + if (dst->sin6_scope_id != 0) + ifp = in6_getlinkifnet(dst->sin6_scope_id); + } else { + /* + * Try to lookup route for this multicast + * destination address. 
+ */ + if (cached_rtlookup(dst, ro, fibnum) != 0) + return (EHOSTUNREACH); ifp = ro->ro_rt->rt_ifp; - - if (ifp == NULL) { /* can this really happen? */ - RTFREE(ro->ro_rt); - ro->ro_rt = NULL; - } } - if (ro->ro_rt == NULL) - error = EHOSTUNREACH; - rt = ro->ro_rt; - + } else if (opts != NULL && opts->ip6po_nexthop != NULL) { + error = handle_nexthop(&opts->ip6po_nhinfo, fibnum, &ifp); + if (error != 0) + return (error); + } else { /* - * Check if the outgoing interface conflicts with - * the interface specified by ipi6_ifindex (if specified). - * Note that loopback interface is always okay. - * (this may happen when we are sending a packet to one of - * our own addresses.) + * We don't have any options and destination isn't multicast. + * Use sin6_scope_id for link-local addresses. + * Do a route lookup for global addresses. */ - if (ifp && opts && opts->ip6po_pktinfo && - opts->ip6po_pktinfo->ipi6_ifindex) { - if (!(ifp->if_flags & IFF_LOOPBACK) && - ifp->if_index != - opts->ip6po_pktinfo->ipi6_ifindex) { - error = EHOSTUNREACH; - goto done; - } + if (IN6_IS_ADDR_LINKLOCAL(&dst->sin6_addr)) { + if (dst->sin6_scope_id != 0) + ifp = in6_getlinkifnet(dst->sin6_scope_id); + } else { + if (cached_rtlookup(dst, ro, fibnum) != 0) + return (EHOSTUNREACH); + ifp = ro->ro_rt->rt_ifp; } } - - done: - if (ifp == NULL && rt == NULL) { - /* - * This can happen if the caller did not pass a cached route - * nor any other hints. We treat this case an error. - */ - error = EHOSTUNREACH; + if (ifp == NULL) { + if (oifp == NULL) + return (EHOSTUNREACH); + /* Use outgoing interface specified by caller. */ + ifp = oifp; } - if (error == EHOSTUNREACH) - IP6STAT_INC(ip6s_noroute); - - if (retifp != NULL) { - *retifp = ifp; - - /* - * Adjust the "outgoing" interface. If we're going to loop - * the packet back to ourselves, the ifp would be the loopback - * interface. 
However, we'd rather know the interface associated - * to the destination address (which should probably be one of - * our own addresses.) - */ - if (rt) { - if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) && - (rt->rt_gateway->sa_family == AF_LINK)) - *retifp = - ifnet_byindex(((struct sockaddr_dl *) - rt->rt_gateway)->sdl_index); +oif_found: + if (!IN6_IS_ADDR_UNSPECIFIED(&srcsock.sin6_addr)) { + if (ro == &ro6) + RO_RTFREE(ro); + if (cred != NULL) { + srcsock.sin6_scope_id = in6_getscopezone(ifp, + in6_srcaddrscope(&srcsock.sin6_addr)); + error = prison_local_ip6(cred, &srcsock, + (inp != NULL && + (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)); + if (error != 0) + return (error); } + error = check_addrs(&srcsock, dst, ifp); + if (error != 0) + return (error); + *ifpp = ifp; + *srcp = srcsock.sin6_addr; + return (0); } - - if (retrt != NULL) - *retrt = rt; /* rt may be NULL */ - - return (error); -} - -static int -in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, - struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, - struct ifnet *oifp, u_int fibnum) -{ - int error; - struct route_in6 sro; - struct rtentry *rt = NULL; - - KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__)); - - if (ro == NULL) { - bzero(&sro, sizeof(sro)); - ro = &sro; + /* + * Otherwise, if the socket has already bound the source, just use it. + */ + if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { + if (ro == &ro6) + RO_RTFREE(ro); + *srcp = inp->in6p_laddr; + *ifpp = ifp; + return (0); } - - if ((error = selectroute(dstsock, opts, mopts, ro, retifp, - &rt, 1, fibnum)) != 0) { - if (ro == &sro && rt && rt == sro.ro_rt) - RTFREE(rt); - /* Help ND. See oifp comment in in6_selectsrc(). */ - if (oifp != NULL && fibnum == RT_DEFAULT_FIB) { - *retifp = oifp; - error = 0; - } - return (error); + /* + * Bypass source address selection and use the primary jail IP + * if requested. 
+ */ + if (cred != NULL && !prison_saddrsel_ip6(cred, &srcsock)) { + if (ro == &ro6) + RO_RTFREE(ro); + error = check_addrs(&srcsock, dst, ifp); + if (error != 0) + return (error); + *ifpp = ifp; + *srcp = srcsock.sin6_addr; + return (0); } /* - * do not use a rejected or black hole route. - * XXX: this check should be done in the L2 output routine. - * However, if we skipped this check here, we'd see the following - * scenario: - * - install a rejected route for a scoped address prefix - * (like fe80::/10) - * - send a packet to a destination that matches the scoped prefix, - * with ambiguity about the scope zone. - * - pick the outgoing interface from the route, and disambiguate the - * scope zone with the interface. - * - ip6_output() would try to get another route with the "new" - * destination, which may be valid. - * - we'd see no error on output. - * Although this may not be very harmful, it should still be confusing. - * We thus reject the case here. + * If the address is not specified, choose the best one based on + * the outgoing interface and the destination address. */ - if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) { - int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); - - if (ro == &sro && rt && rt == sro.ro_rt) - RTFREE(rt); - return (flags); + dstprops.ifp = ifp; + dstprops.addr = &dst->sin6_addr; + dstprops.scope = in6_srcaddrscope(&dst->sin6_addr); + best.rule = -1; + best.ia = NULL; + /* + * RFC 6724 (section 4): + * For all multicast and link-local destination addresses, the set of + * candidate source addresses MUST only include addresses assigned to + * interfaces belonging to the same link as the outgoing interface. 
+ */ + if (IN6_IS_ADDR_MULTICAST(&dst->sin6_addr) || + dstprops.scope == IPV6_ADDR_SCOPE_LINKLOCAL) { + IF_ADDR_RLOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + error = srcaddrcmp(&best, (struct in6_ifaddr*)ifa, + &dstprops, cred, opts); + if (error == 1) + break; + } + if (best.rule >= 0) + ifa_ref(&best.ia->ia_ifa); + IF_ADDR_RUNLOCK(ifp); + } else { + IN6_IFADDR_RLOCK(); + TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { + error = srcaddrcmp(&best, ia, &dstprops, cred, opts); + if (error == 1) + break; + } + if (best.rule >= 0) + ifa_ref(&best.ia->ia_ifa); + IN6_IFADDR_RUNLOCK(); } + if (best.rule < 0) { + IP6STAT_INC(ip6s_sources_none); + if (ro == &ro6) + RO_RTFREE(ro); + return (EADDRNOTAVAIL); + } + *ifpp = ifp; + *srcp = best.ia->ia_addr.sin6_addr; - if (ro == &sro && rt && rt == sro.ro_rt) - RTFREE(rt); + /* Update statistic */ + if (best.ia->ia_ifp == ifp) + IP6STAT_INC(ip6s_sources_sameif[best.scope]); + else + IP6STAT_INC(ip6s_sources_otherif[best.scope]); + if (dstprops.scope == best.scope) + IP6STAT_INC(ip6s_sources_samescope[best.scope]); + else + IP6STAT_INC(ip6s_sources_otherscope[best.scope]); + if (IFA6_IS_DEPRECATED(best.ia)) + IP6STAT_INC(ip6s_sources_deprecated[best.scope]); + ifa_free(&best.ia->ia_ifa); + if (ro == &ro6) + RO_RTFREE(ro); return (0); } /* - * Public wrapper function to selectroute(). - * - * XXX-BZ in6_selectroute() should and will grow the FIB argument. The - * in6_selectroute_fib() function is only there for backward compat on stable. 
- */ -int -in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, - struct ip6_moptions *mopts, struct route_in6 *ro, - struct ifnet **retifp, struct rtentry **retrt) -{ - - return (selectroute(dstsock, opts, mopts, ro, retifp, - retrt, 0, RT_DEFAULT_FIB)); -} - -#ifndef BURN_BRIDGES -int -in6_selectroute_fib(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, - struct ip6_moptions *mopts, struct route_in6 *ro, - struct ifnet **retifp, struct rtentry **retrt, u_int fibnum) -{ - - return (selectroute(dstsock, opts, mopts, ro, retifp, - retrt, 0, fibnum)); -} -#endif - -/* * Default hop limit selection. The precedence is as follows: * 1. Hoplimit value specified via ioctl. * 2. (If the outgoing interface is detected) the current * hop limit of the interface specified by router advertisement. - * 3. The system default hoplimit. + * 3. If destination address is from link-local scope, use its zoneid + * to determine outgoing interface and use its hop limit. + * 4. The system default hoplimit. 
*/ int in6_selecthlim(struct inpcb *in6p, struct ifnet *ifp) { + struct route_in6 ro6; if (in6p && in6p->in6p_hops >= 0) return (in6p->in6p_hops); - else if (ifp) + + if (ifp != NULL) return (ND_IFINFO(ifp)->chlim); - else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { - struct route_in6 ro6; - struct ifnet *lifp; + /* XXX: should we check for multicast here?*/ + if (in6p && IN6_IS_ADDR_LINKLOCAL(&in6p->in6p_faddr)) { + if (in6p->in6p_zoneid != 0 && + (ifp = in6_getlinkifnet(in6p->in6p_zoneid))) + return (ND_IFINFO(ifp)->chlim); + } else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { bzero(&ro6, sizeof(ro6)); ro6.ro_dst.sin6_family = AF_INET6; ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); @@ -881,15 +696,14 @@ in6_selecthlim(struct inpcb *in6p, struct ifnet *i ro6.ro_dst.sin6_addr = in6p->in6p_faddr; in6_rtalloc(&ro6, in6p->inp_inc.inc_fibnum); if (ro6.ro_rt) { - lifp = ro6.ro_rt->rt_ifp; + ifp = ro6.ro_rt->rt_ifp; RTFREE(ro6.ro_rt); - if (lifp) - return (ND_IFINFO(lifp)->chlim); + if (ifp) + return (ND_IFINFO(ifp)->chlim); } } return (V_ip6_defhlim); } - /* * XXX: this is borrowed from in6_pcbbind(). If possible, we should * share this function by all *bsd*... 
@@ -907,10 +721,12 @@ in6_pcbsetport(struct in6_addr *laddr, struct inpc
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
+#if 0
 	error = prison_local_ip6(cred, laddr,
 	    ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0));
 	if (error)
 		return(error);
+#endif
 
 	/* XXX: this is redundant when called from in6_pcbbind */
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
@@ -949,14 +765,25 @@ addrsel_policy_init(void)
 	ADDRSEL_SXLOCK_INIT();
 }
 
-static struct in6_addrpolicy *
-lookup_addrsel_policy(struct sockaddr_in6 *key)
+/*
+ * Return the label of the address selection policy that matches addr
+ * (in zone zoneid), or the default policy's label if nothing matches.
+ * The use counter of a matched table entry is incremented.
+ */
+static int
+lookup_policy_label(const struct in6_addr *addr, uint32_t zoneid)
 {
+	struct sockaddr_in6 sa6;
 	struct in6_addrpolicy *match = NULL;
+	int label;
 
+	/* Fully initialize the key passed to match_addrsel_policy(). */
+	bzero(&sa6, sizeof(sa6));
+	sa6.sin6_family = AF_INET6;
+	sa6.sin6_len = sizeof(sa6);
+	sa6.sin6_addr = *addr;
+	sa6.sin6_scope_id = zoneid;
+
 	ADDRSEL_LOCK();
-	match = match_addrsel_policy(key);
-
+	match = match_addrsel_policy(&sa6);
 	if (match == NULL)
 		match = &V_defaultaddrpolicy;
 	else
@@ -963,7 +790,9 @@ addrsel_policy_init(void)
 		match->use++;
+	/* Read the label while the lock still protects the entry. */
+	label = match->label;
 	ADDRSEL_UNLOCK();
 
-	return (match);
+	return (label);
 }
 
 /*
@@ -1181,3 +1010,20 @@ match_addrsel_policy(struct sockaddr_in6 *key)
 
 	return (bestpol);
 }
+
+/*
+ * This function is similar to in6_addrscope(), but differs in ways
+ * specific to the source address selection algorithm (RFC 6724).
+ */
+static int
+in6_srcaddrscope(const struct in6_addr *addr)
+{
+
+	/* Mapped 169.254/16 and 127/8 have link-local scope */
+	if (IN6_IS_ADDR_V4MAPPED(addr)) {
+		if (addr->s6_addr[12] == 127 ||
+		    (addr->s6_addr[12] == 169 && addr->s6_addr[13] == 254))
+			return (IPV6_ADDR_SCOPE_LINKLOCAL);
+	}
+	return (in6_addrscope(addr));
+}