Index: sys/netinet6/udp6_usrreq.c
===================================================================
--- sys/netinet6/udp6_usrreq.c	(revision 287250)
+++ sys/netinet6/udp6_usrreq.c	(working copy)
@@ -621,36 +621,40 @@ udp6_getcred(SYSCTL_HANDLER_ARGS)
 SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0,
     0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection");
 
+#define	UH_WLOCKED	2
+#define	UH_RLOCKED	1
+#define	UH_UNLOCKED	0
 static int
-udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
-    struct mbuf *control, struct thread *td)
+udp6_output(struct socket *so, int flags, struct mbuf *m,
+    struct sockaddr *addr6, struct mbuf *control, struct thread *td)
 {
-	u_int32_t ulen = m->m_pkthdr.len;
-	u_int32_t plen = sizeof(struct udphdr) + ulen;
+	struct inpcbinfo *pcbinfo;
+	struct inpcb *inp;
 	struct ip6_hdr *ip6;
 	struct udphdr *udp6;
 	struct in6_addr *laddr, *faddr, in6a;
-	struct sockaddr_in6 *sin6 = NULL;
-	struct ifnet *oifp = NULL;
-	int cscov_partial = 0;
+	struct sockaddr_in6 *sin6;
+	struct ip6_pktopts *optp, opt;
+	struct sockaddr_in6 tmp;
+	int cscov_partial;
+	int error = 0, hlen;
 	int scope_ambiguous = 0;
+	int unlock_udbinfo;
+	u_int32_t plen, ulen;
 	u_short fport;
-	int error = 0;
+	uint16_t cscov;
 	uint8_t nxt;
-	uint16_t cscov = 0;
-	struct ip6_pktopts *optp, opt;
-	int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
-	int flags;
-	struct sockaddr_in6 tmp;
 
-	INP_WLOCK_ASSERT(inp);
-	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
+	/* addr6 was validated in udp6_send(). */
+	sin6 = (struct sockaddr_in6 *)addr6;
 
-	if (addr6) {
-		/* addr6 has been validated in udp6_send(). */
-		sin6 = (struct sockaddr_in6 *)addr6;
+	/*
+	 * Unlike IPv4, we do not validate the max. packet length
+	 * here due to IPv6 Jumbograms (RFC 2675).
+	 */
 
-		/* protect *sin6 from overwrites */
+	if (sin6) {
+		/* Protect *addr6 from overwrites. */
 		tmp = *sin6;
 		sin6 = &tmp;
@@ -664,19 +668,80 @@ static int
 		 */
 		if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
 			scope_ambiguous = 1;
-		if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
+		if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) {
+			if (control)
+				m_freem(control);
+			m_freem(m);
 			return (error);
+		}
 	}
 
+	inp = sotoinpcb(so);
+	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
+	INP_RLOCK(inp);
+	nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
+	    IPPROTO_UDP : IPPROTO_UDPLITE;
+#ifdef INET
+	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
+		int hasv4addr;
+
+		if (sin6 == NULL)
+			hasv4addr = (inp->inp_vflag & INP_IPV4);
+		else
+			hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
+			    ? 1 : 0;
+		if (hasv4addr) {
+			struct pr_usrreqs *pru;
+
+			/*
+			 * XXXRW: We release UDP-layer locks before calling
+			 * udp_send() in order to avoid recursion.  However,
+			 * this does mean there is a short window where inp's
+			 * fields are unstable.  Could this lead to a
+			 * potential race in which the factors causing us to
+			 * select the UDPv4 output routine are invalidated?
+			 */
+			INP_RUNLOCK(inp);
+			if (sin6)
+				in6_sin6_2_sin_in_sock((struct sockaddr *)sin6);
+			pru = inetsw[ip_protox[nxt]].pr_usrreqs;
+			/* addr will just be freed in sendit(). */
+			return ((*pru->pru_send)(so, flags, m,
+			    (struct sockaddr *)sin6, control, td));
+		}
+	}
+#endif
+
 	if (control) {
 		if ((error = ip6_setpktopts(control, &opt,
-		    inp->in6p_outputopts, td->td_ucred, IPPROTO_UDP)) != 0)
-			goto release;
+		    inp->in6p_outputopts, td->td_ucred, nxt)) != 0) {
+			INP_RUNLOCK(inp);
+			ip6_clearpktopts(&opt, -1);
+			m_freem(control);
+			m_freem(m);
+			return (error);
+		}
 		optp = &opt;
 	} else
 		optp = inp->in6p_outputopts;
 
-	if (sin6) {
+	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
+	if (sin6 != NULL &&
+	    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) {
+		INP_RUNLOCK(inp);
+		INP_WLOCK(inp);
+		INP_HASH_WLOCK(pcbinfo);
+		unlock_udbinfo = UH_WLOCKED;
+	} else if (sin6 != NULL &&
+	    (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
+	    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
+	    inp->inp_lport == 0)) {
+		INP_HASH_RLOCK(pcbinfo);
+		unlock_udbinfo = UH_RLOCKED;
+	} else
+		unlock_udbinfo = UH_UNLOCKED;
+
+	if (sin6 != NULL) {
 		faddr = &sin6->sin6_addr;
 
 		/*
@@ -698,37 +763,24 @@ static int
 
 		fport = sin6->sin6_port; /* allow 0 port */
 
-		if (IN6_IS_ADDR_V4MAPPED(faddr)) {
-			if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
-				/*
-				 * I believe we should explicitly discard the
-				 * packet when mapped addresses are disabled,
-				 * rather than send the packet as an IPv6 one.
-				 * If we chose the latter approach, the packet
-				 * might be sent out on the wire based on the
-				 * default route, the situation which we'd
-				 * probably want to avoid.
-				 * (20010421 jinmei@kame.net)
-				 */
-				error = EINVAL;
-				goto release;
-			}
-			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
-			    !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) {
-				/*
-				 * when remote addr is an IPv4-mapped address,
-				 * local addr should not be an IPv6 address,
-				 * since you cannot determine how to map IPv6
-				 * source address to IPv4.
-				 */
-				error = EINVAL;
-				goto release;
-			}
+		/*
+		 * The IN6_IS_ADDR_V4MAPPED() case was already handled
+		 * above, so assert here that it cannot happen.
+		 */
+		KASSERT(!IN6_IS_ADDR_V4MAPPED(faddr),
+		    ("%s: sin6(%p)->sin6_addr is v4mapped which we "
+		    "should have handled.", __func__, sin6));
 
-			af = AF_INET;
-		}
+		/*
+		 * Given the KASSERT above, the check below should no longer
+		 * be needed; otherwise we would need an INP_HASH_WLOCK and
+		 * should call the proper inp function for the following.
+		 * XXX-BZ regression test?
+		 */
 		if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
+			struct ifnet *oifp = NULL;
+
 			error = in6_selectsrc(sin6, optp, inp, NULL,
 			    td->td_ucred, &oifp, &in6a);
 			if (error)
@@ -758,7 +810,7 @@ static int
 			goto release;
 		}
 		if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) {
-			if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
+			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
 				/*
 				 * XXX: this case would happen when the
 				 * application sets the V6ONLY flag after
@@ -770,8 +822,11 @@ static int
 				    "option was set for a connected socket\n");
 				error = EINVAL;
 				goto release;
-			} else
-				af = AF_INET;
+			} else {
+				/* AF_INET */
+				error = EAFNOSUPPORT;
+				goto release;
+			}
 		}
 		laddr = &inp->in6p_laddr;
 		faddr = &inp->in6p_faddr;
@@ -778,8 +833,9 @@ static int
 		fport = inp->inp_fport;
 	}
 
-	if (af == AF_INET)
-		hlen = sizeof(struct ip);
+	ulen = m->m_pkthdr.len;
+	plen = sizeof(struct udphdr) + ulen;
+	hlen = sizeof(struct ip6_hdr);
 
 	/*
 	 * Calculate data length and get a mbuf
@@ -786,7 +842,7 @@ static int
 	 * for UDP and IP6 headers.
 	 */
 	M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT);
-	if (m == 0) {
+	if (m == NULL) {
 		error = ENOBUFS;
 		goto release;
 	}
@@ -794,11 +850,10 @@ static int
 	/*
 	 * Stuff checksum and output datagram.
 	 */
-	nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
-	    IPPROTO_UDP : IPPROTO_UDPLITE;
 	udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen);
 	udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
 	udp6->uh_dport = fport;
+	cscov_partial = cscov = 0;
 	if (nxt == IPPROTO_UDPLITE) {
 		struct udpcb *up;
@@ -818,87 +873,101 @@ static int
 		udp6->uh_ulen = 0;
 	udp6->uh_sum = 0;
 
-	switch (af) {
-	case AF_INET6:
-		ip6 = mtod(m, struct ip6_hdr *);
-		ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK;
-		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
-		ip6->ip6_vfc |= IPV6_VERSION;
-		ip6->ip6_plen = htons((u_short)plen);
-		ip6->ip6_nxt = nxt;
-		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
-		ip6->ip6_src = *laddr;
-		ip6->ip6_dst = *faddr;
+	ip6 = mtod(m, struct ip6_hdr *);
+	ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK;
+	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
+	ip6->ip6_vfc |= IPV6_VERSION;
+	ip6->ip6_plen = htons((u_short)plen);
+	ip6->ip6_nxt = nxt;
+	ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+	ip6->ip6_src = *laddr;
+	ip6->ip6_dst = *faddr;
 
-		if (cscov_partial) {
-			if ((udp6->uh_sum = in6_cksum_partial(m, nxt,
-			    sizeof(struct ip6_hdr), plen, cscov)) == 0)
-				udp6->uh_sum = 0xffff;
-		} else {
-			udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0);
-			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
-			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
-		}
+#ifdef MAC
+	mac_inpcb_create_mbuf(inp, m);
+#endif
+	if (cscov_partial) {
+		if ((udp6->uh_sum = in6_cksum_partial(m, nxt,
+		    sizeof(struct ip6_hdr), plen, cscov)) == 0)
+			udp6->uh_sum = 0xffff;
+	} else {
+		udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0);
+		m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
+		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+	}
+
 #ifdef RSS
-	{
-		uint32_t hash_val, hash_type;
-		uint8_t pr;
+	{
+		uint32_t hash_val, hash_type;
+		uint8_t pr;
 
-		pr = inp->inp_socket->so_proto->pr_protocol;
-		/*
-		 * Calculate an appropriate RSS hash for UDP and
-		 * UDP Lite.
-		 *
-		 * The called function will take care of figuring out
-		 * whether a 2-tuple or 4-tuple hash is required based
-		 * on the currently configured scheme.
-		 *
-		 * Later later on connected socket values should be
-		 * cached in the inpcb and reused, rather than constantly
-		 * re-calculating it.
-		 *
-		 * UDP Lite is a different protocol number and will
-		 * likely end up being hashed as a 2-tuple until
-		 * RSS / NICs grow UDP Lite protocol awareness.
-		 */
-		if (rss_proto_software_hash_v6(faddr, laddr, fport,
-		    inp->inp_lport, pr, &hash_val, &hash_type) == 0) {
-			m->m_pkthdr.flowid = hash_val;
-			M_HASHTYPE_SET(m, hash_type);
-		}
-	}
-#endif
-	flags = 0;
-#ifdef RSS
+		pr = inp->inp_socket->so_proto->pr_protocol;
 		/*
-		 * Don't override with the inp cached flowid.
+		 * Calculate an appropriate RSS hash for UDP and
+		 * UDP Lite.
 		 *
-		 * Until the whole UDP path is vetted, it may actually
-		 * be incorrect.
+		 * The called function will take care of figuring out
+		 * whether a 2-tuple or 4-tuple hash is required based
+		 * on the currently configured scheme.
+		 *
+		 * Later on, connected socket values should be
+		 * cached in the inpcb and reused, rather than constantly
+		 * re-calculating it.
+		 *
+		 * UDP Lite is a different protocol number and will
+		 * likely end up being hashed as a 2-tuple until
+		 * RSS / NICs grow UDP Lite protocol awareness.
 		 */
-		flags |= IP_NODEFAULTFLOWID;
+		if (rss_proto_software_hash_v6(faddr, laddr, fport,
+		    inp->inp_lport, pr, &hash_val, &hash_type) == 0) {
+			m->m_pkthdr.flowid = hash_val;
+			M_HASHTYPE_SET(m, hash_type);
+		}
+	}
 #endif
+	flags = 0;
+#ifdef RSS
+	/*
+	 * Don't override with the inp cached flowid.
+	 *
+	 * Until the whole UDP path is vetted, it may actually
+	 * be incorrect.
+	 */
+	flags |= IP_NODEFAULTFLOWID;
+#endif
 
-		UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
-		UDPSTAT_INC(udps_opackets);
-		error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions,
-		    NULL, inp);
-		break;
-	case AF_INET:
-		error = EAFNOSUPPORT;
-		goto release;
+	UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
+	UDPSTAT_INC(udps_opackets);
+	if (unlock_udbinfo == UH_WLOCKED)
+		INP_HASH_WUNLOCK(pcbinfo);
+	else if (unlock_udbinfo == UH_RLOCKED)
+		INP_HASH_RUNLOCK(pcbinfo);
+	error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions, NULL, inp);
+	if (unlock_udbinfo == UH_WLOCKED)
+		INP_WUNLOCK(inp);
+	else
+		INP_RUNLOCK(inp);
+	if (control) {
+		ip6_clearpktopts(&opt, -1);
+		m_freem(control);
 	}
-	goto releaseopt;
+	return (error);
 
 release:
-	m_freem(m);
-
-releaseopt:
+	if (unlock_udbinfo == UH_WLOCKED) {
+		INP_HASH_WUNLOCK(pcbinfo);
+		INP_WUNLOCK(inp);
+	} else if (unlock_udbinfo == UH_RLOCKED) {
+		INP_HASH_RUNLOCK(pcbinfo);
+		INP_RUNLOCK(inp);
+	} else
+		INP_RUNLOCK(inp);
 	if (control) {
 		ip6_clearpktopts(&opt, -1);
 		m_freem(control);
 	}
+	m_freem(m);
 	return (error);
 }
@@ -1184,15 +1253,8 @@ static int
 udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *td)
 {
-	struct inpcb *inp;
-	struct inpcbinfo *pcbinfo;
-	int error = 0;
+	int error;
 
-	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
-	inp = sotoinpcb(so);
-	KASSERT(inp != NULL, ("udp6_send: inp == NULL"));
-
-	INP_WLOCK(inp);
 	if (addr) {
 		if (addr->sa_len != sizeof(struct sockaddr_in6)) {
 			error = EINVAL;
@@ -1204,50 +1266,11 @@ udp6_send(struct socket *so, int flags, struct mbu
 		}
 	}
 
-#ifdef INET
-	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
-		int hasv4addr;
-		struct sockaddr_in6 *sin6 = 0;
+	return (udp6_output(so, flags, m, addr, control, td));
-
-		if (addr == 0)
-			hasv4addr = (inp->inp_vflag & INP_IPV4);
-		else {
-			sin6 = (struct sockaddr_in6 *)addr;
-			hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
-			    ? 1 : 0;
-		}
-		if (hasv4addr) {
-			struct pr_usrreqs *pru;
-
-			/*
-			 * XXXRW: We release UDP-layer locks before calling
-			 * udp_send() in order to avoid recursion.  However,
-			 * this does mean there is a short window where inp's
-			 * fields are unstable.  Could this lead to a
-			 * potential race in which the factors causing us to
-			 * select the UDPv4 output routine are invalidated?
-			 */
-			INP_WUNLOCK(inp);
-			if (sin6)
-				in6_sin6_2_sin_in_sock(addr);
-			pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs;
-			/* addr will just be freed in sendit(). */
-			return ((*pru->pru_send)(so, flags, m, addr, control,
-			    td));
-		}
-	}
-#endif
-#ifdef MAC
-	mac_inpcb_create_mbuf(inp, m);
-#endif
-	INP_HASH_WLOCK(pcbinfo);
-	error = udp6_output(inp, m, addr, control, td);
-	INP_HASH_WUNLOCK(pcbinfo);
-	INP_WUNLOCK(inp);
-	return (error);
-
+
 bad:
-	INP_WUNLOCK(inp);
+	if (control)
+		m_freem(control);
 	m_freem(m);
 	return (error);
 }
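
Reviewer note, not part of the patch: the cscov_partial branch above keeps the rule that a checksum which computes to 0 is transmitted as 0xffff (RFC 768; the checksum is mandatory for UDP-Lite per RFC 3828). Below is a minimal, self-contained userspace sketch of that partial-coverage checksum arithmetic. The helper name cksum_partial() and the toy buffers are illustrative assumptions only; this is not the kernel's in6_cksum_partial().

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * One's complement sum over a pseudo-header plus the first "coverage"
 * bytes of a datagram, folded to 16 bits and complemented.  A result of
 * 0 is replaced by 0xffff, mirroring the udp6->uh_sum handling above.
 * Illustrative only; not the kernel's in6_cksum_partial().
 */
static uint16_t
cksum_partial(const uint8_t *pseudo, size_t plen, const uint8_t *data,
    size_t coverage)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < plen; i += 2)
		sum += (uint32_t)(pseudo[i] << 8 | pseudo[i + 1]);
	if (plen & 1)
		sum += (uint32_t)pseudo[plen - 1] << 8;
	for (i = 0; i + 1 < coverage; i += 2)
		sum += (uint32_t)(data[i] << 8 | data[i + 1]);
	if (coverage & 1)
		sum += (uint32_t)data[coverage - 1] << 8;

	/* Fold carries, then take the one's complement. */
	while (sum > 0xffff)
		sum = (sum & 0xffff) + (sum >> 16);
	sum = ~sum & 0xffff;

	/* 0 means "no checksum" for UDP, so send 0xffff instead. */
	return (sum == 0 ? 0xffff : (uint16_t)sum);
}

int
main(void)
{
	/* Toy inputs; a real pseudo-header is built from the IPv6 header. */
	uint8_t pseudo[4] = { 0x00, 0x11, 0x00, 0x10 };
	uint8_t dgram[16] = "partial-coverage";

	printf("checksum over 8 of 16 bytes: 0x%04x\n",
	    (unsigned)cksum_partial(pseudo, sizeof(pseudo), dgram, 8));
	return (0);
}

With coverage equal to the full datagram length the same arithmetic reduces to the ordinary UDP checksum, which is the cscov == 0 / non-partial case the patch handles via in6_cksum_pseudo() and checksum offload.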