diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index 9013f03ddaa3..ca04cb88b10f 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -2203,14 +2203,14 @@ t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) /* Inline sosend_generic(). */ - error = sblock(sb, SBL_WAIT); + error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); MPASS(error == 0); sendanother: SOCKBUF_LOCK(sb); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(sb); - sbunlock(sb); + SOCK_IO_SEND_UNLOCK(so); if ((so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); @@ -2223,12 +2223,12 @@ t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(sb); - sbunlock(sb); + SOCK_IO_SEND_UNLOCK(so); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { SOCKBUF_UNLOCK(sb); - sbunlock(sb); + SOCK_IO_SEND_UNLOCK(so); error = ENOTCONN; goto out; } @@ -2241,13 +2241,13 @@ t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) */ if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { SOCKBUF_UNLOCK(sb); - sbunlock(sb); + SOCK_IO_SEND_UNLOCK(so); error = ECANCELED; goto out; } TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); SOCKBUF_UNLOCK(sb); - sbunlock(sb); + SOCK_IO_SEND_UNLOCK(so); goto out; } @@ -2274,7 +2274,7 @@ t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) m = alloc_aiotx_mbuf(job, len); if (m == NULL) { - sbunlock(sb); + SOCK_IO_SEND_UNLOCK(so); error = EFAULT; goto out; } @@ -2285,7 +2285,7 @@ t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); - sbunlock(sb); + SOCK_IO_SEND_UNLOCK(so); error = ECONNRESET; goto out; } @@ -2307,7 +2307,7 @@ t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) 
INP_WUNLOCK(inp); if (sendmore) goto sendanother; - sbunlock(sb); + SOCK_IO_SEND_UNLOCK(so); if (error) goto out; diff --git a/sys/dev/hyperv/hvsock/hv_sock.c b/sys/dev/hyperv/hvsock/hv_sock.c index a920d1850c7d..b95b8eebb77d 100644 --- a/sys/dev/hyperv/hvsock/hv_sock.c +++ b/sys/dev/hyperv/hvsock/hv_sock.c @@ -664,18 +664,17 @@ hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) return (EINVAL); - sb = &so->so_rcv; - orig_resid = uio->uio_resid; /* Prevent other readers from entering the socket. */ - error = sblock(sb, SBLOCKWAIT(flags)); + error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) { HVSOCK_DBG(HVSOCK_DBG_ERR, - "%s: sblock returned error = %d\n", __func__, error); + "%s: soiolock returned error = %d\n", __func__, error); return (error); } + sb = &so->so_rcv; SOCKBUF_LOCK(sb); cbarg.uio = uio; @@ -779,8 +778,7 @@ hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, out: SOCKBUF_UNLOCK(sb); - - sbunlock(sb); + SOCK_IO_RECV_UNLOCK(so); /* We recieved a FIN in this call */ if (so->so_error == ESHUTDOWN) { @@ -823,18 +821,17 @@ hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) return (EINVAL); - sb = &so->so_snd; - orig_resid = uio->uio_resid; /* Prevent other writers from entering the socket. 
*/ - error = sblock(sb, SBLOCKWAIT(flags)); + error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) { HVSOCK_DBG(HVSOCK_DBG_ERR, - "%s: sblock returned error = %d\n", __func__, error); + "%s: soiolock returned error = %d\n", __func__, error); return (error); } + sb = &so->so_snd; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTSENDMORE) || @@ -893,7 +890,7 @@ hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, out: SOCKBUF_UNLOCK(sb); - sbunlock(sb); + SOCK_IO_SEND_UNLOCK(so); return (error); } @@ -1674,7 +1671,7 @@ hvsock_detach(device_t dev) { struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); struct socket *so; - int error, retry; + int retry; if (bootverbose) device_printf(dev, "hvsock_detach called.\n"); @@ -1703,8 +1700,7 @@ hvsock_detach(device_t dev) */ if (so) { retry = 0; - while ((error = sblock(&so->so_rcv, 0)) == - EWOULDBLOCK) { + while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) { /* * Someone is reading, rx br is busy */ @@ -1715,8 +1711,7 @@ hvsock_detach(device_t dev) "retry = %d\n", retry++); } retry = 0; - while ((error = sblock(&so->so_snd, 0)) == - EWOULDBLOCK) { + while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) { /* * Someone is sending, tx br is busy */ @@ -1734,8 +1729,8 @@ hvsock_detach(device_t dev) sc->pcb = NULL; if (so) { - sbunlock(&so->so_rcv); - sbunlock(&so->so_snd); + SOCK_IO_RECV_UNLOCK(so); + SOCK_IO_SEND_UNLOCK(so); so->so_pcb = NULL; } diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c index ac1072ca2406..ba1fc201c2de 100644 --- a/sys/kern/kern_sendfile.c +++ b/sys/kern/kern_sendfile.c @@ -668,8 +668,6 @@ sendfile_getsock(struct thread *td, int s, struct file **sock_fp, */ if ((*so)->so_proto->pr_protocol == IPPROTO_SCTP) return (EINVAL); - if (SOLISTENING(*so)) - return (ENOTCONN); return (0); } @@ -741,7 +739,9 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, * XXXRW: Historically this has assumed non-interruptibility, so now * we implement that, but 
possibly shouldn't. */ - (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); + error = SOCK_IO_SEND_LOCK(so, SBL_WAIT | SBL_NOINTR); + if (error != 0) + goto out; #ifdef KERN_TLS tls = ktls_hold(so->so_snd.sb_tls_info); #endif @@ -1211,7 +1211,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { - sbunlock(&so->so_snd); + SOCK_IO_SEND_UNLOCK(so); error = kern_writev(td, sockfd, trl_uio); if (error == 0) sbytes += td->td_retval[0]; @@ -1219,7 +1219,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, } done: - sbunlock(&so->so_snd); + SOCK_IO_SEND_UNLOCK(so); out: /* * If there was no error we have to clear td->td_retval[0] diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index ee615255d16a..28fc7a0a97ec 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -1171,7 +1171,7 @@ ktls_enable_tx(struct socket *so, struct tls_enable *en) return (error); } - error = sblock(&so->so_snd, SBL_WAIT); + error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); if (error) { ktls_cleanup(tls); return (error); @@ -1191,7 +1191,7 @@ ktls_enable_tx(struct socket *so, struct tls_enable *en) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); - sbunlock(&so->so_snd); + SOCK_IO_SEND_UNLOCK(so); counter_u64_add(ktls_offload_total, 1); @@ -1292,7 +1292,7 @@ ktls_set_tx_mode(struct socket *so, int mode) return (error); } - error = sblock(&so->so_snd, SBL_WAIT); + error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); @@ -1307,7 +1307,7 @@ ktls_set_tx_mode(struct socket *so, int mode) */ if (tls != so->so_snd.sb_tls_info) { counter_u64_add(ktls_switch_failed, 1); - sbunlock(&so->so_snd); + SOCK_IO_SEND_UNLOCK(so); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); @@ -1319,7 +1319,7 @@ ktls_set_tx_mode(struct socket *so, int mode) if (tls_new->mode != TCP_TLS_MODE_SW) so->so_snd.sb_flags |= 
SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); - sbunlock(&so->so_snd); + SOCK_IO_SEND_UNLOCK(so); /* * Drop two references on 'tls'. The first is for the diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c index b2202fe15192..bb179043682e 100644 --- a/sys/kern/uipc_sockbuf.c +++ b/sys/kern/uipc_sockbuf.c @@ -475,34 +475,6 @@ sbwait(struct sockbuf *sb) sb->sb_timeo, 0, 0)); } -int -sblock(struct sockbuf *sb, int flags) -{ - - KASSERT((flags & SBL_VALID) == flags, - ("sblock: flags invalid (0x%x)", flags)); - - if (flags & SBL_WAIT) { - if ((sb->sb_flags & SB_NOINTR) || - (flags & SBL_NOINTR)) { - sx_xlock(&sb->sb_sx); - return (0); - } - return (sx_xlock_sig(&sb->sb_sx)); - } else { - if (sx_try_xlock(&sb->sb_sx) == 0) - return (EWOULDBLOCK); - return (0); - } -} - -void -sbunlock(struct sockbuf *sb) -{ - - sx_xunlock(&sb->sb_sx); -} - /* * Wakeup processes waiting on a socket buffer. Do asynchronous notification * via SIGIO if the socket has the SS_ASYNC flag set. diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 602d6c8b4216..b40ee6c1fbe7 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -309,7 +309,7 @@ socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, UMA_ALIGN_PTR, 0); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, @@ -418,12 +418,14 @@ soalloc(struct vnet *vnet) * a feature to change class of an existing lock, so we use DUPOK. 
*/ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); + so->so_snd.sb_mtx = &so->so_snd_mtx; + so->so_rcv.sb_mtx = &so->so_rcv_mtx; SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; - sx_init(&so->so_snd.sb_sx, "so_snd_sx"); - sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); + sx_init(&so->so_snd_sx, "so_snd_sx"); + sx_init(&so->so_rcv_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); @@ -487,8 +489,8 @@ sodealloc(struct socket *so) if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); - sx_destroy(&so->so_snd.sb_sx); - sx_destroy(&so->so_rcv.sb_sx); + sx_destroy(&so->so_snd_sx); + sx_destroy(&so->so_rcv_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); } @@ -899,18 +901,48 @@ solisten(struct socket *so, int backlog, struct thread *td) return (error); } +/* + * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in + * order to interlock with socket I/O. + */ int solisten_proto_check(struct socket *so) { - SOCK_LOCK_ASSERT(so); - if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | - SS_ISDISCONNECTING)) + if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | + SS_ISDISCONNECTING)) != 0) return (EINVAL); + + /* + * Sleeping is not permitted here, so simply fail if userspace is + * attempting to transmit or receive on the socket. This kind of + * transient failure is not ideal, but it should occur only if userspace + * is misusing the socket interfaces. + */ + if (!sx_try_xlock(&so->so_snd_sx)) + return (EAGAIN); + if (!sx_try_xlock(&so->so_rcv_sx)) { + sx_xunlock(&so->so_snd_sx); + return (EAGAIN); + } + mtx_lock(&so->so_snd_mtx); + mtx_lock(&so->so_rcv_mtx); return (0); } +/* + * Undo the setup done by solisten_proto_check(). 
+ */ +void +solisten_proto_abort(struct socket *so) +{ + mtx_unlock(&so->so_snd_mtx); + mtx_unlock(&so->so_rcv_mtx); + sx_xunlock(&so->so_snd_sx); + sx_xunlock(&so->so_rcv_sx); +} + void solisten_proto(struct socket *so, int backlog) { @@ -920,6 +952,9 @@ solisten_proto(struct socket *so, int backlog) sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); + KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | + SS_ISDISCONNECTING)) == 0, + ("%s: bad socket state %p", __func__, so)); if (SOLISTENING(so)) goto listening; @@ -938,10 +973,6 @@ solisten_proto(struct socket *so, int backlog) sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); - sx_destroy(&so->so_snd.sb_sx); - sx_destroy(&so->so_rcv.sb_sx); - SOCKBUF_LOCK_DESTROY(&so->so_snd); - SOCKBUF_LOCK_DESTROY(&so->so_rcv); #ifdef INVARIANTS bzero(&so->so_rcv, @@ -974,6 +1005,11 @@ solisten_proto(struct socket *so, int backlog) if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; + + mtx_unlock(&so->so_snd_mtx); + mtx_unlock(&so->so_rcv_mtx); + sx_xunlock(&so->so_snd_sx); + sx_xunlock(&so->so_rcv_sx); } /* @@ -1180,6 +1216,7 @@ int soclose(struct socket *so) { struct accept_queue lqueue; + struct socket *sp, *tsp; int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); @@ -1214,11 +1251,9 @@ soclose(struct socket *so) if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); + TAILQ_INIT(&lqueue); SOCK_LOCK(so); if (SOLISTENING(so)) { - struct socket *sp; - - TAILQ_INIT(&lqueue); TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); @@ -1236,17 +1271,14 @@ soclose(struct socket *so) KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); - if (SOLISTENING(so)) { - struct socket *sp, *tsp; - - TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { - SOCK_LOCK(sp); - if (sp->so_count == 0) { - SOCK_UNLOCK(sp); - 
soabort(sp); - } else - /* sp is now in sofree() */ - SOCK_UNLOCK(sp); + TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { + SOCK_LOCK(sp); + if (refcount_load(&sp->so_count) == 0) { + SOCK_UNLOCK(sp); + soabort(sp); + } else { + /* sp is now in sofree() */ + SOCK_UNLOCK(sp); } } CURVNET_RESTORE(); @@ -1316,10 +1348,6 @@ soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; - /* XXXMJ racy */ - if (SOLISTENING(so)) - return (EOPNOTSUPP); - CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. @@ -1587,7 +1615,7 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, if (control != NULL) clen = control->m_len; - error = sblock(&so->so_snd, SBLOCKWAIT(flags)); + error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) goto out; @@ -1785,7 +1813,7 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, } while (resid); release: - sbunlock(&so->so_snd); + SOCK_IO_SEND_UNLOCK(so); out: #ifdef KERN_TLS if (tls != NULL) @@ -1805,14 +1833,8 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, int error; CURVNET_SET(so->so_vnet); - if (!SOLISTENING(so)) - error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, - top, control, flags, td); - else { - m_freem(top); - m_freem(control); - error = ENOTCONN; - } + error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, + top, control, flags, td); CURVNET_RESTORE(); return (error); } @@ -1932,7 +1954,7 @@ soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, (*pr->pr_usrreqs->pru_rcvd)(so, 0); } - error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); + error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); @@ -2387,7 +2409,7 @@ soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, if (flagsp != NULL) *flagsp |= flags; release: - sbunlock(&so->so_rcv); + SOCK_IO_RECV_UNLOCK(so); return (error); } @@ -2434,7 +2456,7 @@ 
soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, #endif /* Prevent other readers from entering the socket. */ - error = sblock(sb, SBLOCKWAIT(flags)); + error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); SOCKBUF_LOCK(sb); @@ -2442,7 +2464,7 @@ soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, #ifdef KERN_TLS if (sb->sb_tls_info != NULL) { SOCKBUF_UNLOCK(sb); - sbunlock(sb); + SOCK_IO_RECV_UNLOCK(so); return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); } @@ -2605,11 +2627,10 @@ soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: - SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); - sbunlock(sb); + SOCK_IO_RECV_UNLOCK(so); return (error); } @@ -2808,11 +2829,8 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, int error; CURVNET_SET(so->so_vnet); - if (!SOLISTENING(so)) - error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, - mp0, controlp, flagsp)); - else - error = ENOTCONN; + error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, + mp0, controlp, flagsp)); CURVNET_RESTORE(); return (error); } @@ -2876,6 +2894,7 @@ sorflush(struct socket *so) struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; struct socket aso; + int error; VNET_SO_ASSERT(so); @@ -2893,7 +2912,9 @@ sorflush(struct socket *so) * despite any existing socket disposition on interruptable waiting. 
*/ socantrcvmore(so); - (void) sblock(sb, SBL_WAIT | SBL_NOINTR); + error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); + KASSERT(error == 0, ("%s: cannot lock sock %p recv buffer", + __func__, so)); /* * Invalidate/clear most of the sockbuf structure, but leave selinfo @@ -2907,7 +2928,7 @@ sorflush(struct socket *so) bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); - sbunlock(sb); + SOCK_IO_RECV_UNLOCK(so); /* * Dispose of special rights and flush the copied socket. Don't call @@ -4100,6 +4121,39 @@ soisdisconnected(struct socket *so) wakeup(&so->so_timeo); } +int +soiolock(struct socket *so, struct sx *sx, int flags) +{ + int error; + + KASSERT((flags & SBL_VALID) == flags, + ("soiolock: invalid flags %#x", flags)); + + if ((flags & SBL_WAIT) != 0) { + if ((flags & SBL_NOINTR) != 0) { + sx_xlock(sx); + } else { + error = sx_xlock_sig(sx); + if (error != 0) + return (error); + } + } else if (!sx_try_xlock(sx)) { + return (EWOULDBLOCK); + } + + if (__predict_false(SOLISTENING(so))) { + sx_xunlock(sx); + return (ENOTCONN); + } + return (0); +} + +void +soiounlock(struct sx *sx) +{ + sx_xunlock(sx); +} + /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 
*/ diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 3208dc0491dd..44079bae1e9b 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -484,7 +484,7 @@ kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; - int error, interrupted = 0; + int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) @@ -522,11 +522,8 @@ kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); - if (error != 0) { - if (error == EINTR || error == ERESTART) - interrupted = 1; + if (error != 0) break; - } } if (error == 0) { error = so->so_error; @@ -534,8 +531,6 @@ kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) } SOCK_UNLOCK(so); bad: - if (!interrupted) - so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index c736f35b5ee0..5add930bfa8e 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -890,13 +890,17 @@ uipc_listen(struct socket *so, int backlog, struct thread *td) if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET) return (EOPNOTSUPP); + /* + * Synchronize with concurrent connection attempts. + */ + error = 0; unp = sotounpcb(so); - KASSERT(unp != NULL, ("uipc_listen: unp == NULL")); - UNP_PCB_LOCK(unp); - if (unp->unp_vnode == NULL) { - /* Already connected or not bound to an address. */ - error = unp->unp_conn != NULL ? 
EINVAL : EDESTADDRREQ; + if (unp->unp_conn != NULL || (unp->unp_flags & UNP_CONNECTING) != 0) + error = EINVAL; + else if (unp->unp_vnode == NULL) + error = EDESTADDRREQ; + if (error != 0) { UNP_PCB_UNLOCK(unp); return (error); } @@ -1523,6 +1527,7 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam, bcopy(soun->sun_path, buf, len); buf[len] = 0; + error = 0; unp = sotounpcb(so); UNP_PCB_LOCK(unp); for (;;) { @@ -1538,13 +1543,16 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam, * lock the peer socket, to ensure that unp_conn cannot * transition between two valid sockets while locks are dropped. */ - if (unp->unp_conn != NULL) { - UNP_PCB_UNLOCK(unp); - return (EISCONN); + if (SOLISTENING(so)) + error = EOPNOTSUPP; + else if (unp->unp_conn != NULL) + error = EISCONN; + else if ((unp->unp_flags & UNP_CONNECTING) != 0) { + error = EALREADY; } - if ((unp->unp_flags & UNP_CONNECTING) != 0) { + if (error != 0) { UNP_PCB_UNLOCK(unp); - return (EALREADY); + return (error); } if (unp->unp_pairbusy > 0) { unp->unp_flags |= UNP_WAITING; diff --git a/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c b/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c index cd620fe3aef9..18d7a89b7a2f 100644 --- a/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c +++ b/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c @@ -2506,14 +2506,17 @@ ng_btsocket_l2cap_listen(struct socket *so, int backlog, struct thread *td) if (error != 0) goto out; if (pcb == NULL) { + solisten_proto_abort(so); error = EINVAL; goto out; } if (ng_btsocket_l2cap_node == NULL) { + solisten_proto_abort(so); error = EINVAL; goto out; } if (pcb->psm == 0) { + solisten_proto_abort(so); error = EADDRNOTAVAIL; goto out; } diff --git a/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c b/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c index c0704bce55fa..5b7bbeb45407 100644 --- a/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c +++ b/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c @@ -894,6 
+894,7 @@ ng_btsocket_rfcomm_listen(struct socket *so, int backlog, struct thread *td) * from socreate() */ if (l2so == NULL) { + solisten_proto_abort(so); error = socreate_error; goto out; } @@ -907,8 +908,10 @@ ng_btsocket_rfcomm_listen(struct socket *so, int backlog, struct thread *td) */ error = ng_btsocket_rfcomm_session_create(&s, l2so, NG_HCI_BDADDR_ANY, NULL, td); - if (error != 0) + if (error != 0) { + solisten_proto_abort(so); goto out; + } l2so = NULL; } SOCK_LOCK(so); diff --git a/sys/netinet/sctp_usrreq.c b/sys/netinet/sctp_usrreq.c index 822a8ffb534f..0b1e9b5e1836 100644 --- a/sys/netinet/sctp_usrreq.c +++ b/sys/netinet/sctp_usrreq.c @@ -7210,7 +7210,8 @@ sctp_listen(struct socket *so, int backlog, struct thread *p) } } } - SCTP_INP_RLOCK(inp); + SCTP_INP_INFO_WLOCK(); + SCTP_INP_WLOCK(inp); #ifdef SCTP_LOCK_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) { sctp_log_lock(inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_SOCK); @@ -7218,10 +7219,9 @@ sctp_listen(struct socket *so, int backlog, struct thread *p) #endif SOCK_LOCK(so); error = solisten_proto_check(so); - SOCK_UNLOCK(so); if (error) { - SCTP_INP_RUNLOCK(inp); - return (error); + SOCK_UNLOCK(so); + goto out; } if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { @@ -7232,39 +7232,44 @@ sctp_listen(struct socket *so, int backlog, struct thread *p) * move the guy that was listener to the TCP Pool. 
*/ if (sctp_swap_inpcb_for_listen(inp)) { - SCTP_INP_RUNLOCK(inp); - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); - return (EADDRINUSE); + SOCK_UNLOCK(so); + solisten_proto_abort(so); + error = EADDRINUSE; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + goto out; } } if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { - /* We are already connected AND the TCP model */ - SCTP_INP_RUNLOCK(inp); - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); - return (EADDRINUSE); + SOCK_UNLOCK(so); + solisten_proto_abort(so); + error = EADDRINUSE; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + goto out; } - SCTP_INP_RUNLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { - /* We must do a bind. */ - if ((error = sctp_inpcb_bind(so, NULL, NULL, p))) { + if ((error = sctp_inpcb_bind_locked(inp, NULL, NULL, p))) { + SOCK_UNLOCK(so); + solisten_proto_abort(so); /* bind error, probably perm */ - return (error); + goto out; } } - SCTP_INP_WLOCK(inp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) == 0) { - SOCK_LOCK(so); solisten_proto(so, backlog); - SOCK_UNLOCK(so); + } else { + solisten_proto_abort(so); } + SOCK_UNLOCK(so); if (backlog > 0) { inp->sctp_flags |= SCTP_PCB_FLAGS_ACCEPTING; } else { inp->sctp_flags &= ~SCTP_PCB_FLAGS_ACCEPTING; } +out: SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); return (error); } diff --git a/sys/netinet/sctputil.c b/sys/netinet/sctputil.c index f331fb70ded5..0993f8eb302f 100644 --- a/sys/netinet/sctputil.c +++ b/sys/netinet/sctputil.c @@ -4796,10 +4796,10 @@ sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp, old_so = old_inp->sctp_socket; new_so = new_inp->sctp_socket; TAILQ_INIT(&tmp_queue); - error = sblock(&old_so->so_rcv, waitflags); + error = SOCK_IO_RECV_LOCK(old_so, waitflags); if (error) { /* - * Gak, can't get sblock, we have a problem. 
data will be + * Gak, can't get I/O lock, we have a problem. data will be * left stranded.. and we don't dare look at it since the * other thread may be reading something. Oh well, its a * screwed up app that does a peeloff OR a accept while @@ -4831,9 +4831,8 @@ sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp, } } SCTP_INP_READ_UNLOCK(old_inp); - /* Remove the sb-lock on the old socket */ - - sbunlock(&old_so->so_rcv); + /* Remove the recv-lock on the old socket */ + SOCK_IO_RECV_UNLOCK(old_so); /* Now we move them over to the new socket buffer */ SCTP_INP_READ_LOCK(new_inp); TAILQ_FOREACH_SAFE(control, &tmp_queue, next, nctl) { @@ -5586,7 +5585,7 @@ sctp_sorecvmsg(struct socket *so, rwnd_req, block_allowed, so->so_rcv.sb_cc, (uint32_t)uio->uio_resid); } - error = sblock(&so->so_rcv, (block_allowed ? SBL_WAIT : 0)); + error = SOCK_IO_RECV_LOCK(so, (block_allowed ? SBL_WAIT : 0)); if (error) { goto release_unlocked; } @@ -6234,8 +6233,8 @@ sctp_sorecvmsg(struct socket *so, } /* * We need to wait for more data a few things: - We don't - * sbunlock() so we don't get someone else reading. - We - * must be sure to account for the case where what is added + * release the I/O lock so we don't get someone else reading. + * - We must be sure to account for the case where what is added * is NOT to our control when we wakeup. 
*/ @@ -6383,7 +6382,7 @@ sctp_sorecvmsg(struct socket *so, hold_sblock = 0; } - sbunlock(&so->so_rcv); + SOCK_IO_RECV_UNLOCK(so); sockbuf_lock = 0; release_unlocked: @@ -6418,7 +6417,7 @@ sctp_sorecvmsg(struct socket *so, SOCKBUF_UNLOCK(&so->so_rcv); } if (sockbuf_lock) { - sbunlock(&so->so_rcv); + SOCK_IO_RECV_UNLOCK(so); } if (freecnt_applied) { diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index bcd7d18d9d62..3a1608cc106a 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -457,10 +457,15 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); - INP_HASH_WLOCK(&V_tcbinfo); - if (error == 0 && inp->inp_lport == 0) - error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - INP_HASH_WUNLOCK(&V_tcbinfo); + if (error != 0) { + SOCK_UNLOCK(so); + goto out; + } + if (inp->inp_lport == 0) { + INP_HASH_WLOCK(&V_tcbinfo); + error = in_pcbbind(inp, NULL, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); + } if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); @@ -468,6 +473,8 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif + } else { + solisten_proto_abort(so); } SOCK_UNLOCK(so); @@ -504,12 +511,16 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); + if (error != 0) { + SOCK_UNLOCK(so); + goto out; + } INP_HASH_WLOCK(&V_tcbinfo); - if (error == 0 && inp->inp_lport == 0) { + if (inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; - error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + error = in6_pcbbind(inp, NULL, td->td_ucred); } INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { @@ -519,6 +530,8 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) if 
((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif + } else { + solisten_proto_abort(so); } SOCK_UNLOCK(so); @@ -581,6 +594,10 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) error = ECONNREFUSED; goto out; } + if (SOLISTENING(so)) { + error = EOPNOTSUPP; + goto out; + } tp = intotcpcb(inp); TCPDEBUG1(); NET_EPOCH_ENTER(et); @@ -643,6 +660,10 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) error = ECONNREFUSED; goto out; } + if (SOLISTENING(so)) { + error = EINVAL; + goto out; + } tp = intotcpcb(inp); TCPDEBUG1(); #ifdef INET @@ -1021,6 +1042,10 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, TCPDEBUG1(); if (nam != NULL && tp->t_state < TCPS_SYN_SENT) { + if (tp->t_state == TCPS_LISTEN) { + error = EINVAL; + goto out; + } switch (nam->sa_family) { #ifdef INET case AF_INET: @@ -1119,6 +1144,9 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, sbappendstream(&so->so_snd, m, flags); m = NULL; if (nam && tp->t_state < TCPS_SYN_SENT) { + KASSERT(tp->t_state == TCPS_CLOSED, + ("%s: tp %p is listening", __func__, tp)); + /* * Do implied connect if not yet connected, * initialize window to default value, and diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c index a38bdfcbed59..ed9dd1fcb224 100644 --- a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c +++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c @@ -1105,7 +1105,7 @@ sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, td->td_ru.ru_msgsnd++; ssk = sdp_sk(so); - error = sblock(&so->so_snd, SBLOCKWAIT(flags)); + error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) goto out; @@ -1196,7 +1196,7 @@ sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, } while (resid); release: - sbunlock(&so->so_snd); + SOCK_IO_SEND_UNLOCK(so); out: if (top != NULL) m_freem(top); @@ -1267,9 +1267,9 @@ sdp_sorecv(struct socket 
*so, struct sockaddr **psa, struct uio *uio, ssk = sdp_sk(so); /* Prevent other readers from entering the socket. */ - error = sblock(sb, SBLOCKWAIT(flags)); + error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) - goto out; + return (error); SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ @@ -1423,11 +1423,10 @@ sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio, if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: - SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); - sbunlock(sb); + SOCK_IO_RECV_UNLOCK(so); return (error); } diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h index 4c56f4eaf234..3b345870bd5f 100644 --- a/sys/sys/sockbuf.h +++ b/sys/sys/sockbuf.h @@ -78,14 +78,13 @@ struct selinfo; * * Locking key to struct sockbuf: * (a) locked by SOCKBUF_LOCK(). - * (b) locked by sblock() */ -struct sockbuf { - struct mtx sb_mtx; /* sockbuf lock */ - struct sx sb_sx; /* prevent I/O interlacing */ +struct sockbuf { + struct mtx *sb_mtx; /* sockbuf lock */ struct selinfo *sb_sel; /* process selecting read/write */ short sb_state; /* (a) socket state on sockbuf */ -#define sb_startzero sb_mb +#define sb_startzero sb_flags + short sb_flags; /* (a) flags, see above */ struct mbuf *sb_mb; /* (a) the mbuf chain */ struct mbuf *sb_mbtail; /* (a) the last mbuf in the chain */ struct mbuf *sb_lastrecord; /* (a) first mbuf of last @@ -109,7 +108,6 @@ struct sockbuf { struct ktls_session *sb_tls_info; /* (a + b) TLS state */ struct mbuf *sb_mtls; /* (a) TLS mbuf chain */ struct mbuf *sb_mtlstail; /* (a) last mbuf in TLS chain */ - short sb_flags; /* (a) flags, see above */ int (*sb_upcall)(struct socket *, void *, int); /* (a) */ void *sb_upcallarg; /* (a) */ TAILQ_HEAD(, kaiocb) sb_aiojobq; /* (a) pending AIO ops */ @@ -123,7 +121,7 @@ struct sockbuf { * Per-socket buffer mutex used to protect most fields in the socket * buffer. 
*/ -#define SOCKBUF_MTX(_sb) (&(_sb)->sb_mtx) +#define SOCKBUF_MTX(_sb) ((_sb)->sb_mtx) #define SOCKBUF_LOCK_INIT(_sb, _name) \ mtx_init(SOCKBUF_MTX(_sb), _name, NULL, MTX_DEF) #define SOCKBUF_LOCK_DESTROY(_sb) mtx_destroy(SOCKBUF_MTX(_sb)) @@ -183,8 +181,6 @@ struct mbuf * struct mbuf * sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff); int sbwait(struct sockbuf *sb); -int sblock(struct sockbuf *sb, int flags); -void sbunlock(struct sockbuf *sb); void sballoc(struct sockbuf *, struct mbuf *); void sbfree(struct sockbuf *, struct mbuf *); void sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m); diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index 47033fdabbfa..301f53e68435 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -91,13 +91,13 @@ struct socket { volatile u_int so_count; /* (b / refcount) */ struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */ struct selinfo so_wrsel; /* (b/cs) for so_snd */ - short so_type; /* (a) generic type, see socket.h */ int so_options; /* (b) from socket call, see socket.h */ - short so_linger; /* time to linger close(2) */ + short so_type; /* (a) generic type, see socket.h */ short so_state; /* (b) internal state flags SS_* */ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ + short so_linger; /* time to linger close(2) */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ u_short so_rerror; /* (f) error affecting connection */ @@ -121,6 +121,17 @@ struct socket { int so_ts_clock; /* type of the clock used for timestamps */ uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */ + + /* + * Mutexes to prevent interleaving of socket I/O. These have to be + * outside of the socket buffers in order to interlock with listen(2). 
+ */ + struct sx so_snd_sx __aligned(CACHE_LINE_SIZE); + struct mtx so_snd_mtx; + + struct sx so_rcv_sx __aligned(CACHE_LINE_SIZE); + struct mtx so_rcv_mtx; + union { /* Regular (data flow) socket. */ struct { @@ -255,6 +266,15 @@ struct socket { #define SBL_NOINTR 0x00000002 /* Force non-interruptible sleep. */ #define SBL_VALID (SBL_WAIT | SBL_NOINTR) +#define SOCK_IO_SEND_LOCK(so, flags) \ + soiolock((so), &(so)->so_snd_sx, (flags)) +#define SOCK_IO_SEND_UNLOCK(so) \ + soiounlock(&(so)->so_snd_sx) +#define SOCK_IO_RECV_LOCK(so, flags) \ + soiolock((so), &(so)->so_rcv_sx, (flags)) +#define SOCK_IO_RECV_UNLOCK(so) \ + soiounlock(&(so)->so_rcv_sx) + /* * Do we need to notify the other side when I/O is possible? */ @@ -433,6 +453,7 @@ void sofree(struct socket *so); void sohasoutofband(struct socket *so); int solisten(struct socket *so, int backlog, struct thread *td); void solisten_proto(struct socket *so, int backlog); +void solisten_proto_abort(struct socket *so); int solisten_proto_check(struct socket *so); int solisten_dequeue(struct socket *, struct socket **, int); struct socket * @@ -484,6 +505,8 @@ void socantsendmore(struct socket *so); void socantsendmore_locked(struct socket *so); void soroverflow(struct socket *so); void soroverflow_locked(struct socket *so); +int soiolock(struct socket *so, struct sx *sx, int flags); +void soiounlock(struct sx *sx); /* * Accept filter functions (duh).