diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile index 35cffd1a0eb5..40de1f548774 100644 --- a/share/man/man9/Makefile +++ b/share/man/man9/Makefile @@ -62,7 +62,7 @@ MAN+= secmodel_securelevel.9 MLINKS+=secmodel_securelevel.9 securelevel.9 MAN+= secmodel_suser.9 \ SET.9 setbit.9 setjmp.9 shutdownhook_establish.9 \ - signal.9 skpc.9 sockopt.9 softintr.9 spl.9 specificdata.9 \ + signal.9 skpc.9 smr.9 sockopt.9 softintr.9 spl.9 specificdata.9 \ spi.9 splraiseipl.9 \ strlist.9 \ suspendsched.9 \ @@ -803,7 +803,8 @@ MLINKS+=pool_cache.9 pool_cache_init.9 \ pool_cache.9 pool_cache_invalidate.9 \ pool_cache.9 pool_cache_sethiwat.9 \ pool_cache.9 pool_cache_setlowat.9 \ - pool_cache.9 pool_cache_sethardlimit.9 + pool_cache.9 pool_cache_sethardlimit.9 \ + pool_cache.9 pool_cache_set_smr.9 MLINKS+=powerhook_establish.9 powerhook_disestablish.9 MLINKS+=preempt.9 yield.9 MLINKS+=pserialize.9 pserialize_create.9 \ @@ -904,6 +905,17 @@ MLINKS+=signal.9 siginit.9 \ signal.9 sendsig.9 \ signal.9 sigcode.9 \ signal.9 sigtramp.9 +MLINKS+=smr.9 smr_create.9 \ + smr.9 smr_destroy.9 \ + smr.9 smr_enter.9 \ + smr.9 smr_exit.9 \ + smr.9 smr_lazy_enter.9 \ + smr.9 smr_lazy_exit.9 \ + smr.9 smr_advance.9 \ + smr.9 smr_poll.9 \ + smr.9 smr_wait.9 \ + smr.9 smr_synchronize.9 \ + smr.9 pool_cache_set_smr.9 MLINKS+=sockopt.9 sockopt_init.9 \ sockopt.9 sockopt_destroy.9 \ sockopt.9 sockopt_get.9 \ diff --git a/share/man/man9/pool_cache.9 b/share/man/man9/pool_cache.9 index 82115c3a2ee7..512b1f89cf93 100644 --- a/share/man/man9/pool_cache.9 +++ b/share/man/man9/pool_cache.9 @@ -69,7 +69,8 @@ .Nm pool_cache_invalidate , .Nm pool_cache_sethiwat , .Nm pool_cache_setlowat , -.Nm pool_cache_sethardlimit +.Nm pool_cache_sethardlimit , +.Nm pool_cache_set_smr .Nd resource-pool cache manager .\" ------------------------------------------------------------ .Sh SYNOPSIS @@ -118,6 +119,10 @@ .Ft void .Fn pool_cache_sethardlimit \ "pool_cache_t pc" "int n" "const char *warnmess" "int ratecap" 
+.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +.Ft void +.Fn pool_cache_set_smr \ +"pool_cache_t pc" "void *smr" .\" ------------------------------------------------------------ .Sh DESCRIPTION These utility routines provide management of pools of fixed-sized @@ -170,28 +175,29 @@ The offset within an item to which the parameter applies. .It Fa flags .Pp -Should be set to zero, -.Dv PR_NOTOUCH , -or -.Dv PR_PSERIALIZE . -If -.Dv PR_NOTOUCH -is given, free items are never used to keep internal state so that -the pool can be used for non memory backed objects. -If -.Dv PR_PSERIALIZE -is given, then the allocator guarantees that a passive serialization barrier -equivalent to +Should be set to zero or a bitwise OR of the following: +.Bl -tag -width PR_PSERIALIZE +.It Dv PR_NOTOUCH +Free items are never used to keep internal state so that the pool can be +used for non memory backed objects. +.It Dv PR_PSERIALIZE +The allocator guarantees that a passive serialization barrier equivalent to .Dq xc_barrier(0) will be performed before either the object's destructor is called or -before object's backing store is returned to the system. -.Dv PR_PSERIALIZE -implies +before the object's backing store is returned to the system. +Implies .Dv PR_NOTOUCH . Because of the guarantees provided by .Dv PR_PSERIALIZE , objects must never be freed to a pool cache using this option from either hard or soft interrupt context, as doing so may block. +.El +.Pp +The +.Dv PR_SMR +flag is not set directly; it is applied by +.Fn pool_cache_set_smr +(see below). .It Fa name .Pp The name used to identify the object in diagnostic output. @@ -368,6 +374,60 @@ Set the minimum number of total items (both free and allocated) for the backing .Xr pool 9 to .Fa n . +.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +.It Fn pool_cache_set_smr "pc" "smr" +.Pp +Associate the pool cache +.Fa pc +with a Safe Memory Reclamation context +.Fa smr +(see +.Xr smr 9 ) . 
+This must be called after +.Fn pool_cache_init +but before any allocations from the cache. +It sets the +.Dv PR_SMR +and +.Dv PR_NOTOUCH +flags on the cache. +.Pp +When an SMR context is associated with a pool cache, freed objects are not +immediately eligible for reuse. +Instead, they are staged in per-CPU buckets, stamped with an SMR write +sequence number, and placed on a FIFO queue. +Objects are only recycled once +.Fn smr_poll +confirms that all readers that may have observed the object have exited +their read sections. +.Pp +On the allocation side, each CPU maintains a validated bucket of objects +whose grace period has already expired. +Allocations from this bucket require no lock acquisition. +When the validated bucket is empty, a batch is dequeued from the FIFO +and validated as a unit. +This batching amortizes the cost of +.Fn smr_advance +across approximately 15\(en30 freed objects and reduces lock acquisition +on both the allocation and free paths by a similar factor. +.Pp +.Dv PR_SMR +and +.Dv PR_PSERIALIZE +protect different lifecycles and are mutually exclusive: +.Dv PR_PSERIALIZE +gates the return of backing +.Em pages +to the VM system (a coarse-grained barrier via IPI), while +.Dv PR_SMR +gates the reuse of individual cache +.Em objects +(a fine-grained barrier via sequence number tracking). +An SMR pool does not require +.Dv PR_PSERIALIZE +because SMR grace periods already ensure that all readers have exited +before any object on a page can be recycled, and thus before the page +can become empty and eligible for return. 
.El .\" ------------------------------------------------------------ .Sh CODE REFERENCES @@ -380,4 +440,6 @@ subsystem is implemented within the file .Xr kmem 9 , .Xr memoryallocators 9 , .Xr percpu 9 , -.Xr pool 9 +.Xr pool 9 , +.Xr pserialize 9 , +.Xr smr 9 diff --git a/share/man/man9/smr.9 b/share/man/man9/smr.9 new file mode 100644 index 000000000000..fa8326415529 --- /dev/null +++ b/share/man/man9/smr.9 @@ -0,0 +1,705 @@ +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2023 The FreeBSD Foundation +.\" +.\" This documentation was written by Mark Johnston <markj@FreeBSD.org> +.\" under sponsorship from the FreeBSD Foundation. +.\" +.\" Adapted for NetBSD by Kevin Bowling <kevin.bowling@kev009.com>. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.Dd April 20, 2026 +.Dt SMR 9 +.Os +.Sh NAME +.Nm smr +.Nd safe memory reclamation for lock-free data structures +.Sh SYNOPSIS +.In sys/smr.h +.Ft smr_t +.Fo smr_create +.Fa "const char *name" +.Fa "int limit" +.Fa "int flags" +.Fc +.Ft void +.Fo smr_destroy +.Fa "smr_t smr" +.Fc +.Ft void +.Fo smr_enter +.Fa "smr_t smr" +.Fc +.Ft void +.Fo smr_exit +.Fa "smr_t smr" +.Fc +.Ft void +.Fo smr_lazy_enter +.Fa "smr_t smr" +.Fc +.Ft void +.Fo smr_lazy_exit +.Fa "smr_t smr" +.Fc +.Ft smr_seq_t +.Fo smr_advance +.Fa "smr_t smr" +.Fc +.Ft bool +.Fo smr_poll +.Fa "smr_t smr" +.Fa "smr_seq_t goal" +.Fa "bool wait" +.Fc +.Ft void +.Fo smr_wait +.Fa "smr_t smr" +.Fa "smr_seq_t goal" +.Fc +.Ft void +.Fo smr_synchronize +.Fa "smr_t smr" +.Fc +.In sys/pool.h +.Ft void +.Fo pool_cache_set_smr +.Fa "pool_cache_t pc" +.Fa "void *smr" +.Fc +.\" ----- +.Sh DESCRIPTION +Safe Memory Reclamation (SMR) is a facility which enables the implementation of +memory-safe lock-free data structures. +In typical usage, read accesses to an SMR-protected data structure, such as a +hash table or tree, are performed in a +.Dq read section +consisting of code bracketed by +.Fn smr_enter +and +.Fn smr_exit +calls (or their lazy variants), while mutations of the data structure are +serialized by a traditional mutex such as +.Xr mutex 9 . 
+In contrast with reader-writer locks such as +.Xr rwlock 9 , +SMR allows readers and writers to access the data structure concurrently. +Readers can always enter a read section immediately +.Po +.Fn smr_enter +never blocks +.Pc , +so mutations do not introduce read latency. +Furthermore, +.Fn smr_enter +and +.Fn smr_exit +operate only on per-CPU data and thus avoid some of the performance problems +inherent in the implementation of traditional reader-writer mutexes. +SMR can therefore be a useful building block for data structures which are +accessed frequently but are only rarely modified. +.Pp +Note that any SMR-protected data structure must be implemented carefully such +that operations behave correctly in the absence of mutual exclusion between +readers and writers. +The data structure must be designed to be lock-free; SMR merely facilitates +the implementation, for example by making it safe to follow dangling pointers +and by helping avoid the ABA problem. +.Pp +When shared accesses to and mutations of a data structure can proceed +concurrently, writers must take care to ensure that any items removed from the +structure are not freed and recycled while readers are accessing them in +parallel. +This requirement results in a two-phase approach to the removal of items: +first, the item is unlinked such that all pointers to the item are removed from +the structure, preventing any new readers from observing the item. +Then, the writer waits until some mechanism guarantees that no existing readers +are still accessing the item. +At that point the memory for that item can be freed and reused safely. +SMR provides this mechanism: readers may access a lock-free data structure in +between calls to the +.Fn smr_enter +and +.Fn smr_exit +functions, which together create a read section, and the +.Fn smr_advance , +.Fn smr_poll , +.Fn smr_wait , +and +.Fn smr_synchronize +functions can be used to wait for threads in read sections to finish. 
+All of these functions operate on a +.Ft smr_t +state block which holds both per-CPU and global state. +Readers load global state and modify per-CPU state, while writers must scan all +per-CPU states to detect active readers. +SMR is designed to amortize this cost by batching to give acceptable +performance in write-heavy workloads. +.\" ----- +.Ss Readers +Threads enter a read section by calling +.Fn smr_enter +or +.Fn smr_lazy_enter . +Read sections should be short, and many operations are not permitted while in +a read section. +Specifically, kernel preemption is disabled, and thus readers may not acquire +blocking mutexes such as +.Xr mutex 9 +with the +.Dv MUTEX_DEFAULT +type. +The thread is pinned to the current CPU for the duration of the read section. +Furthermore, read sections may not be nested: it is incorrect to call +.Fn smr_enter +or +.Fn smr_lazy_enter +with a given +.Ft smr_t +state block when already in a read section for that state block. +.Pp +.Fn smr_enter +is used for non-lazy SMR contexts and issues a full memory barrier +.Pq Fn membar_sync +on entry. +.Fn smr_lazy_enter +is used for lazy SMR contexts +.Pq created with Dv SMR_LAZY +and does not issue a memory barrier on entry, relying instead on +clock interrupts to flush store buffers. +On exit, +.Fn smr_exit +issues a release barrier while +.Fn smr_lazy_exit +issues an exit barrier. +.Ss Caller IPL Contract +.Fn smr_enter +and +.Fn smr_lazy_enter +use +.Fn kpreempt_disable +internally but do +.Em not +raise the interrupt priority level. +This means that a hardware interrupt can preempt the current thread and +trigger softint dispatch, which may re-enter the same SMR context on the +same CPU, causing a panic due to the non-recursive assertion on the per-CPU +sequence number. 
+.Pp +Callers must observe the following rules: +.Bl -bullet +.It +.Em Softint-context callers +(e.g., protocol input functions called from +.Xr softint 9 +at +.Dv IPL_SOFTNET ) +are inherently safe because they cannot be preempted by a same-level softint. +No additional protection is needed. +.It +.Em User-context callers +(e.g., system calls such as +.Xr bind 2 +or +.Xr connect 2 ) +.Em must +raise the IPL before entering the read section. +For network SMR contexts, wrap with +.Fn splsoftnet +and +.Fn splx : +.Bd -literal -offset indent +int s; + +s = splsoftnet(); +smr_lazy_enter(smr); +/* ... read section ... */ +smr_lazy_exit(smr); +splx(s); +.Ed +.Pp +Any raised IPL prevents softint dispatch on the current CPU, closing the +recursion race. +Future SMR contexts used from other softint levels +.Pq e.g., Dv IPL_SOFTCLOCK , Dv IPL_SOFTBIO +would require the corresponding +.Fn spl* +call. +.El +.Pp +Under +.Dv DIAGNOSTIC +kernels, both +.Fn smr_enter +and +.Fn smr_lazy_enter +check that the caller is either in interrupt context or at a raised IPL, +and panic immediately if called from user context at +.Dv IPL_NONE . +.\" ----- +.Ss Pool Cache Integration +To simplify the integration of SMR into consumers, the +.Xr pool_cache 9 +allocator provides SMR-aware facilities. +This eliminates a good deal of complexity from the implementation of consumers +and automatically batches write operations. +.Pp +A pool cache is associated with an SMR context by calling +.Fn pool_cache_set_smr +after +.Fn pool_cache_init +but before any allocations from the cache. +Objects freed via +.Fn pool_cache_put +are staged in per-CPU buckets and stamped with an SMR sequence number. +The freed objects are placed in a FIFO queue and are not recycled until +.Fn smr_poll +confirms that all readers which may have observed the object have exited +their read sections. 
+.Pp +Allocations via +.Fn pool_cache_get +first check a per-CPU validated bucket of objects that have already passed +their grace period, avoiding any lock acquisition on the fast path. +When the validated bucket is empty, +.Fn pool_cache_get +attempts to dequeue a batch from the FIFO, validates it with +.Fn smr_poll , +and installs it as the new per-CPU allocation bucket. +.Pp +This batching scheme amortizes the cost of +.Fn smr_advance +across approximately 15\(en30 freed objects +.Pq depending on architecture +and reduces lock acquisition on both the allocation and free paths by a +similar factor. +.\" ----- +.Ss Writers +Internally, SMR maintains a global +.Ql write sequence +number. +When entering a read section, +.Fn smr_enter +loads a copy of the write sequence and stores it in per-CPU memory, hence +.Ql observing +that value. +To exit a read section, this per-CPU memory is overwritten with an invalid +value, making the CPU inactive. +Writers perform two operations: advancing the write sequence number, and +polling all CPUs to see whether active readers have observed a given sequence +number. +These operations are performed by +.Fn smr_advance +and +.Fn smr_poll , +respectively, which do not require serialization between multiple writers. +.Pp +After a writer unlinks an item from a data structure, it increments the write +sequence number and tags the item with the new value returned by +.Fn smr_advance . +Once all CPUs have observed the new value, the writer can use +.Fn smr_poll +to deduce that no active readers have access to the unlinked item, and thus the +item is safe to recycle. +Because this pair of operations is relatively expensive, it is generally a good +idea to amortize this cost by accumulating a collection of multiple unlinked +items and tagging the entire batch with a target write sequence number. 
+.Pp +.Fn smr_poll +is a non-blocking operation and returns true only if all active readers are +guaranteed to have observed the target sequence number value. +.Fn smr_wait +is a variant of +.Fn smr_poll +which busy-waits until all CPUs have observed the target sequence number value. +.Fn smr_synchronize +combines +.Fn smr_advance +with +.Fn smr_wait +to wait for all CPUs to observe a new write sequence number. +This is an expensive operation and should only be used if polling cannot be +deferred in some way. +.\" ----- +.Ss Memory Ordering +The +.Fn smr_enter +function has acquire semantics via +.Fn membar_sync , +and the +.Fn smr_exit +function has release semantics via +.Fn atomic_store_release . +.Pp +The +.Fn smr_lazy_enter +function has relaxed store semantics only; it relies on periodic clock +interrupts to serialize with other CPUs. +The +.Fn smr_lazy_exit +function has release semantics via +.Fn membar_exit . +.Pp +The +.Fn smr_advance , +.Fn smr_poll , +.Fn smr_wait , +and +.Fn smr_synchronize +functions should not be assumed to have any guarantees with respect to memory +ordering beyond what is documented in the source. +.Fn smr_advance +issues a release barrier before advancing. +.Fn smr_poll +issues an acquire barrier before returning. +See +.Xr membar_ops 3 +for more details. +.\" ----- +.Sh FUNCTIONS +.Bl -tag -width compact +.It Fn smr_create "name" "limit" "flags" +Create and initialize a new SMR context. +.Fa name +is a human-readable identifier used in diagnostic messages. +.Fa limit +controls the deferred advance interval when +.Dv SMR_DEFERRED +is set (the sequence number is advanced every +.Fa limit +calls to +.Fn smr_advance +rather than every call); set to 0 for default or lazy behavior. +.Fa flags +is a bitwise OR of: +.Bl -tag -width SMR_DEFERRED +.It Dv SMR_LAZY +Enable lazy (tick-based) write sequence advancement. 
+The write sequence advances at the rate of the system clock +.Pq typically 100\(en1000 Hz +rather than on every call to +.Fn smr_advance . +This reduces write-side overhead at the cost of increased reclamation latency +(bounded by 2 clock ticks). +The read-side entry +.Pq Fn smr_lazy_enter +does not issue a full memory barrier, relying on clock interrupts to serialize +store buffers. +Suitable for read-mostly data structures where objects live for at least +milliseconds (e.g., protocol control blocks). +.It Dv SMR_DEFERRED +Enable deferred batching of write sequence advances. +The global write sequence is incremented only every +.Fa limit +calls to +.Fn smr_advance , +amortizing the atomic operation cost. +Intermediate calls return a predicted future sequence number. +Cannot be combined with +.Dv SMR_LAZY . +.El +.It Fn smr_destroy "smr" +Destroy the SMR context. +Calls +.Fn smr_synchronize +internally to ensure all readers have exited before freeing resources. +.It Fn smr_enter "smr" +Enter a read section for a non-lazy SMR context. +Disables kernel preemption and stores the current write sequence number +in per-CPU state with a full memory barrier. +Must not be called on a context created with +.Dv SMR_LAZY ; +use +.Fn smr_lazy_enter +instead. +Must not be nested. +See +.Sx Caller IPL Contract +above. +.It Fn smr_exit "smr" +Exit a non-lazy read section. +Clears the per-CPU sequence number with release semantics and re-enables +kernel preemption. +.It Fn smr_lazy_enter "smr" +Enter a read section for a lazy SMR context +.Pq created with Dv SMR_LAZY . +Disables kernel preemption and stores the current write sequence number +in per-CPU state with relaxed store semantics (no memory barrier). +Must not be called on a non-lazy context; use +.Fn smr_enter +instead. +Must not be nested. +See +.Sx Caller IPL Contract +above. +.It Fn smr_lazy_exit "smr" +Exit a lazy read section. 
+Issues an exit memory barrier, clears the per-CPU sequence number, and +re-enables kernel preemption. +.It Fn smr_advance "smr" +Advance the write sequence number and return the new goal sequence. +The returned value can be saved and later passed to +.Fn smr_poll +or +.Fn smr_wait . +Must not be called from within a read section. +Issues a release barrier before advancing to ensure prior stores are visible +to readers. +.Pp +For +.Dv SMR_LAZY +contexts, the sequence advances at the clock tick rate (bounded by 2 ticks +of grace). +For +.Dv SMR_DEFERRED +contexts, the global sequence may not be updated on every call. +.It Fn smr_poll "smr" "goal" "wait" +Check whether all active readers have observed the sequence number +.Fa goal . +Returns +.Dv true +if the goal has been met. +If +.Fa wait +is +.Dv true , +busy-loops until the goal is met. +If +.Fa wait +is +.Dv false , +returns immediately with the current status. +.Pp +For lazy and deferred contexts, if the goal is ahead of the current write +sequence, +.Fn smr_poll +will attempt to advance the write sequence when +.Fa wait +is +.Dv true . +.Pp +Must not be called with +.Fa wait +set to +.Dv true +from within a read section. +.It Fn smr_wait "smr" "goal" +Equivalent to calling +.Fn smr_poll +with +.Fa wait +set to +.Dv true . +.It Fn smr_synchronize "smr" +Advance the write sequence number and busy-wait until all active readers +have observed the new value. +Equivalent to calling +.Fn smr_wait +with the return value of +.Fn smr_advance . +This is expensive and should be avoided when polling can be deferred. +.It Fn pool_cache_set_smr "pc" "smr" +Associate the pool cache +.Fa pc +with the SMR context +.Fa smr . +Must be called after +.Fn pool_cache_init +but before any allocations from the cache. +Sets the +.Dv PR_SMR +and +.Dv PR_NOTOUCH +flags on the cache. +See +.Sx Pool Cache Integration +above for details on the batched free/reclaim mechanism. 
+.El +.\" ----- +.Sh EXAMPLES +Given a global hash table of connection records protected by SMR: +.Bd -literal +struct conn { + struct conn *c_next; + /* ... */ +}; + +static smr_t conn_smr; +static kmutex_t conn_lock; +static struct conn *conn_hash[HASH_SIZE]; +static pool_cache_t conn_cache; +.Ed +.Pp +Initialize the SMR context and pool cache: +.Bd -literal +conn_smr = smr_create("conn", 0, SMR_LAZY); +conn_cache = pool_cache_init(sizeof(struct conn), + coherency_unit, 0, 0, "connpl", NULL, IPL_NET, + NULL, NULL, NULL); +pool_cache_set_smr(conn_cache, conn_smr); +.Ed +.Pp +Look up a connection, as a reader (from softint context): +.Bd -literal +struct conn *c; +int error = ENOENT; + +smr_lazy_enter(conn_smr); +for (c = atomic_load_consume(&conn_hash[h]); + c != NULL; + c = c->c_next) { + if (c->c_key == key) { + /* Use the connection within the read section. */ + *resultp = c; + error = 0; + break; + } +} +smr_lazy_exit(conn_smr); +return error; +.Ed +.Pp +Look up a connection from user context (e.g., a system call): +.Bd -literal +int s; + +s = splsoftnet(); +smr_lazy_enter(conn_smr); +/* ... same lookup as above ... */ +smr_lazy_exit(conn_smr); +splx(s); +.Ed +.Pp +Remove a connection, as a writer: +.Bd -literal +struct conn **cp, *c; + +mutex_enter(&conn_lock); +for (cp = &conn_hash[h]; (c = *cp) != NULL; cp = &c->c_next) { + if (c->c_key == key) { + *cp = c->c_next; + break; + } +} +mutex_exit(&conn_lock); + +if (c != NULL) { + /* + * Return to the SMR-aware pool cache. The object will not + * be recycled until all readers that may have observed it + * have exited their read sections. + */ + pool_cache_put(conn_cache, c); +} +.Ed +.\" ----- +.Sh ALGORITHM +The SMR implementation is based on the Global Unbounded Sequences (GUS) +algorithm, inspired by epoch-based reclamation. +.Pp +A monotonically increasing write sequence number is maintained globally. +Readers record the most recent write sequence number they have observed in +per-CPU state. 
+A shared read sequence number records the lowest sequence number observed by +any active reader as of the last poll. +Any write older than this value has been observed by all readers and memory +tagged with that sequence number can be reclaimed. +.Pp +Idle readers store an invalid sentinel +.Pq Dv SMR_SEQ_INVALID +in their per-CPU state. +This allows +.Fn smr_poll +to distinguish active readers from idle CPUs. +.Pp +The write and read sequence numbers form a two-handed clock, with readers +always advancing towards writers. +When the system is idle the two hands meet and no deferred memory is +outstanding. +.Pp +A notable distinction between GUS and simpler epoch-based schemes is that +the delta between read and write sequence numbers is unbounded. +This allows finer-grained assignment of sequence numbers even when some +readers have long-lived read sections, and permits writers to advance the +sequence and defer polling to a later time when completion is more likely. +.\" ----- +.Sh NOTES +The algorithm is implemented in +.Pa sys/kern/subr_smr.c +with inline reader functions in +.Pa sys/sys/smr.h . +.Pp +The acronym SMR is used in the academic literature as a generic term for +a family of algorithms enabling memory-safe concurrent access (including +hazard pointers, epoch-based reclamation, and others). +In this context, SMR refers specifically to the GUS algorithm and its +implementation. +.\" ----- +.Sh CODE REFERENCES +The +.Nm +implementation is in +.Pa sys/kern/subr_smr.c . +The inline reader entry/exit functions and type definitions are in +.Pa sys/sys/smr.h +and +.Pa sys/sys/_smr.h . +The pool cache SMR integration is in +.Pa sys/kern/subr_pool.c . +.\" ----- +.Sh SEE ALSO +.Xr membar_ops 3 , +.Xr mutex 9 , +.Xr pool_cache 9 , +.Xr pserialize 9 , +.Xr rwlock 9 , +.Xr softint 9 , +.Xr spl 9 +.\" ----- +.Sh HISTORY +The SMR algorithm and implementation were originally written for +.Fx +by +.An Jeff Roberson Aq Mt jeff@FreeBSD.org . 
+The code was ported to +.Nx +with the addition of the +.Fn smr_lazy_enter +and +.Fn smr_lazy_exit +functions, the caller IPL contract, pool cache integration via +.Fn pool_cache_set_smr , +and the +.Dv SMR_DEFERRED +batching mode. +.Sh AUTHORS +The SMR algorithm and its implementation were provided by +.An Jeff Roberson Aq Mt jeff@FreeBSD.org . +The +.Fx +manual page was written by +.An Mark Johnston Aq Mt markj@FreeBSD.org . +The +.Nx +port and this manual page adaptation were done by +.An Kevin Bowling Aq Mt kevin.bowling@kev009.com . diff --git a/sys/kern/files.kern b/sys/kern/files.kern index 4c8967d61dcd..0b1259773c92 100644 --- a/sys/kern/files.kern +++ b/sys/kern/files.kern @@ -151,6 +151,7 @@ file kern/subr_prf.c kern file kern/subr_prof.c kern file kern/subr_pserialize.c kern file kern/subr_psref.c kern +file kern/subr_smr.c kern file kern/subr_specificdata.c kern file kern/subr_tftproot.c tftproot file kern/subr_time.c kern diff --git a/sys/kern/subr_pool.c b/sys/kern/subr_pool.c index 4cd08a249186..4d025632041a 100644 --- a/sys/kern/subr_pool.c +++ b/sys/kern/subr_pool.c @@ -58,6 +58,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.297 2026/01/04 03:20:29 riastradh Ex #include #include #include +#include #include #include #include @@ -386,6 +387,10 @@ static void pool_cache_invalidate_cpu(pool_cache_t, u_int); static void pool_cache_transfer(pool_cache_t); static int pool_pcg_get(pcg_t *volatile *, pcg_t **); static int pool_pcg_put(pcg_t *volatile *, pcg_t *); +static void pool_cache_smr_put(pool_cache_t, pcg_t *); +static void pool_cache_smr_put_pair(pool_cache_t, pcg_t *, pcg_t *); +static void pool_cache_smr_put_head(pool_cache_t, pcg_t *); +static pcg_t * pool_cache_smr_get(pool_cache_t); static pcg_t * pool_pcg_trunc(pcg_t *volatile *); static int pool_catchup(struct pool *); @@ -2156,6 +2161,10 @@ pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align, pc->pc_refcnt = 0; pc->pc_roflags = flags; pc->pc_freecheck = NULL; + pc->pc_smr = 
NULL; + pc->pc_smr_head = NULL; + pc->pc_smr_tail = NULL; + pc->pc_smr_seq_oldest = SMR_SEQ_INVALID; if ((flags & PR_LARGECACHE) != 0) { pc->pc_pcgsize = PCG_NOBJECTS_LARGE; @@ -2239,6 +2248,10 @@ pool_cache_bootstrap_destroy(pool_cache_t pc) for (i = 0; i < __arraycount(pc->pc_cpus); i++) pool_cache_invalidate_cpu(pc, i); + /* Destroy SMR state if applicable. */ + if (pc->pc_smr != NULL) + mutex_destroy(&pc->pc_smr_lock); + /* Finally, destroy it. */ pool_destroy(pp); } @@ -2276,6 +2289,9 @@ pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc) cc->cc_current = __UNCONST(&pcg_dummy); cc->cc_previous = __UNCONST(&pcg_dummy); + cc->cc_smr_free1 = __UNCONST(&pcg_dummy); + cc->cc_smr_free2 = __UNCONST(&pcg_dummy); + cc->cc_smr_alloc = __UNCONST(&pcg_dummy); cc->cc_pcgcache = pc->pc_pcgcache; cc->cc_hits = 0; cc->cc_misses = 0; @@ -2451,6 +2467,26 @@ pool_cache_invalidate(pool_cache_t pc) ((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull -= n; splx(s); + /* Drain SMR FIFO queue if this is an SMR-enabled pool cache. */ + if (pc->pc_smr != NULL) { + pcg_t *smr_pcg; + /* + * Wait for all SMR readers to complete before + * freeing objects from the FIFO. Without this, + * objects could be reused while readers on other + * CPUs still hold references. 
+ */ + smr_synchronize(pc->pc_smr); + n = 0; + while ((smr_pcg = pool_cache_smr_get(pc)) != NULL) { + n += pool_cache_invalidate_groups(pc, smr_pcg); + } + s = splvm(); + ((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull + -= n; + splx(s); + } + pcg = pool_pcg_trunc(&pc->pc_partgroups); n = pool_cache_invalidate_groups(pc, pcg); s = splvm(); @@ -2478,6 +2514,15 @@ pool_cache_invalidate_cpu(pool_cache_t pc, u_int index) if ((cc = pc->pc_cpus[index]) == NULL) return; + /* + * pool_cache_invalidate() has already xcalled every online CPU + * via pool_cache_transfer, which flushes cc_current/cc_previous + * (non-SMR) or cc_smr_free1/cc_smr_free2/cc_smr_alloc (SMR) and + * drains the SMR FIFO with smr_synchronize. Any non-dummy pcgs + * still here belong to a CPU that was offline at xcall time; + * objects were freed long ago, certainly past any SMR grace + * period, so a direct destruct is safe. + */ if ((pcg = cc->cc_current) != &pcg_dummy) { pcg->pcg_next = NULL; pool_cache_invalidate_groups(pc, pcg); @@ -2486,6 +2531,18 @@ pool_cache_invalidate_cpu(pool_cache_t pc, u_int index) pcg->pcg_next = NULL; pool_cache_invalidate_groups(pc, pcg); } + if ((pcg = cc->cc_smr_free1) != &pcg_dummy) { + pcg->pcg_next = NULL; + pool_cache_invalidate_groups(pc, pcg); + } + if ((pcg = cc->cc_smr_free2) != &pcg_dummy) { + pcg->pcg_next = NULL; + pool_cache_invalidate_groups(pc, pcg); + } + if ((pcg = cc->cc_smr_alloc) != &pcg_dummy) { + pcg->pcg_next = NULL; + pool_cache_invalidate_groups(pc, pcg); + } if (cc != &pc->pc_cpu0) pool_put(&cache_cpu_pool, cc); @@ -2512,6 +2569,119 @@ pool_cache_sethiwat(pool_cache_t pc, int n) pool_sethiwat(&pc->pc_pool, n); } +void +pool_cache_set_smr(pool_cache_t pc, void *smr) +{ + + KASSERT(pc->pc_smr == NULL); + /* Must be called before any allocations from this cache. 
*/ + KASSERT(pc->pc_pool.pr_nout == 0); + pc->pc_smr = smr; + pc->pc_roflags |= PR_SMR | PR_NOTOUCH; + mutex_init(&pc->pc_smr_lock, MUTEX_DEFAULT, IPL_VM); + pc->pc_smr_head = NULL; + pc->pc_smr_tail = NULL; + pc->pc_smr_seq_oldest = SMR_SEQ_INVALID; +} + +/* + * SMR FIFO helpers: groups are appended at the tail (newest) and + * consumed from the head (oldest, most likely to have expired). + * Protected by pc_smr_lock. + * + * pc_smr_seq_oldest caches the head pcg's SMR sequence (or + * SMR_SEQ_INVALID when the FIFO is empty) so the allocation + * fast-reject path can skip the lock via atomic_load_relaxed + + * smr_poll when nothing is ripe. + */ +static void +pool_cache_smr_put(pool_cache_t pc, pcg_t *pcg) +{ + + pcg->pcg_next = NULL; + mutex_spin_enter(&pc->pc_smr_lock); + if (pc->pc_smr_tail != NULL) { + pc->pc_smr_tail->pcg_next = pcg; + } else { + pc->pc_smr_head = pcg; + atomic_store_relaxed(&pc->pc_smr_seq_oldest, + pcg->pcg_smr_seq); + } + pc->pc_smr_tail = pcg; + mutex_spin_exit(&pc->pc_smr_lock); +} + +/* + * Enqueue two full pcgs at the FIFO tail under a single lock + * acquisition. Used by the put_slow flush-both path, where both + * pcgs share the same smr_advance() seq. + */ +static void +pool_cache_smr_put_pair(pool_cache_t pc, pcg_t *first, pcg_t *second) +{ + + first->pcg_next = second; + second->pcg_next = NULL; + mutex_spin_enter(&pc->pc_smr_lock); + if (pc->pc_smr_tail != NULL) { + pc->pc_smr_tail->pcg_next = first; + } else { + pc->pc_smr_head = first; + atomic_store_relaxed(&pc->pc_smr_seq_oldest, + first->pcg_smr_seq); + } + pc->pc_smr_tail = second; + mutex_spin_exit(&pc->pc_smr_lock); +} + +/* + * Re-enqueue a partially drained (or unexpired) pcg at the FIFO head. + * Used by get_slow when the popped pcg is not yet expired, or when + * objects remain after popping one. The pcg's seq is necessarily + * <= any queued pcg, so head placement preserves FIFO-by-seq order. 
+ */ +static void +pool_cache_smr_put_head(pool_cache_t pc, pcg_t *pcg) +{ + + mutex_spin_enter(&pc->pc_smr_lock); + pcg->pcg_next = pc->pc_smr_head; + pc->pc_smr_head = pcg; + if (pc->pc_smr_tail == NULL) + pc->pc_smr_tail = pcg; + atomic_store_relaxed(&pc->pc_smr_seq_oldest, pcg->pcg_smr_seq); + mutex_spin_exit(&pc->pc_smr_lock); +} + +static pcg_t * +pool_cache_smr_get(pool_cache_t pc) +{ + pcg_t *pcg; + + mutex_spin_enter(&pc->pc_smr_lock); + pcg = pc->pc_smr_head; + if (pcg != NULL) { + pc->pc_smr_head = pcg->pcg_next; + if (pc->pc_smr_head == NULL) { + pc->pc_smr_tail = NULL; + atomic_store_relaxed(&pc->pc_smr_seq_oldest, + SMR_SEQ_INVALID); + } else { + atomic_store_relaxed(&pc->pc_smr_seq_oldest, + pc->pc_smr_head->pcg_smr_seq); + } + } + mutex_spin_exit(&pc->pc_smr_lock); + return pcg; +} + +void * +pool_cache_get_smr(pool_cache_t pc) +{ + + return pc->pc_smr; +} + void pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap) { @@ -2570,6 +2740,16 @@ pool_pcg_get(pcg_t *volatile *head, pcg_t **pcgp) membar_datadep_consumer(); /* alpha */ n = atomic_load_relaxed(&o->pcg_next); atomic_store_release(head, n); +#ifdef __OCTEON__ + /* + * Octeon store buffers can linger for hundreds of + * thousands of cycles; a bare syncw drains the + * release store promptly. See mips/include/lock.h + * for details and the XXX about fixing + * atomic_store_release. + */ + __asm volatile("syncw" ::: "memory"); +#endif break; } } @@ -2641,26 +2821,99 @@ pool_cache_get_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, pcg_t *pcg, *cur; void *object; - KASSERT(cc->cc_current->pcg_avail == 0); - KASSERT(cc->cc_previous->pcg_avail == 0); + if (__predict_false(pc->pc_smr != NULL)) { + KASSERT(cc->cc_smr_alloc->pcg_avail == 0); + } else { + KASSERT(cc->cc_current->pcg_avail == 0); + KASSERT(cc->cc_previous->pcg_avail == 0); + } cc->cc_misses++; /* * If there's a full group, release our empty group back to the - * cache. 
Install the full group as cc_current and return. + * cache and install the full group as the fast-path source. + * + * For SMR pools, pull from the FIFO (oldest first) and install + * the ripe pcg as cc_smr_alloc. The entire bucket's worth of + * already-validated objects is then drained by the get fast + * path without re-entering the FIFO lock. + * + * For non-SMR pools, use the regular LIFO pc_fullgroups stack + * and install as cc_current. */ - cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg); - if (__predict_true(pcg != NULL)) { - KASSERT(pcg->pcg_avail == pcg->pcg_size); - if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) { - KASSERT(cur->pcg_avail == 0); - (void)pool_pcg_put(cc->cc_pcgcache, cur); + if (__predict_false(pc->pc_smr != NULL)) { + smr_seq_t seq; + + /* + * Lock-free fast-reject: consult the cached head seq + * before taking pc_smr_lock. A stale read is harmless; + * the per-pcg smr_poll below re-validates. + * + * When the FIFO is empty, call smr_advance() to keep + * the SMR_LAZY write clock (s_wr.seq) moving. Without + * this, batched put-side stamps set goals that never + * ripen on sparse workloads. No other code path ticks + * the lazy clock for this smr context and previously- + * stamped pcgs would accumulate forever in the FIFO. + * (When the FIFO is non-empty, smr_poll itself ticks + * the lazy clock via smr_lazy_advance.) + */ + seq = atomic_load_relaxed(&pc->pc_smr_seq_oldest); + if (seq == SMR_SEQ_INVALID) { + (void)smr_advance(pc->pc_smr); + goto fresh; + } + if (!smr_poll(pc->pc_smr, seq, false)) + goto fresh; + + pcg = pool_cache_smr_get(pc); + if (pcg != NULL) { + KASSERT(pcg->pcg_avail > 0); + if (!smr_poll(pc->pc_smr, pcg->pcg_smr_seq, + false)) { + /* + * Oldest group still has active readers + * (we lost a race against a recent put). + * Put it back at the head and allocate + * fresh. + */ + pool_cache_smr_put_head(pc, pcg); + goto fresh; + } + /* + * SMR grace period expired. 
Install the whole + * pcg as cc_smr_alloc; the retry in get_paddr + * will pop the first object via the fast path. + */ + cur = cc->cc_smr_alloc; + if (__predict_true(cur != &pcg_dummy)) { + KASSERT(cur->pcg_avail == 0); + (void)pool_pcg_put(cc->cc_pcgcache, cur); + } + cc->cc_smr_alloc = pcg; + cc->cc_nfull--; + return true; + } + } else { + cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg); + if (__predict_true(pcg != NULL)) { + KASSERT(pcg->pcg_avail == pcg->pcg_size); + goto install; } - cc->cc_nfull--; - cc->cc_current = pcg; - return true; } + goto fresh; + +install: + if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) { + KASSERT(cur->pcg_avail == 0); + (void)pool_pcg_put(cc->cc_pcgcache, cur); + } + cc->cc_nfull--; + cc->cc_current = pcg; + return true; + +fresh: /* * Nothing available locally or in cache. Take the slow @@ -2736,8 +2989,55 @@ pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap) /* Lock out interrupts and disable preemption. */ s = splvm(); while (/* CONSTCOND */ true) { - /* Try and allocate an object from the current group. */ cc = pc->pc_cpus[curcpu()->ci_index]; + if (__predict_false(cc == NULL)) { + /* + * CPU not yet registered with this pool cache. + */ + if (flags & PR_NOWAIT) { + splx(s); + return NULL; + } + splx(s); + pool_cache_cpu_init1(curcpu(), pc); + s = splvm(); + cc = pc->pc_cpus[curcpu()->ci_index]; + KASSERT(cc != NULL); + } + + /* + * For SMR pools the hot fast path pops from cc_smr_alloc, + * a per-CPU bucket whose objects have already passed a + * grace period (smr_poll verified on bucket install). + * cc_smr_free1 / cc_smr_free2 stage recently-freed + * objects that are NOT yet safe to hand out; the alloc + * side never touches them. This matches FreeBSD's + * UMA_ZONE_SMR uc_allocbucket / uc_freebucket split. 
+ */ + if (__predict_false(pc->pc_smr != NULL)) { + pcg = cc->cc_smr_alloc; + if (__predict_true(pcg->pcg_avail > 0)) { + object = pcg->pcg_objects[--pcg->pcg_avail] + .pcgo_va; + if (__predict_false(pap != NULL)) + *pap = pcg->pcg_objects[pcg->pcg_avail] + .pcgo_pa; +#if defined(DIAGNOSTIC) + pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL; + KASSERT(pcg->pcg_avail < pcg->pcg_size); + KASSERT(object != NULL); +#endif + cc->cc_hits++; + splx(s); + FREECHECK_OUT(&pc->pc_freecheck, object); + pool_redzone_fill(&pc->pc_pool, object); + pool_cache_get_kmsan(pc, object); + return object; + } + goto get_slow; + } + + /* Try and allocate an object from the current group. */ pcg = cc->cc_current; if (__predict_true(pcg->pcg_avail > 0)) { object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va; @@ -2766,6 +3066,7 @@ pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap) cc->cc_current = pcg; continue; } +get_slow: /* * Can't allocate from either group: try the slow path. @@ -2795,8 +3096,16 @@ pool_cache_put_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void *object) { pcg_t *pcg, *cur; - KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size); - KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size); + if (__predict_false(pc->pc_smr != NULL)) { + KASSERT(cc->cc_smr_free1->pcg_avail == + cc->cc_smr_free1->pcg_size); + KASSERT(cc->cc_smr_free2->pcg_avail == + cc->cc_smr_free2->pcg_size); + } else { + KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size); + KASSERT(cc->cc_previous->pcg_avail == + cc->cc_previous->pcg_size); + } cc->cc_misses++; @@ -2816,19 +3125,87 @@ pool_cache_put_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void *object) } /* - * If there's a empty group, release our full group back to the - * cache. Install the empty group to the local CPU and return. + * If there's an empty group, install it and retry. 
The filled + * cc_current is released as follows: + * + * non-SMR: pushed onto pc_fullgroups (LIFO) for any CPU to reuse. + * cc_previous is left in place; the get path drains it + * by swap-when-current-empty. + * SMR: both cc_smr_free1 and cc_smr_free2 are stamped with + * one smr_advance() and enqueued on the SMR FIFO. The + * alloc side reads from cc_smr_alloc only, so leaving + * cc_smr_free2 full would waste memory indefinitely; + * flushing both here also doubles amortization to + * ~2 * pc_pcgsize frees per smr_advance (~30 objects + * on _LP64). + * + * The SMR flush-both branch is reached only when both + * cc_smr_free1 and cc_smr_free2 are full pcgs (the fast path fell + * through) or cc_smr_free2 is pcg_dummy (handled below). + * cc_smr_free1 is never pcg_dummy with cc_smr_free2 non-dummy + * full: you cannot fill cc_smr_free2 without first having filled + * cc_smr_free1. */ if (pcg != NULL) { KASSERT(pcg->pcg_avail == 0); - if (__predict_false(cc->cc_previous == &pcg_dummy)) { + if (__predict_false(pc->pc_smr != NULL)) { + if (__predict_false(cc->cc_smr_free2 == &pcg_dummy)) { + /* + * Normally we install the new empty pcg as + * cc_smr_free2 and wait for it to fill before + * flushing. However, if cc_smr_free1 is full + * and the SMR FIFO is currently empty, publish + * cc_smr_free1 immediately so the alloc side + * has something to ripen. Deferring until + * flush-both would starve the alloc path under + * sparse workloads where per-CPU puts never + * reach the ~2 * pc_pcgsize threshold. + * + * The amortization factor drops from ~2x to 1x + * pc_pcgsize in this branch, but only when the + * FIFO has drained to empty. Once it's primed + * the normal flush-both path below resumes. 
+ */ + cur = cc->cc_smr_free1; + if (cur != &pcg_dummy && + cur->pcg_avail == cur->pcg_size && + atomic_load_relaxed( + &pc->pc_smr_seq_oldest) == + SMR_SEQ_INVALID) { + cur->pcg_smr_seq = + smr_advance(pc->pc_smr); + pool_cache_smr_put(pc, cur); + cc->cc_nfull++; + cc->cc_smr_free1 = pcg; + } else { + cc->cc_smr_free2 = pcg; + } + } else { + pcg_t *prev = cc->cc_smr_free2; + smr_seq_t seq; + + cur = cc->cc_smr_free1; + KASSERT(cur != &pcg_dummy); + KASSERT(cur->pcg_avail == cur->pcg_size); + KASSERT(prev->pcg_avail == prev->pcg_size); + + seq = smr_advance(pc->pc_smr); + cur->pcg_smr_seq = seq; + prev->pcg_smr_seq = seq; + pool_cache_smr_put_pair(pc, cur, prev); + cc->cc_nfull += 2; + cc->cc_smr_free1 = pcg; + cc->cc_smr_free2 = __UNCONST(&pcg_dummy); + } + } else if (__predict_false(cc->cc_previous == &pcg_dummy)) { cc->cc_previous = pcg; } else { cur = cc->cc_current; if (__predict_true(cur != &pcg_dummy)) { KASSERT(cur->pcg_avail == cur->pcg_size); cc->cc_contended += - pool_pcg_put(&pc->pc_fullgroups, cur); + pool_pcg_put(&pc->pc_fullgroups, + cur); cc->cc_nfull++; } cc->cc_current = pcg; @@ -2876,6 +3253,51 @@ pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa) /* Lock out interrupts and disable preemption. */ s = splvm(); + + if (__predict_false(pc->pc_smr != NULL)) { + /* + * SMR put fast path: stage in cc_smr_free1 / cc_smr_free2. + * The allocation side reads from cc_smr_alloc and does not + * see objects staged here until put_slow stamps them with + * smr_advance and queues them on the SMR FIFO. + */ + while (/* CONSTCOND */ true) { + cc = pc->pc_cpus[curcpu()->ci_index]; + if (__predict_false(cc == NULL)) { + /* + * CPU not yet registered with this pool + * cache (transient during CPU hot-plug). + * Initialize and retry. pool_cache_get_paddr + * handles this the same way. 
+ */ + splx(s); + pool_cache_cpu_init1(curcpu(), pc); + s = splvm(); + cc = pc->pc_cpus[curcpu()->ci_index]; + KASSERT(cc != NULL); + } + pcg = cc->cc_smr_free1; + if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { + pcg->pcg_objects[pcg->pcg_avail].pcgo_va = + object; + pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa; + pcg->pcg_avail++; + cc->cc_hits++; + splx(s); + return; + } + pcg = cc->cc_smr_free2; + if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { + cc->cc_smr_free2 = cc->cc_smr_free1; + cc->cc_smr_free1 = pcg; + continue; + } + if (!pool_cache_put_slow(pc, cc, s, object)) + break; + } + return; + } + while (/* CONSTCOND */ true) { /* If the current group isn't full, release it there. */ cc = pc->pc_cpus[curcpu()->ci_index]; @@ -2920,11 +3342,73 @@ static void pool_cache_transfer(pool_cache_t pc) { pool_cache_cpu_t *cc; - pcg_t *prev, *cur; + pcg_t *prev, *cur, *alloc; int s; s = splvm(); cc = pc->pc_cpus[curcpu()->ci_index]; + + if (__predict_false(pc->pc_smr != NULL)) { + smr_seq_t seq = SMR_SEQ_INVALID; + + cur = cc->cc_smr_free1; + cc->cc_smr_free1 = __UNCONST(&pcg_dummy); + prev = cc->cc_smr_free2; + cc->cc_smr_free2 = __UNCONST(&pcg_dummy); + alloc = cc->cc_smr_alloc; + cc->cc_smr_alloc = __UNCONST(&pcg_dummy); + + /* + * Batch a single smr_advance for whichever of the + * two free buckets carry staged objects. 
+ */ + if ((cur != &pcg_dummy && cur->pcg_avail > 0) || + (prev != &pcg_dummy && prev->pcg_avail > 0)) + seq = smr_advance(pc->pc_smr); + + if (cur != &pcg_dummy) { + if (cur->pcg_avail == 0) { + (void)pool_pcg_put(pc->pc_pcgcache, cur); + } else { + cur->pcg_smr_seq = seq; + pool_cache_smr_put(pc, cur); + cc->cc_nfull++; + } + } + if (prev != &pcg_dummy) { + if (prev->pcg_avail == 0) { + (void)pool_pcg_put(pc->pc_pcgcache, prev); + } else { + prev->pcg_smr_seq = seq; + pool_cache_smr_put(pc, prev); + cc->cc_nfull++; + } + } + if (alloc != &pcg_dummy) { + if (alloc->pcg_avail == 0) { + (void)pool_pcg_put(pc->pc_pcgcache, alloc); + } else { + /* + * Validated alloc bucket: the existing + * pcg_smr_seq already passed a grace + * period, so smr_poll will pass + * immediately on the next allocation. + * + * Pushing to FIFO tail may place this + * older seq behind the newer free1/free2 + * seqs we just stamped; pc_smr_seq_oldest + * tracks the head only and per-pcg + * smr_poll is authoritative, so FIFO-by- + * seq monotonicity is not load-bearing. + */ + pool_cache_smr_put(pc, alloc); + cc->cc_nfull++; + } + } + splx(s); + return; + } + cur = cc->cc_current; cc->cc_current = __UNCONST(&pcg_dummy); prev = cc->cc_previous; diff --git a/sys/kern/subr_smr.c b/sys/kern/subr_smr.c new file mode 100644 index 000000000000..e1da0b6796c1 --- /dev/null +++ b/sys/kern/subr_smr.c @@ -0,0 +1,625 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019,2020 Jeffrey Roberson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Ported to NetBSD from FreeBSD's kern/subr_smr.c. + */ + +/* + * Global Unbounded Sequences (GUS) + * + * This is a novel safe memory reclamation technique inspired by + * epoch based reclamation from Samy Al Bahra's concurrency kit which + * in turn was based on work described in: + * Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University + * of Cambridge Computing Laboratory. + * And shares some similarities with: + * Wang, Stamler, Parmer. 2016 Parallel Sections: Scaling System-Level + * Data-Structures + * + * This is not an implementation of hazard pointers or related + * techniques. The term safe memory reclamation is used as a + * generic descriptor for algorithms that defer frees to avoid + * use-after-free errors with lockless datastructures or as + * a mechanism to detect quiescence for writer synchronization. + * + * The basic approach is to maintain a monotonic write sequence + * number that is updated on some application defined granularity. 
+ * Readers record the most recent write sequence number they have + * observed. A shared read sequence number records the lowest + * sequence number observed by any reader as of the last poll. Any + * write older than this value has been observed by all readers + * and memory can be reclaimed. Like Epoch we also detect idle + * readers by storing an invalid sequence number in the per-cpu + * state when the read section exits. Like Parsec we establish + * a global write clock that is used to mark memory on free. + * + * The write and read sequence numbers can be thought of as a two + * handed clock with readers always advancing towards writers. GUS + * maintains the invariant that all readers can safely access memory + * that was visible at the time they loaded their copy of the sequence + * number. Periodically the read sequence or hand is polled and + * advanced as far towards the write sequence as active readers allow. + * Memory which was freed between the old and new global read sequence + * number can now be reclaimed. When the system is idle the two hands + * meet and no deferred memory is outstanding. Readers never advance + * any sequence number, they only observe them. The shared read + * sequence number is consequently never higher than the write sequence. + * A stored sequence number that falls outside of this range has expired + * and needs no scan to reclaim. + * + * A notable distinction between GUS and Epoch, qsbr, rcu, etc. is + * that advancing the sequence number is decoupled from detecting its + * observation. That is to say, the delta between read and write + * sequence numbers is not bound. This can be thought of as a more + * generalized form of epoch which requires them at most one step + * apart. This results in a more granular assignment of sequence + * numbers even as read latencies prohibit all or some expiration. 
+ * It also allows writers to advance the sequence number and save the + * poll for expiration until a later time when it is likely to + * complete without waiting. The batch granularity and free-to-use + * latency is dynamic and can be significantly smaller than in more + * strict systems. + * + * See FreeBSD's kern/subr_smr.c for the full UMA integration + * description, which is not applicable to the NetBSD port. + * + * If the read overhead of accessing the shared cacheline becomes + * especially burdensome an invariant TSC could be used in place of the + * sequence. The algorithm would then only need to maintain the minimum + * observed tsc. This would trade potential cache synchronization + * overhead for local serialization and cpu timestamp overhead. + */ + +/* + * A simplified diagram: + * + * 0 UINT_MAX + * | -------------------- sequence number space -------------------- | + * ^ rd seq ^ wr seq + * | ----- valid sequence numbers ---- | + * ^cpuA ^cpuC + * | -- free -- | --------- deferred frees -------- | ---- free ---- | + * + * + * In this example cpuA has the lowest sequence number and poll can + * advance rd seq. cpuB is not running and is considered to observe + * wr seq. + * + * Freed memory that is tagged with a sequence number between rd seq and + * wr seq can not be safely reclaimed because cpuA may hold a reference to + * it. Any other memory is guaranteed to be unreferenced. + * + * Any writer is free to advance wr seq at any time however it may busy + * poll in pathological cases. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#include +#include +#include +#include +#include +#include +#include /* SPINLOCK_BACKOFF_HOOK */ +#include +#include + +#ifndef DIAGNOSTIC +#define SMR_SEQ_INIT 1 /* All valid sequence numbers are odd. */ +#define SMR_SEQ_INCR 2 + +/* + * SMR_SEQ_MAX_DELTA is the maximum distance allowed between rd_seq and + * wr_seq. 
For the modular arithmetic to work a value of UINT_MAX / 2
+ * would be possible but it is checked after we increment the wr_seq so
+ * a safety margin is left to prevent overflow.
+ *
+ * We will block until SMR_SEQ_MAX_ADVANCE sequence numbers have progressed
+ * to prevent integer wrapping.  See smr_advance() for more details.
+ */
+#define	SMR_SEQ_MAX_DELTA	(UINT_MAX / 4)
+#define	SMR_SEQ_MAX_ADVANCE	(SMR_SEQ_MAX_DELTA - 1024)
+#else
+/* We want to test the wrapping feature in DIAGNOSTIC kernels. */
+#define	SMR_SEQ_INCR	(UINT_MAX / 10000)
+#define	SMR_SEQ_INIT	(UINT_MAX - 100000)
+/* Force extra polls to test the integer overflow detection. */
+#define	SMR_SEQ_MAX_DELTA	(SMR_SEQ_INCR * 32)
+#define	SMR_SEQ_MAX_ADVANCE	SMR_SEQ_MAX_DELTA / 2
+#endif
+
+/*
+ * The grace period for lazy (tick based) SMR.
+ *
+ * Hardclock is responsible for advancing ticks on a single CPU while every
+ * CPU receives a regular clock interrupt.  The clock interrupts are flushing
+ * the store buffers and any speculative loads that may violate our invariants.
+ * Because these interrupts are not synchronized we must wait one additional
+ * tick in the future to be certain that all processors have had their state
+ * synchronized by an interrupt.
+ *
+ * This assumes that the clock interrupt will only be delayed by other causes
+ * that will flush the store buffer or prevent access to the section protected
+ * data.  For example, an idle processor, or a system management interrupt,
+ * or a vm exit.
+ */
+#define	SMR_LAZY_GRACE		2
+#define	SMR_LAZY_INCR		(SMR_LAZY_GRACE * SMR_SEQ_INCR)
+
+/*
+ * The maximum sequence number ahead of wr_seq that may still be valid.  The
+ * sequence may not be advanced on write for lazy or deferred SMRs.  In this
+ * case poll needs to attempt to forward the sequence number if the goal is
+ * within wr_seq + SMR_SEQ_ADVANCE.
+ */
+#define	SMR_SEQ_ADVANCE		SMR_LAZY_INCR
+
+/* Statistics (debug).
*/ +static struct evcnt smr_ev_advance = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "smr", "advance"); +static struct evcnt smr_ev_advance_wait = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "smr", "advance_wait"); +static struct evcnt smr_ev_poll = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "smr", "poll"); +static struct evcnt smr_ev_poll_scan = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "smr", "poll_scan"); +static struct evcnt smr_ev_poll_fail = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "smr", "poll_fail"); + +EVCNT_ATTACH_STATIC(smr_ev_advance); +EVCNT_ATTACH_STATIC(smr_ev_advance_wait); +EVCNT_ATTACH_STATIC(smr_ev_poll); +EVCNT_ATTACH_STATIC(smr_ev_poll_scan); +EVCNT_ATTACH_STATIC(smr_ev_poll_fail); + +/* + * Advance a lazy write sequence number. These move forward at the rate of + * ticks. Grace is SMR_LAZY_INCR (2 ticks) in the future. + * + * This returns the goal write sequence number. + */ +static smr_seq_t +smr_lazy_advance(smr_t smr, smr_shared_t s) +{ + union s_wr s_wr, old; + int t, d; + + KASSERT(kpreempt_disabled()); + + /* + * Load the stored ticks value before the current one. This way the + * current value can only be the same or larger. + */ + old._pair = s_wr._pair = atomic_load_relaxed(&s->s_wr._pair); + t = getticks(); + + /* + * The most probable condition that the update already took place. + */ + d = t - s_wr.ticks; + if (__predict_true(d == 0)) + goto out; + /* Cap the rate of advancement and handle long idle periods. */ + if (d > SMR_LAZY_GRACE || d < 0) + d = SMR_LAZY_GRACE; + s_wr.ticks = t; + s_wr.seq += d * SMR_SEQ_INCR; + + /* + * This can only fail if another thread races to call advance(). + * Strong cmpset semantics mean we are guaranteed that the update + * happened. + */ + (void)atomic_cas_64(&s->s_wr._pair, old._pair, s_wr._pair); +out: + return (s_wr.seq + SMR_LAZY_INCR); +} + +/* + * Increment the shared write sequence by 2. 
Since it is initialized
+ * to 1 this means the only valid values are odd and an observed value
+ * of 0 in a particular CPU means it is not currently in a read section.
+ */
+static smr_seq_t
+smr_shared_advance(smr_shared_t s)
+{
+
+	return (atomic_add_32_nv(&s->s_wr.seq, SMR_SEQ_INCR));
+}
+
+/*
+ * Advance the write sequence number for a normal smr section.  If the
+ * write sequence is too far behind the read sequence we have to poll
+ * to advance rd_seq and prevent undetectable wraps.
+ */
+static smr_seq_t
+smr_default_advance(smr_t smr, smr_shared_t s)
+{
+	smr_seq_t goal, s_rd_seq;
+
+	KASSERT(kpreempt_disabled());
+
+	/*
+	 * Load the current read seq before incrementing the goal so
+	 * we are guaranteed it is always < goal.
+	 */
+	s_rd_seq = atomic_load_acquire(&s->s_rd_seq);
+	goal = smr_shared_advance(s);
+
+	/*
+	 * Force a synchronization here if the goal is getting too
+	 * far ahead of the read sequence number.  This keeps the
+	 * wrap detecting arithmetic working in pathological cases.
+	 */
+	if (SMR_SEQ_DELTA(goal, s_rd_seq) >= SMR_SEQ_MAX_DELTA) {
+		smr_ev_advance_wait.ev_count++;
+		smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);
+	}
+	smr_ev_advance.ev_count++;
+
+	return (goal);
+}
+
+/*
+ * Deferred SMRs conditionally update s_wr_seq based on a
+ * cpu-local interval count.
+ */
+static smr_seq_t
+smr_deferred_advance(smr_t smr, smr_shared_t s, smr_t self)
+{
+
+	if (++self->c_deferred < self->c_limit)
+		return (smr_shared_current(s) + SMR_SEQ_INCR);
+	self->c_deferred = 0;
+	return (smr_default_advance(smr, s));
+}
+
+/*
+ * Advance the write sequence and return the value for use as the
+ * wait goal.  This guarantees that any changes made by the calling
+ * thread prior to this call will be visible to all threads after
+ * rd_seq meets or exceeds the return value.
+ *
+ * This function may busy loop if the readers are roughly 1 billion
+ * sequence numbers behind the writers.
+ *
+ * Lazy SMRs will not busy loop and the wrap happens every 25 days
+ * at 1khz and 60 hours at 10khz.  Readers can block for no longer
+ * than half of this for SMR_SEQ_ macros to continue working.
+ */
+smr_seq_t
+smr_advance(smr_t smr)
+{
+	smr_t self;
+	smr_shared_t s;
+	smr_seq_t goal;
+	int flags;
+
+	/*
+	 * It is illegal to enter while in an smr section.
+	 */
+	SMR_ASSERT_NOT_ENTERED(smr);
+
+	/*
+	 * Modifications not done in a smr section need to be visible
+	 * before advancing the seq.
+	 */
+	membar_exit();
+
+	kpreempt_disable();
+	/* Try to touch the line once. */
+	self = smr_cpu_self(smr);
+	s = self->c_shared;
+	flags = self->c_flags;
+	if ((flags & (SMR_LAZY | SMR_DEFERRED)) == 0)
+		goal = smr_default_advance(smr, s);
+	else if ((flags & SMR_LAZY) != 0)
+		goal = smr_lazy_advance(smr, s);
+	else
+		goal = smr_deferred_advance(smr, s, self);
+	kpreempt_enable();
+
+	return (goal);
+}
+
+/*
+ * Poll to determine the currently observed sequence number on a cpu
+ * and spinwait if the 'wait' argument is true.
+ */
+static smr_seq_t
+smr_poll_cpu(struct smr *c, smr_seq_t s_rd_seq, smr_seq_t goal, bool wait)
+{
+	smr_seq_t c_seq;
+
+	c_seq = SMR_SEQ_INVALID;
+	for (;;) {
+		c_seq = atomic_load_relaxed(&c->c_seq);
+		if (c_seq == SMR_SEQ_INVALID)
+			break;
+
+		/*
+		 * There is a race described in smr.h:smr_enter that
+		 * can lead to a stale seq value but not stale data
+		 * access.  If we find a value out of range here we
+		 * pin it to the current min to prevent it from
+		 * advancing until that stale section has expired.
+		 *
+		 * The race is created when a cpu loads the s_wr_seq
+		 * value in a local register and then another thread
+		 * advances s_wr_seq and calls smr_poll() which will
+		 * observe no value yet in c_seq and advance s_rd_seq
+		 * up to s_wr_seq which is beyond the register
+		 * cached value.  This is only likely to happen on
+		 * hypervisor or with a system management interrupt.
+		 */
+		if (SMR_SEQ_LT(c_seq, s_rd_seq))
+			c_seq = s_rd_seq;
+
+		/*
+		 * If the sequence number meets the goal we are done
+		 * with this cpu.
+		 */
+		if (SMR_SEQ_LEQ(goal, c_seq))
+			break;
+
+		if (!wait)
+			break;
+		SPINLOCK_BACKOFF_HOOK;
+	}
+
+	return (c_seq);
+}
+
+/*
+ * Loop until all cores have observed the goal sequence or have
+ * gone inactive.  Returns the oldest sequence currently active;
+ *
+ * This function assumes a snapshot of sequence values has
+ * been obtained and validated by smr_poll().
+ */
+static smr_seq_t
+smr_poll_scan(smr_t smr, smr_shared_t s, smr_seq_t s_rd_seq,
+    smr_seq_t s_wr_seq, smr_seq_t goal, bool wait)
+{
+	smr_seq_t rd_seq, c_seq;
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+
+	KASSERT(kpreempt_disabled());
+	smr_ev_poll_scan.ev_count++;
+
+	/*
+	 * The read sequence can be no larger than the write sequence at
+	 * the start of the poll.
+	 */
+	rd_seq = s_wr_seq;
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		/*
+		 * Query the active sequence on this cpu.  If we're not
+		 * waiting and we don't meet the goal we will still scan
+		 * the rest of the cpus to update s_rd_seq before returning
+		 * failure.
+		 */
+		c_seq = smr_poll_cpu(smr_cpu_get(smr, cpu_index(ci)),
+		    s_rd_seq, goal, wait);
+
+		/*
+		 * Limit the minimum observed rd_seq whether we met the goal
+		 * or not.
+		 */
+		if (c_seq != SMR_SEQ_INVALID)
+			rd_seq = SMR_SEQ_MIN(rd_seq, c_seq);
+	}
+
+	/*
+	 * Advance the rd_seq as long as we observed a more recent value.
+	 */
+	s_rd_seq = atomic_load_relaxed(&s->s_rd_seq);
+	if (SMR_SEQ_GT(rd_seq, s_rd_seq)) {
+		(void)atomic_cas_32(&s->s_rd_seq, s_rd_seq, rd_seq);
+		s_rd_seq = rd_seq;
+	}
+
+	return (s_rd_seq);
+}
+
+/*
+ * Poll to determine whether all readers have observed the 'goal' write
+ * sequence number.
+ *
+ * If wait is true this will spin until the goal is met.
+ *
+ * This routine will update the minimum observed read sequence number in
+ * s_rd_seq if it does a scan.
It may not do a scan if another call has + * advanced s_rd_seq beyond the callers goal already. + * + * Returns true if the goal is met and false if not. + */ +bool +smr_poll(smr_t smr, smr_seq_t goal, bool wait) +{ + smr_shared_t s; + smr_t self; + smr_seq_t s_wr_seq, s_rd_seq; + smr_delta_t delta; + int flags; + bool success; + + /* + * It is illegal to enter while in an smr section. + */ + KASSERTMSG(!wait || !SMR_ENTERED(smr), + "smr_poll: Blocking not allowed in a SMR section."); + + /* + * Use a critical section so that we can avoid ABA races + * caused by long preemption sleeps. + */ + success = true; + kpreempt_disable(); + /* Attempt to load from self only once. */ + self = smr_cpu_self(smr); + s = self->c_shared; + flags = self->c_flags; + smr_ev_poll.ev_count++; + + /* + * Conditionally advance the lazy write clock on any writer + * activity. + */ + if ((flags & SMR_LAZY) != 0) + smr_lazy_advance(smr, s); + + /* + * Acquire barrier loads s_wr_seq after s_rd_seq so that we can not + * observe an updated read sequence that is larger than write. + */ + s_rd_seq = atomic_load_acquire(&s->s_rd_seq); + + /* + * If we have already observed the sequence number we can immediately + * return success. Most polls should meet this criterion. + */ + if (SMR_SEQ_LEQ(goal, s_rd_seq)) + goto out; + + /* + * wr_seq must be loaded prior to any c_seq value so that a + * stale c_seq can only reference time after this wr_seq. + */ + s_wr_seq = atomic_load_acquire(&s->s_wr.seq); + + /* + * This is the distance from s_wr_seq to goal. Positive values + * are in the future. + */ + delta = SMR_SEQ_DELTA(goal, s_wr_seq); + + /* + * Detect a stale wr_seq. + * + * This goal may have come from a deferred advance or a lazy + * smr. If we are not blocking we can not succeed but the + * sequence number is valid. 
+ */ + if (delta > 0 && delta <= SMR_SEQ_ADVANCE && + (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) { + if (!wait) { + success = false; + goto out; + } + /* Advance wr_seq until it reaches the goal. Lazy SMR + * goals can be SMR_LAZY_INCR ahead, requiring multiple + * advances of SMR_SEQ_INCR each. */ + while (SMR_SEQ_GT(goal, s_wr_seq)) + s_wr_seq = smr_shared_advance(s); + delta = 0; + } + + /* + * Detect an invalid goal. + * + * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for + * it to be valid. If it is not then the caller held on to it and + * the integer wrapped. If we wrapped back within range the caller + * will harmlessly scan. + */ + if (delta > 0) + goto out; + + /* Determine the lowest visible sequence number. */ + s_rd_seq = smr_poll_scan(smr, s, s_rd_seq, s_wr_seq, goal, wait); + success = SMR_SEQ_LEQ(goal, s_rd_seq); +out: + if (!success) + smr_ev_poll_fail.ev_count++; + kpreempt_enable(); + + /* + * Serialize with smr_advance()/smr_exit(). The caller is now free + * to modify memory as expected. + */ + membar_enter(); + + KASSERTMSG(success || !wait, "smr_poll: blocking poll failed"); + return (success); +} + +smr_t +smr_create(const char *name, int limit, int flags) +{ + smr_t smr; + smr_shared_t s; + struct smr *c; + u_int i; + size_t shared_size = roundup(sizeof(*s), COHERENCY_UNIT); + size_t smr_size = MAXCPUS * SMR_CPU_STRIDE; + + s = kmem_zalloc(shared_size, KM_SLEEP); + smr = kmem_zalloc(smr_size, KM_SLEEP); + + s->s_name = name; + s->s_rd_seq = s->s_wr.seq = SMR_SEQ_INIT; + s->s_wr.ticks = getticks(); + + /* Initialize all CPUS, not just those running. 
*/ + for (i = 0; i < MAXCPUS; i++) { + c = smr_cpu_get(smr, i); + c->c_seq = SMR_SEQ_INVALID; + c->c_shared = s; + c->c_deferred = 0; + c->c_limit = limit; + c->c_flags = flags; + } + membar_sync(); + + return (smr); +} + +void +smr_destroy(smr_t smr) +{ + smr_shared_t s; + size_t shared_size = roundup(sizeof(struct smr_shared), COHERENCY_UNIT); + size_t smr_size = MAXCPUS * SMR_CPU_STRIDE; + + smr_synchronize(smr); + s = smr_cpu_get(smr, 0)->c_shared; + kmem_free(s, shared_size); + kmem_free(smr, smr_size); +} + +/* + * Initialize the SMR subsystem. + */ +void +smr_init(void) +{ + + /* Nothing to do -- kmem is used directly. */ +} diff --git a/sys/sys/_smr.h b/sys/sys/_smr.h new file mode 100644 index 000000000000..d2f8367c9e73 --- /dev/null +++ b/sys/sys/_smr.h @@ -0,0 +1,41 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019, 2020 Jeffrey Roberson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Ported to NetBSD from FreeBSD's sys/sys/_smr.h. + */ + +#ifndef _SYS__SMR_H_ +#define _SYS__SMR_H_ + +#include + +typedef uint32_t smr_seq_t; +typedef int32_t smr_delta_t; +typedef struct smr *smr_t; + +#define SMR_SEQ_INVALID 0 + +#endif /* _SYS__SMR_H_ */ diff --git a/sys/sys/pool.h b/sys/sys/pool.h index 2342ca478214..3d20b9e2ba27 100644 --- a/sys/sys/pool.h +++ b/sys/sys/pool.h @@ -36,6 +36,7 @@ #include #include +#include struct pool_sysctl { char pr_wchan[16]; @@ -163,6 +164,7 @@ struct pool { #define PR_ZERO 0x8000 /* zero data before returning */ #define PR_USEBMAP 0x10000 /* use a bitmap to manage freed items */ #define PR_PSERIALIZE 0x20000 /* needs pserialize sync point before free */ +#define PR_SMR 0x40000 /* SMR safe memory reclamation */ /* * `pr_lock' protects the pool's data structures when removing @@ -237,13 +239,33 @@ typedef struct pool_cache_group { struct pool_cache_group *pcg_next; /* link to next group */ u_int pcg_avail; /* # available objects */ u_int pcg_size; /* max number objects */ + smr_seq_t pcg_smr_seq; /* SMR sequence at free time */ pcgpair_t pcg_objects[1]; /* the objects */ } pcg_t; -/* Pool cache CPU. Sized to 64 bytes on _LP64. */ +/* + * Pool cache CPU. + * + * For non-SMR pool_caches only cc_current / cc_previous are live; + * cc_smr_free1 / cc_smr_free2 / cc_smr_alloc are held at pcg_dummy + * and cost only the three pointers of space. 
+ * + * For SMR pool_caches the picture is inverted: + * cc_smr_free1 - primary write-staging bucket (freed, unstamped objects) + * cc_smr_free2 - secondary write-staging bucket (flush-both partner) + * cc_smr_alloc - validated per-CPU allocation bucket (smr_poll expired) + * and cc_current / cc_previous stay at pcg_dummy. The split mirrors + * FreeBSD's UMA_ZONE_SMR (uc_freebucket / uc_allocbucket). The alloc + * bucket lets the hot get path satisfy allocations without taking + * pc_smr_lock while the free buckets amortize smr_advance across + * ~2 * pc_pcgsize frees. + */ typedef struct pool_cache_cpu { - struct pool_cache_group *cc_current; - struct pool_cache_group *cc_previous; + struct pool_cache_group *cc_current; /* non-SMR: LIFO top */ + struct pool_cache_group *cc_previous; /* non-SMR: LIFO bottom */ + struct pool_cache_group *cc_smr_free1; /* SMR: primary write stage */ + struct pool_cache_group *cc_smr_free2; /* SMR: flush-both partner */ + struct pool_cache_group *cc_smr_alloc; /* SMR: validated read bucket */ pcg_t *volatile *cc_pcgcache; uint64_t cc_misses; uint64_t cc_hits; @@ -269,6 +291,7 @@ struct pool_cache { void *pc_arg; /* for ctor/dtor */ unsigned int pc_refcnt; /* ref count for pagedaemon, etc */ unsigned int pc_roflags; /* r/o cache flags */ + void *pc_smr; /* SMR context (smr_t), or NULL */ void *pc_cpus[MAXCPUS]; /* Diagnostic aides. */ @@ -281,6 +304,20 @@ struct pool_cache { __aligned(CACHE_LINE_SIZE); pcg_t *volatile pc_partgroups; /* groups for reclamation */ + /* SMR FIFO queue for deferred-reuse groups (PR_SMR only). */ + kmutex_t pc_smr_lock; /* protects FIFO head/tail */ + pcg_t *pc_smr_head; /* oldest (most likely expired) */ + pcg_t *pc_smr_tail; /* newest (most recently freed) */ + + /* + * Cached seq of FIFO head (SMR_SEQ_INVALID when empty); + * atomic_load_relaxed readable for lock-free alloc fast-reject, + * written under pc_smr_lock. 
Isolated on its own cacheline to + * avoid false sharing with the FIFO writers on pc_smr_lock / + * head / tail above. + */ + smr_seq_t pc_smr_seq_oldest __aligned(CACHE_LINE_SIZE); + /* Boot cpu. */ pool_cache_cpu_t pc_cpu0 __aligned(CACHE_LINE_SIZE); }; @@ -357,6 +394,8 @@ void pool_cache_sethiwat(pool_cache_t, int); void pool_cache_sethardlimit(pool_cache_t, int, const char *, int); void pool_cache_prime(pool_cache_t, int); void pool_cache_cpu_init(struct cpu_info *); +void pool_cache_set_smr(pool_cache_t, void *); +void *pool_cache_get_smr(pool_cache_t); unsigned int pool_cache_nget(pool_cache_t); unsigned int pool_cache_nput(pool_cache_t); diff --git a/sys/sys/smr.h b/sys/sys/smr.h new file mode 100644 index 000000000000..f5857fdf1027 --- /dev/null +++ b/sys/sys/smr.h @@ -0,0 +1,359 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019, 2020 Jeffrey Roberson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Ported to NetBSD from FreeBSD's sys/sys/smr.h. + */ + +#ifndef _SYS_SMR_H_ +#define _SYS_SMR_H_ + +#include +#include +#include +#include /* kpreempt_disable/enable */ + +/* + * Safe memory reclamation. See subr_smr.c for a description of the + * algorithm, and smr_types.h for macros to define and access SMR-protected + * data structures. + * + * Readers synchronize with smr_enter()/exit() and writers may either + * free directly to a SMR-aware allocator or use smr_synchronize or wait. + */ + +/* + * Modular arithmetic for comparing sequence numbers that have + * potentially wrapped. Copied from tcp_seq.h. + */ +#define SMR_SEQ_LT(a, b) ((smr_delta_t)((a)-(b)) < 0) +#define SMR_SEQ_LEQ(a, b) ((smr_delta_t)((a)-(b)) <= 0) +#define SMR_SEQ_GT(a, b) ((smr_delta_t)((a)-(b)) > 0) +#define SMR_SEQ_GEQ(a, b) ((smr_delta_t)((a)-(b)) >= 0) +#define SMR_SEQ_DELTA(a, b) ((smr_delta_t)((a)-(b))) +#define SMR_SEQ_MIN(a, b) (SMR_SEQ_LT((a), (b)) ? (a) : (b)) +#define SMR_SEQ_MAX(a, b) (SMR_SEQ_GT((a), (b)) ? (a) : (b)) + +/* Shared SMR state. */ +union s_wr { + struct { + smr_seq_t seq; /* Current write sequence #. */ + int ticks; /* tick of last update (LAZY) */ + }; + uint64_t _pair; +}; +struct smr_shared { + const char *s_name; /* Name for debugging/reporting. */ + union s_wr s_wr; /* Write sequence */ + smr_seq_t s_rd_seq; /* Minimum observed read sequence. */ +}; +typedef struct smr_shared *smr_shared_t; + +/* Per-cpu SMR state. 
*/ +struct smr { + smr_seq_t c_seq; /* Current observed sequence. */ + smr_shared_t c_shared; /* Shared SMR state. */ + int c_deferred; /* Deferred advance counter. */ + int c_limit; /* Deferred advance limit. */ + int c_flags; /* SMR Configuration */ +}; + +#define SMR_LAZY 0x0001 /* Higher latency write, fast read. */ +#define SMR_DEFERRED 0x0002 /* Aggregate updates to wr_seq. */ + +/* + * Per-CPU access. SMR per-CPU data is allocated as a contiguous + * array indexed by cpu_index(), with each entry padded to a cache + * line to avoid false sharing. + */ +#define SMR_CPU_STRIDE roundup(sizeof(struct smr), COHERENCY_UNIT) + +static __inline struct smr * +smr_cpu_get(smr_t smr, u_int cpuid) +{ + return (struct smr *)((char *)smr + cpuid * SMR_CPU_STRIDE); +} + +static __inline struct smr * +smr_cpu_self(smr_t smr) +{ + return smr_cpu_get(smr, cpu_index(curcpu())); +} + +/* + * SMR_ENTERED: true if the current CPU is in an SMR read section. + */ +#define SMR_ENTERED(smr) \ + (kpreempt_disabled() && \ + smr_cpu_self(smr)->c_seq != SMR_SEQ_INVALID) + +#define SMR_ASSERT_ENTERED(smr) \ + KASSERT(SMR_ENTERED(smr)) + +#define SMR_ASSERT_NOT_ENTERED(smr) \ + KASSERT(!SMR_ENTERED(smr)) + +/* + * Return the current write sequence number. This is not the same as the + * current goal which may be in the future. + */ +static __inline smr_seq_t +smr_shared_current(smr_shared_t s) +{ + + return (atomic_load_relaxed(&s->s_wr.seq)); +} + +static __inline smr_seq_t +smr_current(smr_t smr) +{ + + return (smr_shared_current(smr_cpu_self(smr)->c_shared)); +} + +/* + * Enter a read section. + * + * IMPORTANT CALLER CONTRACT (same as smr_lazy_enter, see comment above + * smr_lazy_enter below): smr_enter uses only kpreempt_disable, it does + * NOT raise IPL. The recursion KASSERT below fires if the same smr + * context is re-entered on the same CPU, e.g. a softint dispatched + * while a user thread was in an smr_enter section. 
+ * + * User-context callers MUST wrap smr_enter / smr_exit pairs with + * splsoftnet() / splx() (or a higher spl matching the softints that + * use this smr context) to prevent softint dispatch on the same CPU. + * Softint-context callers are safe by virtue of their own IPL. + */ +static __inline void +smr_enter(smr_t smr) +{ + struct smr *self; + + kpreempt_disable(); + self = smr_cpu_self(smr); + KASSERTMSG((self->c_flags & SMR_LAZY) == 0, + "smr_enter(%s) lazy smr.", self->c_shared->s_name); + KASSERTMSG(cpu_softintr_p() || cpu_intr_p() || + curcpu()->ci_cpl != IPL_NONE, + "smr_enter(%s) from user context at IPL_NONE", + self->c_shared->s_name); + KASSERTMSG(self->c_seq == SMR_SEQ_INVALID, + "smr_enter(%s) does not support recursion.", + self->c_shared->s_name); + + /* + * Store the current observed write sequence number in our + * per-cpu state so that it can be queried via smr_poll(). + * Frees that are newer than this stored value will be + * deferred until we call smr_exit(). + * + * Subsequent loads must not be re-ordered with the store. + * A full fence (seq_cst) is required to ensure the c_seq store + * is globally visible before subsequent loads (so smr_poll on + * another CPU sees us as an active reader). + * + * It is possible that a long delay between loading the wr_seq + * and storing the c_seq could create a situation where the + * rd_seq advances beyond our stored c_seq. In this situation + * only the observed wr_seq is stale, the fence still orders + * the load. See smr_poll() for details on how this condition + * is detected and handled there. + */ + atomic_store_relaxed(&self->c_seq, + smr_shared_current(self->c_shared)); + membar_sync(); +} + +/* + * Exit a read section. 
+ */ +static __inline void +smr_exit(smr_t smr) +{ + struct smr *self; + + self = smr_cpu_self(smr); + KASSERTMSG(kpreempt_disabled(), "smr_exit(%s): preemption enabled", + self->c_shared->s_name); + KASSERTMSG((self->c_flags & SMR_LAZY) == 0, + "smr_exit(%s) lazy smr.", self->c_shared->s_name); + KASSERTMSG(self->c_seq != SMR_SEQ_INVALID, + "smr_exit(%s) not in a smr section.", self->c_shared->s_name); + + /* + * Clear the recorded sequence number. This allows poll() to + * detect CPUs not in read sections. + * + * Use release semantics to retire any stores before the sequence + * number is cleared. + */ + atomic_store_release(&self->c_seq, SMR_SEQ_INVALID); + kpreempt_enable(); +} + +/* + * Enter a lazy smr section. This is used for read-mostly state that + * can tolerate a high free latency. + * + * IMPORTANT CALLER CONTRACT: smr_lazy_enter uses only kpreempt_disable, + * it does NOT raise the IPL. It must therefore NOT be called from a + * context where an interrupt (hardware or softint) can preempt the + * current thread AND call smr_lazy_enter on the same smr context. The + * KASSERT below would fire ("does not support recursion") because + * c_seq is per-CPU state. + * + * In practice, this means: + * - Softint-context callers (e.g. tcp_input, ipintr, icmp processing) + * are already at their softint IPL and cannot be preempted by a + * same-IPL softint, so no extra protection is needed. + * - User-context callers (e.g. bind/connect, ifnet detach) MUST wrap + * the smr_lazy_enter/exit pair in splsoftnet()/splx() (or higher) + * so that hardware interrupts do not trigger softint_fast_dispatch + * of a softint that would re-enter the same smr context on this + * CPU. See inpcb_lookup_local() in sys/netinet/in_pcb.c for an + * example. 
+ */
+static __inline void
+smr_lazy_enter(smr_t smr)
+{
+	struct smr *self;
+
+	kpreempt_disable();
+	self = smr_cpu_self(smr);
+	KASSERTMSG((self->c_flags & SMR_LAZY) != 0,
+	    "smr_lazy_enter(%s) non-lazy smr.", self->c_shared->s_name);
+	/*
+	 * Recursion-prevention KASSERT.  See the comment above this
+	 * function for the caller IPL contract.
+	 *
+	 * Any elevated IPL (ci_cpl != IPL_NONE) prevents softint
+	 * dispatch on the current CPU, which is what closes the
+	 * recursion race.  cpu_softintr_p() / cpu_intr_p() are a
+	 * defensive belt-and-suspenders for arch ports whose softint
+	 * machinery might not always raise ci_cpl to the softint's
+	 * IPL before calling into C.
+	 *
+	 * This catches the typical violation (user context at
+	 * IPL_NONE) immediately rather than waiting for a rare softint
+	 * preemption to trip the c_seq check.  The generic IPL check
+	 * is chosen instead of a specific level (e.g. IPL_SOFTNET) so
+	 * future non-network SMR contexts at other softint levels
+	 * (IPL_SOFTCLOCK, IPL_SOFTBIO, IPL_SOFTSERIAL) are also
+	 * covered without modification.
+	 */
+	KASSERTMSG(cpu_softintr_p() || cpu_intr_p() ||
+	    curcpu()->ci_cpl != IPL_NONE,
+	    "smr_lazy_enter(%s) from user context at IPL_NONE",
+	    self->c_shared->s_name);
+	KASSERTMSG(self->c_seq == SMR_SEQ_INVALID,
+	    "smr_lazy_enter(%s) does not support recursion.",
+	    self->c_shared->s_name);
+
+	/*
+	 * This needs no serialization.  If an interrupt occurs before we
+	 * assign wr_seq to c_seq any speculative loads will be discarded.
+	 * If we assign a stale wr_seq value due to interrupt we use the
+	 * same algorithm that renders smr_enter() safe.
+	 */
+	atomic_store_relaxed(&self->c_seq,
+	    smr_shared_current(self->c_shared));
+}
+
+/*
+ * Exit a lazy smr section.  This is used for read-mostly state that
+ * can tolerate a high free latency.
+ */ +static __inline void +smr_lazy_exit(smr_t smr) +{ + struct smr *self; + + self = smr_cpu_self(smr); + KASSERTMSG(kpreempt_disabled(), "smr_lazy_exit(%s): preemption enabled", + self->c_shared->s_name); + KASSERTMSG((self->c_flags & SMR_LAZY) != 0, + "smr_lazy_enter(%s) non-lazy smr.", self->c_shared->s_name); + KASSERTMSG(self->c_seq != SMR_SEQ_INVALID, + "smr_lazy_exit(%s) not in a smr section.", + self->c_shared->s_name); + + /* + * All loads/stores must be retired before the sequence becomes + * visible. Another alternative would be to omit the fence but + * store the exit time and wait 1 tick longer. + */ + membar_exit(); + atomic_store_relaxed(&self->c_seq, SMR_SEQ_INVALID); + kpreempt_enable(); +} + +/* + * Advances the write sequence number. Returns the sequence number + * required to ensure that all modifications are visible to readers. + */ +smr_seq_t smr_advance(smr_t smr); + +/* + * Returns true if a goal sequence has been reached. If + * wait is true this will busy loop until success. + */ +bool smr_poll(smr_t smr, smr_seq_t goal, bool wait); + +/* Create a new SMR context. */ +smr_t smr_create(const char *name, int limit, int flags); + +/* Destroy the context. */ +void smr_destroy(smr_t smr); + +/* + * Blocking wait for all readers to observe 'goal'. + */ +static __inline void +smr_wait(smr_t smr, smr_seq_t goal) +{ + + (void)smr_poll(smr, goal, true); +} + +/* + * Synchronize advances the write sequence and returns when all + * readers have observed it. + * + * If your application can cache a sequence number returned from + * smr_advance() and poll or wait at a later time there will + * be less chance of busy looping while waiting for readers. + */ +static __inline void +smr_synchronize(smr_t smr) +{ + + smr_wait(smr, smr_advance(smr)); +} + +/* Only at startup. */ +void smr_init(void); + +#endif /* _SYS_SMR_H_ */