diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile index 35cffd1a0eb5..40de1f548774 100644 --- a/share/man/man9/Makefile +++ b/share/man/man9/Makefile @@ -62,7 +62,7 @@ MAN+= secmodel_securelevel.9 MLINKS+=secmodel_securelevel.9 securelevel.9 MAN+= secmodel_suser.9 \ SET.9 setbit.9 setjmp.9 shutdownhook_establish.9 \ - signal.9 skpc.9 sockopt.9 softintr.9 spl.9 specificdata.9 \ + signal.9 skpc.9 smr.9 sockopt.9 softintr.9 spl.9 specificdata.9 \ spi.9 splraiseipl.9 \ strlist.9 \ suspendsched.9 \ @@ -803,7 +803,8 @@ MLINKS+=pool_cache.9 pool_cache_init.9 \ pool_cache.9 pool_cache_invalidate.9 \ pool_cache.9 pool_cache_sethiwat.9 \ pool_cache.9 pool_cache_setlowat.9 \ - pool_cache.9 pool_cache_sethardlimit.9 + pool_cache.9 pool_cache_sethardlimit.9 \ + pool_cache.9 pool_cache_set_smr.9 MLINKS+=powerhook_establish.9 powerhook_disestablish.9 MLINKS+=preempt.9 yield.9 MLINKS+=pserialize.9 pserialize_create.9 \ @@ -904,6 +905,17 @@ MLINKS+=signal.9 siginit.9 \ signal.9 sendsig.9 \ signal.9 sigcode.9 \ signal.9 sigtramp.9 +MLINKS+=smr.9 smr_create.9 \ + smr.9 smr_destroy.9 \ + smr.9 smr_enter.9 \ + smr.9 smr_exit.9 \ + smr.9 smr_lazy_enter.9 \ + smr.9 smr_lazy_exit.9 \ + smr.9 smr_advance.9 \ + smr.9 smr_poll.9 \ + smr.9 smr_wait.9 \ + smr.9 smr_synchronize.9 \ + smr.9 pool_cache_set_smr.9 MLINKS+=sockopt.9 sockopt_init.9 \ sockopt.9 sockopt_destroy.9 \ sockopt.9 sockopt_get.9 \ diff --git a/share/man/man9/pool_cache.9 b/share/man/man9/pool_cache.9 index 82115c3a2ee7..512b1f89cf93 100644 --- a/share/man/man9/pool_cache.9 +++ b/share/man/man9/pool_cache.9 @@ -69,7 +69,8 @@ .Nm pool_cache_invalidate , .Nm pool_cache_sethiwat , .Nm pool_cache_setlowat , -.Nm pool_cache_sethardlimit +.Nm pool_cache_sethardlimit , +.Nm pool_cache_set_smr .Nd resource-pool cache manager .\" ------------------------------------------------------------ .Sh SYNOPSIS @@ -118,6 +119,10 @@ .Ft void .Fn pool_cache_sethardlimit \ "pool_cache_t pc" "int n" "const char *warnmess" "int ratecap" 
+.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +.Ft void +.Fn pool_cache_set_smr \ +"pool_cache_t pc" "void *smr" .\" ------------------------------------------------------------ .Sh DESCRIPTION These utility routines provide management of pools of fixed-sized @@ -170,28 +175,29 @@ The offset within an item to which the parameter applies. .It Fa flags .Pp -Should be set to zero, -.Dv PR_NOTOUCH , -or -.Dv PR_PSERIALIZE . -If -.Dv PR_NOTOUCH -is given, free items are never used to keep internal state so that -the pool can be used for non memory backed objects. -If -.Dv PR_PSERIALIZE -is given, then the allocator guarantees that a passive serialization barrier -equivalent to +Should be set to zero or a bitwise OR of the following: +.Bl -tag -width PR_PSERIALIZE +.It Dv PR_NOTOUCH +Free items are never used to keep internal state so that the pool can be +used for non memory backed objects. +.It Dv PR_PSERIALIZE +The allocator guarantees that a passive serialization barrier equivalent to .Dq xc_barrier(0) will be performed before either the object's destructor is called or -before object's backing store is returned to the system. -.Dv PR_PSERIALIZE -implies +before the object's backing store is returned to the system. +Implies .Dv PR_NOTOUCH . Because of the guarantees provided by .Dv PR_PSERIALIZE , objects must never be freed to a pool cache using this option from either hard or soft interrupt context, as doing so may block. +.El +.Pp +The +.Dv PR_SMR +flag is not set directly; it is applied by +.Fn pool_cache_set_smr +(see below). .It Fa name .Pp The name used to identify the object in diagnostic output. @@ -368,6 +374,60 @@ Set the minimum number of total items (both free and allocated) for the backing .Xr pool 9 to .Fa n . +.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +.It Fn pool_cache_set_smr "pc" "smr" +.Pp +Associate the pool cache +.Fa pc +with a Safe Memory Reclamation context +.Fa smr +(see +.Xr smr 9 ) . 
+This must be called after +.Fn pool_cache_init +but before any allocations from the cache. +It sets the +.Dv PR_SMR +and +.Dv PR_NOTOUCH +flags on the cache. +.Pp +When an SMR context is associated with a pool cache, freed objects are not +immediately eligible for reuse. +Instead, they are staged in per-CPU buckets, stamped with an SMR write +sequence number, and placed on a FIFO queue. +Objects are only recycled once +.Fn smr_poll +confirms that all readers that may have observed the object have exited +their read sections. +.Pp +On the allocation side, each CPU maintains a validated bucket of objects +whose grace period has already expired. +Allocations from this bucket require no lock acquisition. +When the validated bucket is empty, a batch is dequeued from the FIFO +and validated as a unit. +This batching amortizes the cost of +.Fn smr_advance +across approximately 15\(en30 freed objects and reduces lock acquisition +on both the allocation and free paths by a similar factor. +.Pp +.Dv PR_SMR +and +.Dv PR_PSERIALIZE +protect different lifecycles and are mutually exclusive: +.Dv PR_PSERIALIZE +gates the return of backing +.Em pages +to the VM system (a coarse-grained barrier via IPI), while +.Dv PR_SMR +gates the reuse of individual cache +.Em objects +(a fine-grained barrier via sequence number tracking). +An SMR pool does not require +.Dv PR_PSERIALIZE +because SMR grace periods already ensure that all readers have exited +before any object on a page can be recycled, and thus before the page +can become empty and eligible for return. 
.El .\" ------------------------------------------------------------ .Sh CODE REFERENCES @@ -380,4 +440,6 @@ subsystem is implemented within the file .Xr kmem 9 , .Xr memoryallocators 9 , .Xr percpu 9 , -.Xr pool 9 +.Xr pool 9 , +.Xr pserialize 9 , +.Xr smr 9 diff --git a/share/man/man9/smr.9 b/share/man/man9/smr.9 new file mode 100644 index 000000000000..fa8326415529 --- /dev/null +++ b/share/man/man9/smr.9 @@ -0,0 +1,705 @@ +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2023 The FreeBSD Foundation +.\" +.\" This documentation was written by Mark Johnston <markj@FreeBSD.org> +.\" under sponsorship from the FreeBSD Foundation. +.\" +.\" Adapted for NetBSD by Kevin Bowling <kevin.bowling@kev009.com>. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.Dd April 20, 2026 +.Dt SMR 9 +.Os +.Sh NAME +.Nm smr +.Nd safe memory reclamation for lock-free data structures +.Sh SYNOPSIS +.In sys/smr.h +.Ft smr_t +.Fo smr_create +.Fa "const char *name" +.Fa "int limit" +.Fa "int flags" +.Fc +.Ft void +.Fo smr_destroy +.Fa "smr_t smr" +.Fc +.Ft void +.Fo smr_enter +.Fa "smr_t smr" +.Fc +.Ft void +.Fo smr_exit +.Fa "smr_t smr" +.Fc +.Ft void +.Fo smr_lazy_enter +.Fa "smr_t smr" +.Fc +.Ft void +.Fo smr_lazy_exit +.Fa "smr_t smr" +.Fc +.Ft smr_seq_t +.Fo smr_advance +.Fa "smr_t smr" +.Fc +.Ft bool +.Fo smr_poll +.Fa "smr_t smr" +.Fa "smr_seq_t goal" +.Fa "bool wait" +.Fc +.Ft void +.Fo smr_wait +.Fa "smr_t smr" +.Fa "smr_seq_t goal" +.Fc +.Ft void +.Fo smr_synchronize +.Fa "smr_t smr" +.Fc +.In sys/pool.h +.Ft void +.Fo pool_cache_set_smr +.Fa "pool_cache_t pc" +.Fa "void *smr" +.Fc +.\" ----- +.Sh DESCRIPTION +Safe Memory Reclamation (SMR) is a facility which enables the implementation of +memory-safe lock-free data structures. +In typical usage, read accesses to an SMR-protected data structure, such as a +hash table or tree, are performed in a +.Dq read section +consisting of code bracketed by +.Fn smr_enter +and +.Fn smr_exit +calls (or their lazy variants), while mutations of the data structure are +serialized by a traditional mutex such as +.Xr mutex 9 . 
+In contrast with reader-writer locks such as +.Xr rwlock 9 , +SMR allows readers and writers to access the data structure concurrently. +Readers can always enter a read section immediately +.Po +.Fn smr_enter +never blocks +.Pc , +so mutations do not introduce read latency. +Furthermore, +.Fn smr_enter +and +.Fn smr_exit +operate only on per-CPU data and thus avoid some of the performance problems +inherent in the implementation of traditional reader-writer mutexes. +SMR can therefore be a useful building block for data structures which are +accessed frequently but are only rarely modified. +.Pp +Note that any SMR-protected data structure must be implemented carefully such +that operations behave correctly in the absence of mutual exclusion between +readers and writers. +The data structure must be designed to be lock-free; SMR merely facilitates +the implementation, for example by making it safe to follow dangling pointers +and by helping avoid the ABA problem. +.Pp +When shared accesses to and mutations of a data structure can proceed +concurrently, writers must take care to ensure that any items removed from the +structure are not freed and recycled while readers are accessing them in +parallel. +This requirement results in a two-phase approach to the removal of items: +first, the item is unlinked such that all pointers to the item are removed from +the structure, preventing any new readers from observing the item. +Then, the writer waits until some mechanism guarantees that no existing readers +are still accessing the item. +At that point the memory for that item can be freed and reused safely. +SMR provides this mechanism: readers may access a lock-free data structure in +between calls to the +.Fn smr_enter +and +.Fn smr_exit +functions, which together create a read section, and the +.Fn smr_advance , +.Fn smr_poll , +.Fn smr_wait , +and +.Fn smr_synchronize +functions can be used to wait for threads in read sections to finish. 
+All of these functions operate on a +.Ft smr_t +state block which holds both per-CPU and global state. +Readers load global state and modify per-CPU state, while writers must scan all +per-CPU states to detect active readers. +SMR is designed to amortize this cost by batching to give acceptable +performance in write-heavy workloads. +.\" ----- +.Ss Readers +Threads enter a read section by calling +.Fn smr_enter +or +.Fn smr_lazy_enter . +Read sections should be short, and many operations are not permitted while in +a read section. +Specifically, kernel preemption is disabled, and thus readers may not acquire +blocking mutexes such as +.Xr mutex 9 +with the +.Dv MUTEX_DEFAULT +type. +The thread is pinned to the current CPU for the duration of the read section. +Furthermore, read sections may not be nested: it is incorrect to call +.Fn smr_enter +or +.Fn smr_lazy_enter +with a given +.Ft smr_t +state block when already in a read section for that state block. +.Pp +.Fn smr_enter +is used for non-lazy SMR contexts and issues a full memory barrier +.Pq Fn membar_sync +on entry. +.Fn smr_lazy_enter +is used for lazy SMR contexts +.Pq created with Dv SMR_LAZY +and does not issue a memory barrier on entry, relying instead on +clock interrupts to flush store buffers. +On exit, +.Fn smr_exit +issues a release barrier while +.Fn smr_lazy_exit +issues an exit barrier. +.Ss Caller IPL Contract +.Fn smr_enter +and +.Fn smr_lazy_enter +use +.Fn kpreempt_disable +internally but do +.Em not +raise the interrupt priority level. +This means that a hardware interrupt can preempt the current thread and +trigger softint dispatch, which may re-enter the same SMR context on the +same CPU, causing a panic due to the non-recursive assertion on the per-CPU +sequence number. 
+.Pp +Callers must observe the following rules: +.Bl -bullet +.It +.Em Softint-context callers +(e.g., protocol input functions called from +.Xr softint 9 +at +.Dv IPL_SOFTNET ) +are inherently safe because they cannot be preempted by a same-level softint. +No additional protection is needed. +.It +.Em User-context callers +(e.g., system calls such as +.Xr bind 2 +or +.Xr connect 2 ) +.Em must +raise the IPL before entering the read section. +For network SMR contexts, wrap with +.Fn splsoftnet +and +.Fn splx : +.Bd -literal -offset indent +int s; + +s = splsoftnet(); +smr_lazy_enter(smr); +/* ... read section ... */ +smr_lazy_exit(smr); +splx(s); +.Ed +.Pp +Any raised IPL prevents softint dispatch on the current CPU, closing the +recursion race. +Future SMR contexts used from other softint levels +.Pq e.g., Dv IPL_SOFTCLOCK , Dv IPL_SOFTBIO +would require the corresponding +.Fn spl* +call. +.El +.Pp +Under +.Dv DIAGNOSTIC +kernels, both +.Fn smr_enter +and +.Fn smr_lazy_enter +check that the caller is either in interrupt context or at a raised IPL, +and panic immediately if called from user context at +.Dv IPL_NONE . +.\" ----- +.Ss Pool Cache Integration +To simplify the integration of SMR into consumers, the +.Xr pool_cache 9 +allocator provides SMR-aware facilities. +This eliminates a good deal of complexity from the implementation of consumers +and automatically batches write operations. +.Pp +A pool cache is associated with an SMR context by calling +.Fn pool_cache_set_smr +after +.Fn pool_cache_init +but before any allocations from the cache. +Objects freed via +.Fn pool_cache_put +are staged in per-CPU buckets and stamped with an SMR sequence number. +The freed objects are placed in a FIFO queue and are not recycled until +.Fn smr_poll +confirms that all readers which may have observed the object have exited +their read sections. 
+.Pp +Allocations via +.Fn pool_cache_get +first check a per-CPU validated bucket of objects that have already passed +their grace period, avoiding any lock acquisition on the fast path. +When the validated bucket is empty, +.Fn pool_cache_get +attempts to dequeue a batch from the FIFO, validates it with +.Fn smr_poll , +and installs it as the new per-CPU allocation bucket. +.Pp +This batching scheme amortizes the cost of +.Fn smr_advance +across approximately 15\(en30 freed objects +.Pq depending on architecture +and reduces lock acquisition on both the allocation and free paths by a +similar factor. +.\" ----- +.Ss Writers +Internally, SMR maintains a global +.Ql write sequence +number. +When entering a read section, +.Fn smr_enter +loads a copy of the write sequence and stores it in per-CPU memory, hence +.Ql observing +that value. +To exit a read section, this per-CPU memory is overwritten with an invalid +value, making the CPU inactive. +Writers perform two operations: advancing the write sequence number, and +polling all CPUs to see whether active readers have observed a given sequence +number. +These operations are performed by +.Fn smr_advance +and +.Fn smr_poll , +respectively, which do not require serialization between multiple writers. +.Pp +After a writer unlinks an item from a data structure, it increments the write +sequence number and tags the item with the new value returned by +.Fn smr_advance . +Once all CPUs have observed the new value, the writer can use +.Fn smr_poll +to deduce that no active readers have access to the unlinked item, and thus the +item is safe to recycle. +Because this pair of operations is relatively expensive, it is generally a good +idea to amortize this cost by accumulating a collection of multiple unlinked +items and tagging the entire batch with a target write sequence number. 
+.Pp +.Fn smr_poll +is a non-blocking operation and returns true only if all active readers are +guaranteed to have observed the target sequence number value. +.Fn smr_wait +is a variant of +.Fn smr_poll +which busy-waits until all CPUs have observed the target sequence number value. +.Fn smr_synchronize +combines +.Fn smr_advance +with +.Fn smr_wait +to wait for all CPUs to observe a new write sequence number. +This is an expensive operation and should only be used if polling cannot be +deferred in some way. +.\" ----- +.Ss Memory Ordering +The +.Fn smr_enter +function has acquire semantics via +.Fn membar_sync , +and the +.Fn smr_exit +function has release semantics via +.Fn atomic_store_release . +.Pp +The +.Fn smr_lazy_enter +function has relaxed store semantics only; it relies on periodic clock +interrupts to serialize with other CPUs. +The +.Fn smr_lazy_exit +function has release semantics via +.Fn membar_exit . +.Pp +The +.Fn smr_advance , +.Fn smr_poll , +.Fn smr_wait , +and +.Fn smr_synchronize +functions should not be assumed to have any guarantees with respect to memory +ordering beyond what is documented in the source. +.Fn smr_advance +issues a release barrier before advancing. +.Fn smr_poll +issues an acquire barrier before returning. +See +.Xr membar_ops 3 +for more details. +.\" ----- +.Sh FUNCTIONS +.Bl -tag -width compact +.It Fn smr_create "name" "limit" "flags" +Create and initialize a new SMR context. +.Fa name +is a human-readable identifier used in diagnostic messages. +.Fa limit +controls the deferred advance interval when +.Dv SMR_DEFERRED +is set (the sequence number is advanced every +.Fa limit +calls to +.Fn smr_advance +rather than every call); set to 0 for default or lazy behavior. +.Fa flags +is a bitwise OR of: +.Bl -tag -width SMR_DEFERRED +.It Dv SMR_LAZY +Enable lazy (tick-based) write sequence advancement. 
+The write sequence advances at the rate of the system clock +.Pq typically 100\(en1000 Hz +rather than on every call to +.Fn smr_advance . +This reduces write-side overhead at the cost of increased reclamation latency +(bounded by 2 clock ticks). +The read-side entry +.Pq Fn smr_lazy_enter +does not issue a full memory barrier, relying on clock interrupts to serialize +store buffers. +Suitable for read-mostly data structures where objects live for at least +milliseconds (e.g., protocol control blocks). +.It Dv SMR_DEFERRED +Enable deferred batching of write sequence advances. +The global write sequence is incremented only every +.Fa limit +calls to +.Fn smr_advance , +amortizing the atomic operation cost. +Intermediate calls return a predicted future sequence number. +Cannot be combined with +.Dv SMR_LAZY . +.El +.It Fn smr_destroy "smr" +Destroy the SMR context. +Calls +.Fn smr_synchronize +internally to ensure all readers have exited before freeing resources. +.It Fn smr_enter "smr" +Enter a read section for a non-lazy SMR context. +Disables kernel preemption and stores the current write sequence number +in per-CPU state with a full memory barrier. +Must not be called on a context created with +.Dv SMR_LAZY ; +use +.Fn smr_lazy_enter +instead. +Must not be nested. +See +.Sx Caller IPL Contract +above. +.It Fn smr_exit "smr" +Exit a non-lazy read section. +Clears the per-CPU sequence number with release semantics and re-enables +kernel preemption. +.It Fn smr_lazy_enter "smr" +Enter a read section for a lazy SMR context +.Pq created with Dv SMR_LAZY . +Disables kernel preemption and stores the current write sequence number +in per-CPU state with relaxed store semantics (no memory barrier). +Must not be called on a non-lazy context; use +.Fn smr_enter +instead. +Must not be nested. +See +.Sx Caller IPL Contract +above. +.It Fn smr_lazy_exit "smr" +Exit a lazy read section. 
+Issues an exit memory barrier, clears the per-CPU sequence number, and +re-enables kernel preemption. +.It Fn smr_advance "smr" +Advance the write sequence number and return the new goal sequence. +The returned value can be saved and later passed to +.Fn smr_poll +or +.Fn smr_wait . +Must not be called from within a read section. +Issues a release barrier before advancing to ensure prior stores are visible +to readers. +.Pp +For +.Dv SMR_LAZY +contexts, the sequence advances at the clock tick rate (bounded by 2 ticks +of grace). +For +.Dv SMR_DEFERRED +contexts, the global sequence may not be updated on every call. +.It Fn smr_poll "smr" "goal" "wait" +Check whether all active readers have observed the sequence number +.Fa goal . +Returns +.Dv true +if the goal has been met. +If +.Fa wait +is +.Dv true , +busy-loops until the goal is met. +If +.Fa wait +is +.Dv false , +returns immediately with the current status. +.Pp +For lazy and deferred contexts, if the goal is ahead of the current write +sequence, +.Fn smr_poll +will attempt to advance the write sequence when +.Fa wait +is +.Dv true . +.Pp +Must not be called with +.Fa wait +set to +.Dv true +from within a read section. +.It Fn smr_wait "smr" "goal" +Equivalent to calling +.Fn smr_poll +with +.Fa wait +set to +.Dv true . +.It Fn smr_synchronize "smr" +Advance the write sequence number and busy-wait until all active readers +have observed the new value. +Equivalent to calling +.Fn smr_wait +with the return value of +.Fn smr_advance . +This is expensive and should be avoided when polling can be deferred. +.It Fn pool_cache_set_smr "pc" "smr" +Associate the pool cache +.Fa pc +with the SMR context +.Fa smr . +Must be called after +.Fn pool_cache_init +but before any allocations from the cache. +Sets the +.Dv PR_SMR +and +.Dv PR_NOTOUCH +flags on the cache. +See +.Sx Pool Cache Integration +above for details on the batched free/reclaim mechanism. 
+.El +.\" ----- +.Sh EXAMPLES +Given a global hash table of connection records protected by SMR: +.Bd -literal +struct conn { + struct conn *c_next; + /* ... */ +}; + +static smr_t conn_smr; +static kmutex_t conn_lock; +static struct conn *conn_hash[HASH_SIZE]; +static pool_cache_t conn_cache; +.Ed +.Pp +Initialize the SMR context and pool cache: +.Bd -literal +conn_smr = smr_create("conn", 0, SMR_LAZY); +conn_cache = pool_cache_init(sizeof(struct conn), + coherency_unit, 0, 0, "connpl", NULL, IPL_NET, + NULL, NULL, NULL); +pool_cache_set_smr(conn_cache, conn_smr); +.Ed +.Pp +Look up a connection, as a reader (from softint context): +.Bd -literal +struct conn *c; +int error = ENOENT; + +smr_lazy_enter(conn_smr); +for (c = atomic_load_consume(&conn_hash[h]); + c != NULL; + c = c->c_next) { + if (c->c_key == key) { + /* Use the connection within the read section. */ + *resultp = c; + error = 0; + break; + } +} +smr_lazy_exit(conn_smr); +return error; +.Ed +.Pp +Look up a connection from user context (e.g., a system call): +.Bd -literal +int s; + +s = splsoftnet(); +smr_lazy_enter(conn_smr); +/* ... same lookup as above ... */ +smr_lazy_exit(conn_smr); +splx(s); +.Ed +.Pp +Remove a connection, as a writer: +.Bd -literal +struct conn **cp, *c; + +mutex_enter(&conn_lock); +for (cp = &conn_hash[h]; (c = *cp) != NULL; cp = &c->c_next) { + if (c->c_key == key) { + *cp = c->c_next; + break; + } +} +mutex_exit(&conn_lock); + +if (c != NULL) { + /* + * Return to the SMR-aware pool cache. The object will not + * be recycled until all readers that may have observed it + * have exited their read sections. + */ + pool_cache_put(conn_cache, c); +} +.Ed +.\" ----- +.Sh ALGORITHM +The SMR implementation is based on the Global Unbounded Sequences (GUS) +algorithm, inspired by epoch-based reclamation. +.Pp +A monotonically increasing write sequence number is maintained globally. +Readers record the most recent write sequence number they have observed in +per-CPU state. 
+A shared read sequence number records the lowest sequence number observed by +any active reader as of the last poll. +Any write older than this value has been observed by all readers and memory +tagged with that sequence number can be reclaimed. +.Pp +Idle readers store an invalid sentinel +.Pq Dv SMR_SEQ_INVALID +in their per-CPU state. +This allows +.Fn smr_poll +to distinguish active readers from idle CPUs. +.Pp +The write and read sequence numbers form a two-handed clock, with readers +always advancing towards writers. +When the system is idle the two hands meet and no deferred memory is +outstanding. +.Pp +A notable distinction between GUS and simpler epoch-based schemes is that +the delta between read and write sequence numbers is unbounded. +This allows finer-grained assignment of sequence numbers even when some +readers have long-lived read sections, and permits writers to advance the +sequence and defer polling to a later time when completion is more likely. +.\" ----- +.Sh NOTES +The algorithm is implemented in +.Pa sys/kern/subr_smr.c +with inline reader functions in +.Pa sys/sys/smr.h . +.Pp +The acronym SMR is used in the academic literature as a generic term for +a family of algorithms enabling memory-safe concurrent access (including +hazard pointers, epoch-based reclamation, and others). +In this context, SMR refers specifically to the GUS algorithm and its +implementation. +.\" ----- +.Sh CODE REFERENCES +The +.Nm +implementation is in +.Pa sys/kern/subr_smr.c . +The inline reader entry/exit functions and type definitions are in +.Pa sys/sys/smr.h +and +.Pa sys/sys/_smr.h . +The pool cache SMR integration is in +.Pa sys/kern/subr_pool.c . +.\" ----- +.Sh SEE ALSO +.Xr membar_ops 3 , +.Xr mutex 9 , +.Xr pool_cache 9 , +.Xr pserialize 9 , +.Xr rwlock 9 , +.Xr softint 9 , +.Xr spl 9 +.\" ----- +.Sh HISTORY +The SMR algorithm and implementation were originally written for +.Fx +by +.An Jeff Roberson Aq Mt jeff@FreeBSD.org . 
+The code was ported to +.Nx +with the addition of the +.Fn smr_lazy_enter +and +.Fn smr_lazy_exit +functions, the caller IPL contract, pool cache integration via +.Fn pool_cache_set_smr , +and the +.Dv SMR_DEFERRED +batching mode. +.Sh AUTHORS +The SMR algorithm and its implementation were provided by +.An Jeff Roberson Aq Mt jeff@FreeBSD.org . +The +.Fx +manual page was written by +.An Mark Johnston Aq Mt markj@FreeBSD.org . +The +.Nx +port and this manual page adaptation were done by +.An Kevin Bowling Aq Mt kevin.bowling@kev009.com . diff --git a/sys/kern/files.kern b/sys/kern/files.kern index 4c8967d61dcd..0b1259773c92 100644 --- a/sys/kern/files.kern +++ b/sys/kern/files.kern @@ -151,6 +151,7 @@ file kern/subr_prf.c kern file kern/subr_prof.c kern file kern/subr_pserialize.c kern file kern/subr_psref.c kern +file kern/subr_smr.c kern file kern/subr_specificdata.c kern file kern/subr_tftproot.c tftproot file kern/subr_time.c kern diff --git a/sys/kern/subr_pool.c b/sys/kern/subr_pool.c index 4cd08a249186..4d025632041a 100644 --- a/sys/kern/subr_pool.c +++ b/sys/kern/subr_pool.c @@ -58,6 +58,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.297 2026/01/04 03:20:29 riastradh Ex #include #include #include +#include #include #include #include @@ -386,6 +387,10 @@ static void pool_cache_invalidate_cpu(pool_cache_t, u_int); static void pool_cache_transfer(pool_cache_t); static int pool_pcg_get(pcg_t *volatile *, pcg_t **); static int pool_pcg_put(pcg_t *volatile *, pcg_t *); +static void pool_cache_smr_put(pool_cache_t, pcg_t *); +static void pool_cache_smr_put_pair(pool_cache_t, pcg_t *, pcg_t *); +static void pool_cache_smr_put_head(pool_cache_t, pcg_t *); +static pcg_t * pool_cache_smr_get(pool_cache_t); static pcg_t * pool_pcg_trunc(pcg_t *volatile *); static int pool_catchup(struct pool *); @@ -2156,6 +2161,10 @@ pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align, pc->pc_refcnt = 0; pc->pc_roflags = flags; pc->pc_freecheck = NULL; + pc->pc_smr = 
NULL; + pc->pc_smr_head = NULL; + pc->pc_smr_tail = NULL; + pc->pc_smr_seq_oldest = SMR_SEQ_INVALID; if ((flags & PR_LARGECACHE) != 0) { pc->pc_pcgsize = PCG_NOBJECTS_LARGE; @@ -2239,6 +2248,10 @@ pool_cache_bootstrap_destroy(pool_cache_t pc) for (i = 0; i < __arraycount(pc->pc_cpus); i++) pool_cache_invalidate_cpu(pc, i); + /* Destroy SMR state if applicable. */ + if (pc->pc_smr != NULL) + mutex_destroy(&pc->pc_smr_lock); + /* Finally, destroy it. */ pool_destroy(pp); } @@ -2276,6 +2289,9 @@ pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc) cc->cc_current = __UNCONST(&pcg_dummy); cc->cc_previous = __UNCONST(&pcg_dummy); + cc->cc_smr_free1 = __UNCONST(&pcg_dummy); + cc->cc_smr_free2 = __UNCONST(&pcg_dummy); + cc->cc_smr_alloc = __UNCONST(&pcg_dummy); cc->cc_pcgcache = pc->pc_pcgcache; cc->cc_hits = 0; cc->cc_misses = 0; @@ -2451,6 +2467,26 @@ pool_cache_invalidate(pool_cache_t pc) ((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull -= n; splx(s); + /* Drain SMR FIFO queue if this is an SMR-enabled pool cache. */ + if (pc->pc_smr != NULL) { + pcg_t *smr_pcg; + /* + * Wait for all SMR readers to complete before + * freeing objects from the FIFO. Without this, + * objects could be reused while readers on other + * CPUs still hold references. 
+ */ + smr_synchronize(pc->pc_smr); + n = 0; + while ((smr_pcg = pool_cache_smr_get(pc)) != NULL) { + n += pool_cache_invalidate_groups(pc, smr_pcg); + } + s = splvm(); + ((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull + -= n; + splx(s); + } + pcg = pool_pcg_trunc(&pc->pc_partgroups); n = pool_cache_invalidate_groups(pc, pcg); s = splvm(); @@ -2478,6 +2514,15 @@ pool_cache_invalidate_cpu(pool_cache_t pc, u_int index) if ((cc = pc->pc_cpus[index]) == NULL) return; + /* + * pool_cache_invalidate() has already xcalled every online CPU + * via pool_cache_transfer, which flushes cc_current/cc_previous + * (non-SMR) or cc_smr_free1/cc_smr_free2/cc_smr_alloc (SMR) and + * drains the SMR FIFO with smr_synchronize. Any non-dummy pcgs + * still here belong to a CPU that was offline at xcall time; + * objects were freed long ago, certainly past any SMR grace + * period, so a direct destruct is safe. + */ if ((pcg = cc->cc_current) != &pcg_dummy) { pcg->pcg_next = NULL; pool_cache_invalidate_groups(pc, pcg); @@ -2486,6 +2531,18 @@ pool_cache_invalidate_cpu(pool_cache_t pc, u_int index) pcg->pcg_next = NULL; pool_cache_invalidate_groups(pc, pcg); } + if ((pcg = cc->cc_smr_free1) != &pcg_dummy) { + pcg->pcg_next = NULL; + pool_cache_invalidate_groups(pc, pcg); + } + if ((pcg = cc->cc_smr_free2) != &pcg_dummy) { + pcg->pcg_next = NULL; + pool_cache_invalidate_groups(pc, pcg); + } + if ((pcg = cc->cc_smr_alloc) != &pcg_dummy) { + pcg->pcg_next = NULL; + pool_cache_invalidate_groups(pc, pcg); + } if (cc != &pc->pc_cpu0) pool_put(&cache_cpu_pool, cc); @@ -2512,6 +2569,119 @@ pool_cache_sethiwat(pool_cache_t pc, int n) pool_sethiwat(&pc->pc_pool, n); } +void +pool_cache_set_smr(pool_cache_t pc, void *smr) +{ + + KASSERT(pc->pc_smr == NULL); + /* Must be called before any allocations from this cache. 
*/ + KASSERT(pc->pc_pool.pr_nout == 0); + pc->pc_smr = smr; + pc->pc_roflags |= PR_SMR | PR_NOTOUCH; + mutex_init(&pc->pc_smr_lock, MUTEX_DEFAULT, IPL_VM); + pc->pc_smr_head = NULL; + pc->pc_smr_tail = NULL; + pc->pc_smr_seq_oldest = SMR_SEQ_INVALID; +} + +/* + * SMR FIFO helpers: groups are appended at the tail (newest) and + * consumed from the head (oldest, most likely to have expired). + * Protected by pc_smr_lock. + * + * pc_smr_seq_oldest caches the head pcg's SMR sequence (or + * SMR_SEQ_INVALID when the FIFO is empty) so the allocation + * fast-reject path can skip the lock via atomic_load_relaxed + + * smr_poll when nothing is ripe. + */ +static void +pool_cache_smr_put(pool_cache_t pc, pcg_t *pcg) +{ + + pcg->pcg_next = NULL; + mutex_spin_enter(&pc->pc_smr_lock); + if (pc->pc_smr_tail != NULL) { + pc->pc_smr_tail->pcg_next = pcg; + } else { + pc->pc_smr_head = pcg; + atomic_store_relaxed(&pc->pc_smr_seq_oldest, + pcg->pcg_smr_seq); + } + pc->pc_smr_tail = pcg; + mutex_spin_exit(&pc->pc_smr_lock); +} + +/* + * Enqueue two full pcgs at the FIFO tail under a single lock + * acquisition. Used by the put_slow flush-both path, where both + * pcgs share the same smr_advance() seq. + */ +static void +pool_cache_smr_put_pair(pool_cache_t pc, pcg_t *first, pcg_t *second) +{ + + first->pcg_next = second; + second->pcg_next = NULL; + mutex_spin_enter(&pc->pc_smr_lock); + if (pc->pc_smr_tail != NULL) { + pc->pc_smr_tail->pcg_next = first; + } else { + pc->pc_smr_head = first; + atomic_store_relaxed(&pc->pc_smr_seq_oldest, + first->pcg_smr_seq); + } + pc->pc_smr_tail = second; + mutex_spin_exit(&pc->pc_smr_lock); +} + +/* + * Re-enqueue a partially drained (or unexpired) pcg at the FIFO head. + * Used by get_slow when the popped pcg is not yet expired, or when + * objects remain after popping one. The pcg's seq is necessarily + * <= any queued pcg, so head placement preserves FIFO-by-seq order. 
+ */ +static void +pool_cache_smr_put_head(pool_cache_t pc, pcg_t *pcg) +{ + + mutex_spin_enter(&pc->pc_smr_lock); + pcg->pcg_next = pc->pc_smr_head; + pc->pc_smr_head = pcg; + if (pc->pc_smr_tail == NULL) + pc->pc_smr_tail = pcg; + atomic_store_relaxed(&pc->pc_smr_seq_oldest, pcg->pcg_smr_seq); + mutex_spin_exit(&pc->pc_smr_lock); +} + +static pcg_t * +pool_cache_smr_get(pool_cache_t pc) +{ + pcg_t *pcg; + + mutex_spin_enter(&pc->pc_smr_lock); + pcg = pc->pc_smr_head; + if (pcg != NULL) { + pc->pc_smr_head = pcg->pcg_next; + if (pc->pc_smr_head == NULL) { + pc->pc_smr_tail = NULL; + atomic_store_relaxed(&pc->pc_smr_seq_oldest, + SMR_SEQ_INVALID); + } else { + atomic_store_relaxed(&pc->pc_smr_seq_oldest, + pc->pc_smr_head->pcg_smr_seq); + } + } + mutex_spin_exit(&pc->pc_smr_lock); + return pcg; +} + +void * +pool_cache_get_smr(pool_cache_t pc) +{ + + return pc->pc_smr; +} + void pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap) { @@ -2570,6 +2740,16 @@ pool_pcg_get(pcg_t *volatile *head, pcg_t **pcgp) membar_datadep_consumer(); /* alpha */ n = atomic_load_relaxed(&o->pcg_next); atomic_store_release(head, n); +#ifdef __OCTEON__ + /* + * Octeon store buffers can linger for hundreds of + * thousands of cycles; a bare syncw drains the + * release store promptly. See mips/include/lock.h + * for details and the XXX about fixing + * atomic_store_release. + */ + __asm volatile("syncw" ::: "memory"); +#endif break; } } @@ -2641,26 +2821,99 @@ pool_cache_get_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, pcg_t *pcg, *cur; void *object; - KASSERT(cc->cc_current->pcg_avail == 0); - KASSERT(cc->cc_previous->pcg_avail == 0); + if (__predict_false(pc->pc_smr != NULL)) { + KASSERT(cc->cc_smr_alloc->pcg_avail == 0); + } else { + KASSERT(cc->cc_current->pcg_avail == 0); + KASSERT(cc->cc_previous->pcg_avail == 0); + } cc->cc_misses++; /* * If there's a full group, release our empty group back to the - * cache. 
Install the full group as cc_current and return. + * cache and install the full group as the fast-path source. + * + * For SMR pools, pull from the FIFO (oldest first) and install + * the ripe pcg as cc_smr_alloc. The entire bucket's worth of + * already-validated objects is then drained by the get fast + * path without re-entering the FIFO lock. + * + * For non-SMR pools, use the regular LIFO pc_fullgroups stack + * and install as cc_current. */ - cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg); - if (__predict_true(pcg != NULL)) { - KASSERT(pcg->pcg_avail == pcg->pcg_size); - if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) { - KASSERT(cur->pcg_avail == 0); - (void)pool_pcg_put(cc->cc_pcgcache, cur); + if (__predict_false(pc->pc_smr != NULL)) { + smr_seq_t seq; + + /* + * Lock-free fast-reject: consult the cached head seq + * before taking pc_smr_lock. A stale read is harmless; + * the per-pcg smr_poll below re-validates. + * + * When the FIFO is empty, call smr_advance() to keep + * the SMR_LAZY write clock (s_wr.seq) moving. Without + * this, batched put-side stamps set goals that never + * ripen on sparse workloads. No other code path ticks + * the lazy clock for this smr context and previously- + * stamped pcgs would accumulate forever in the FIFO. + * (When the FIFO is non-empty, smr_poll itself ticks + * the lazy clock via smr_lazy_advance.) + */ + seq = atomic_load_relaxed(&pc->pc_smr_seq_oldest); + if (seq == SMR_SEQ_INVALID) { + (void)smr_advance(pc->pc_smr); + goto fresh; + } + if (!smr_poll(pc->pc_smr, seq, false)) + goto fresh; + + pcg = pool_cache_smr_get(pc); + if (pcg != NULL) { + KASSERT(pcg->pcg_avail > 0); + if (!smr_poll(pc->pc_smr, pcg->pcg_smr_seq, + false)) { + /* + * Oldest group still has active readers + * (we lost a race against a recent put). + * Put it back at the head and allocate + * fresh. + */ + pool_cache_smr_put_head(pc, pcg); + goto fresh; + } + /* + * SMR grace period expired. 
Install the whole + * pcg as cc_smr_alloc; the retry in get_paddr + * will pop the first object via the fast path. + */ + cur = cc->cc_smr_alloc; + if (__predict_true(cur != &pcg_dummy)) { + KASSERT(cur->pcg_avail == 0); + (void)pool_pcg_put(cc->cc_pcgcache, cur); + } + cc->cc_smr_alloc = pcg; + cc->cc_nfull--; + return true; + } + } else { + cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg); + if (__predict_true(pcg != NULL)) { + KASSERT(pcg->pcg_avail == pcg->pcg_size); + goto install; } - cc->cc_nfull--; - cc->cc_current = pcg; - return true; } + goto fresh; + +install: + if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) { + KASSERT(cur->pcg_avail == 0); + (void)pool_pcg_put(cc->cc_pcgcache, cur); + } + cc->cc_nfull--; + cc->cc_current = pcg; + return true; + +fresh: /* * Nothing available locally or in cache. Take the slow @@ -2736,8 +2989,55 @@ pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap) /* Lock out interrupts and disable preemption. */ s = splvm(); while (/* CONSTCOND */ true) { - /* Try and allocate an object from the current group. */ cc = pc->pc_cpus[curcpu()->ci_index]; + if (__predict_false(cc == NULL)) { + /* + * CPU not yet registered with this pool cache. + */ + if (flags & PR_NOWAIT) { + splx(s); + return NULL; + } + splx(s); + pool_cache_cpu_init1(curcpu(), pc); + s = splvm(); + cc = pc->pc_cpus[curcpu()->ci_index]; + KASSERT(cc != NULL); + } + + /* + * For SMR pools the hot fast path pops from cc_smr_alloc, + * a per-CPU bucket whose objects have already passed a + * grace period (smr_poll verified on bucket install). + * cc_smr_free1 / cc_smr_free2 stage recently-freed + * objects that are NOT yet safe to hand out; the alloc + * side never touches them. This matches FreeBSD's + * UMA_ZONE_SMR uc_allocbucket / uc_freebucket split. 
+ */ + if (__predict_false(pc->pc_smr != NULL)) { + pcg = cc->cc_smr_alloc; + if (__predict_true(pcg->pcg_avail > 0)) { + object = pcg->pcg_objects[--pcg->pcg_avail] + .pcgo_va; + if (__predict_false(pap != NULL)) + *pap = pcg->pcg_objects[pcg->pcg_avail] + .pcgo_pa; +#if defined(DIAGNOSTIC) + pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL; + KASSERT(pcg->pcg_avail < pcg->pcg_size); + KASSERT(object != NULL); +#endif + cc->cc_hits++; + splx(s); + FREECHECK_OUT(&pc->pc_freecheck, object); + pool_redzone_fill(&pc->pc_pool, object); + pool_cache_get_kmsan(pc, object); + return object; + } + goto get_slow; + } + + /* Try and allocate an object from the current group. */ pcg = cc->cc_current; if (__predict_true(pcg->pcg_avail > 0)) { object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va; @@ -2766,6 +3066,7 @@ pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap) cc->cc_current = pcg; continue; } +get_slow: /* * Can't allocate from either group: try the slow path. @@ -2795,8 +3096,16 @@ pool_cache_put_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void *object) { pcg_t *pcg, *cur; - KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size); - KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size); + if (__predict_false(pc->pc_smr != NULL)) { + KASSERT(cc->cc_smr_free1->pcg_avail == + cc->cc_smr_free1->pcg_size); + KASSERT(cc->cc_smr_free2->pcg_avail == + cc->cc_smr_free2->pcg_size); + } else { + KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size); + KASSERT(cc->cc_previous->pcg_avail == + cc->cc_previous->pcg_size); + } cc->cc_misses++; @@ -2816,19 +3125,87 @@ pool_cache_put_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void *object) } /* - * If there's a empty group, release our full group back to the - * cache. Install the empty group to the local CPU and return. + * If there's an empty group, install it and retry. 
The filled + * cc_current is released as follows: + * + * non-SMR: pushed onto pc_fullgroups (LIFO) for any CPU to reuse. + * cc_previous is left in place; the get path drains it + * by swap-when-current-empty. + * SMR: both cc_smr_free1 and cc_smr_free2 are stamped with + * one smr_advance() and enqueued on the SMR FIFO. The + * alloc side reads from cc_smr_alloc only, so leaving + * cc_smr_free2 full would waste memory indefinitely; + * flushing both here also doubles amortization to + * ~2 * pc_pcgsize frees per smr_advance (~30 objects + * on _LP64). + * + * The SMR flush-both branch is reached only when both + * cc_smr_free1 and cc_smr_free2 are full pcgs (the fast path fell + * through) or cc_smr_free2 is pcg_dummy (handled below). + * cc_smr_free1 is never pcg_dummy with cc_smr_free2 non-dummy + * full: you cannot fill cc_smr_free2 without first having filled + * cc_smr_free1. */ if (pcg != NULL) { KASSERT(pcg->pcg_avail == 0); - if (__predict_false(cc->cc_previous == &pcg_dummy)) { + if (__predict_false(pc->pc_smr != NULL)) { + if (__predict_false(cc->cc_smr_free2 == &pcg_dummy)) { + /* + * Normally we install the new empty pcg as + * cc_smr_free2 and wait for it to fill before + * flushing. However, if cc_smr_free1 is full + * and the SMR FIFO is currently empty, publish + * cc_smr_free1 immediately so the alloc side + * has something to ripen. Deferring until + * flush-both would starve the alloc path under + * sparse workloads where per-CPU puts never + * reach the ~2 * pc_pcgsize threshold. + * + * The amortization factor drops from ~2x to 1x + * pc_pcgsize in this branch, but only when the + * FIFO has drained to empty. Once it's primed + * the normal flush-both path below resumes. 
+ */ + cur = cc->cc_smr_free1; + if (cur != &pcg_dummy && + cur->pcg_avail == cur->pcg_size && + atomic_load_relaxed( + &pc->pc_smr_seq_oldest) == + SMR_SEQ_INVALID) { + cur->pcg_smr_seq = + smr_advance(pc->pc_smr); + pool_cache_smr_put(pc, cur); + cc->cc_nfull++; + cc->cc_smr_free1 = pcg; + } else { + cc->cc_smr_free2 = pcg; + } + } else { + pcg_t *prev = cc->cc_smr_free2; + smr_seq_t seq; + + cur = cc->cc_smr_free1; + KASSERT(cur != &pcg_dummy); + KASSERT(cur->pcg_avail == cur->pcg_size); + KASSERT(prev->pcg_avail == prev->pcg_size); + + seq = smr_advance(pc->pc_smr); + cur->pcg_smr_seq = seq; + prev->pcg_smr_seq = seq; + pool_cache_smr_put_pair(pc, cur, prev); + cc->cc_nfull += 2; + cc->cc_smr_free1 = pcg; + cc->cc_smr_free2 = __UNCONST(&pcg_dummy); + } + } else if (__predict_false(cc->cc_previous == &pcg_dummy)) { cc->cc_previous = pcg; } else { cur = cc->cc_current; if (__predict_true(cur != &pcg_dummy)) { KASSERT(cur->pcg_avail == cur->pcg_size); cc->cc_contended += - pool_pcg_put(&pc->pc_fullgroups, cur); + pool_pcg_put(&pc->pc_fullgroups, + cur); cc->cc_nfull++; } cc->cc_current = pcg; @@ -2876,6 +3253,51 @@ pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa) /* Lock out interrupts and disable preemption. */ s = splvm(); + + if (__predict_false(pc->pc_smr != NULL)) { + /* + * SMR put fast path: stage in cc_smr_free1 / cc_smr_free2. + * The allocation side reads from cc_smr_alloc and does not + * see objects staged here until put_slow stamps them with + * smr_advance and queues them on the SMR FIFO. + */ + while (/* CONSTCOND */ true) { + cc = pc->pc_cpus[curcpu()->ci_index]; + if (__predict_false(cc == NULL)) { + /* + * CPU not yet registered with this pool + * cache (transient during CPU hot-plug). + * Initialize and retry. pool_cache_get_paddr + * handles this the same way. 
+ */ + splx(s); + pool_cache_cpu_init1(curcpu(), pc); + s = splvm(); + cc = pc->pc_cpus[curcpu()->ci_index]; + KASSERT(cc != NULL); + } + pcg = cc->cc_smr_free1; + if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { + pcg->pcg_objects[pcg->pcg_avail].pcgo_va = + object; + pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa; + pcg->pcg_avail++; + cc->cc_hits++; + splx(s); + return; + } + pcg = cc->cc_smr_free2; + if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { + cc->cc_smr_free2 = cc->cc_smr_free1; + cc->cc_smr_free1 = pcg; + continue; + } + if (!pool_cache_put_slow(pc, cc, s, object)) + break; + } + return; + } + while (/* CONSTCOND */ true) { /* If the current group isn't full, release it there. */ cc = pc->pc_cpus[curcpu()->ci_index]; @@ -2920,11 +3342,73 @@ static void pool_cache_transfer(pool_cache_t pc) { pool_cache_cpu_t *cc; - pcg_t *prev, *cur; + pcg_t *prev, *cur, *alloc; int s; s = splvm(); cc = pc->pc_cpus[curcpu()->ci_index]; + + if (__predict_false(pc->pc_smr != NULL)) { + smr_seq_t seq = SMR_SEQ_INVALID; + + cur = cc->cc_smr_free1; + cc->cc_smr_free1 = __UNCONST(&pcg_dummy); + prev = cc->cc_smr_free2; + cc->cc_smr_free2 = __UNCONST(&pcg_dummy); + alloc = cc->cc_smr_alloc; + cc->cc_smr_alloc = __UNCONST(&pcg_dummy); + + /* + * Batch a single smr_advance for whichever of the + * two free buckets carry staged objects. 
+ */ + if ((cur != &pcg_dummy && cur->pcg_avail > 0) || + (prev != &pcg_dummy && prev->pcg_avail > 0)) + seq = smr_advance(pc->pc_smr); + + if (cur != &pcg_dummy) { + if (cur->pcg_avail == 0) { + (void)pool_pcg_put(pc->pc_pcgcache, cur); + } else { + cur->pcg_smr_seq = seq; + pool_cache_smr_put(pc, cur); + cc->cc_nfull++; + } + } + if (prev != &pcg_dummy) { + if (prev->pcg_avail == 0) { + (void)pool_pcg_put(pc->pc_pcgcache, prev); + } else { + prev->pcg_smr_seq = seq; + pool_cache_smr_put(pc, prev); + cc->cc_nfull++; + } + } + if (alloc != &pcg_dummy) { + if (alloc->pcg_avail == 0) { + (void)pool_pcg_put(pc->pc_pcgcache, alloc); + } else { + /* + * Validated alloc bucket: the existing + * pcg_smr_seq already passed a grace + * period, so smr_poll will pass + * immediately on the next allocation. + * + * Pushing to FIFO tail may place this + * older seq behind the newer free1/free2 + * seqs we just stamped; pc_smr_seq_oldest + * tracks the head only and per-pcg + * smr_poll is authoritative, so FIFO-by- + * seq monotonicity is not load-bearing. + */ + pool_cache_smr_put(pc, alloc); + cc->cc_nfull++; + } + } + splx(s); + return; + } + cur = cc->cc_current; cc->cc_current = __UNCONST(&pcg_dummy); prev = cc->cc_previous; diff --git a/sys/kern/subr_smr.c b/sys/kern/subr_smr.c new file mode 100644 index 000000000000..e1da0b6796c1 --- /dev/null +++ b/sys/kern/subr_smr.c @@ -0,0 +1,625 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019,2020 Jeffrey Roberson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Ported to NetBSD from FreeBSD's kern/subr_smr.c. + */ + +/* + * Global Unbounded Sequences (GUS) + * + * This is a novel safe memory reclamation technique inspired by + * epoch based reclamation from Samy Al Bahra's concurrency kit which + * in turn was based on work described in: + * Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University + * of Cambridge Computing Laboratory. + * And shares some similarities with: + * Wang, Stamler, Parmer. 2016 Parallel Sections: Scaling System-Level + * Data-Structures + * + * This is not an implementation of hazard pointers or related + * techniques. The term safe memory reclamation is used as a + * generic descriptor for algorithms that defer frees to avoid + * use-after-free errors with lockless datastructures or as + * a mechanism to detect quiescence for writer synchronization. + * + * The basic approach is to maintain a monotonic write sequence + * number that is updated on some application defined granularity. 
+ * Readers record the most recent write sequence number they have + * observed. A shared read sequence number records the lowest + * sequence number observed by any reader as of the last poll. Any + * write older than this value has been observed by all readers + * and memory can be reclaimed. Like Epoch we also detect idle + * readers by storing an invalid sequence number in the per-cpu + * state when the read section exits. Like Parsec we establish + * a global write clock that is used to mark memory on free. + * + * The write and read sequence numbers can be thought of as a two + * handed clock with readers always advancing towards writers. GUS + * maintains the invariant that all readers can safely access memory + * that was visible at the time they loaded their copy of the sequence + * number. Periodically the read sequence or hand is polled and + * advanced as far towards the write sequence as active readers allow. + * Memory which was freed between the old and new global read sequence + * number can now be reclaimed. When the system is idle the two hands + * meet and no deferred memory is outstanding. Readers never advance + * any sequence number, they only observe them. The shared read + * sequence number is consequently never higher than the write sequence. + * A stored sequence number that falls outside of this range has expired + * and needs no scan to reclaim. + * + * A notable distinction between GUS and Epoch, qsbr, rcu, etc. is + * that advancing the sequence number is decoupled from detecting its + * observation. That is to say, the delta between read and write + * sequence numbers is not bound. This can be thought of as a more + * generalized form of epoch which requires them at most one step + * apart. This results in a more granular assignment of sequence + * numbers even as read latencies prohibit all or some expiration. 
+ * It also allows writers to advance the sequence number and save the + * poll for expiration until a later time when it is likely to + * complete without waiting. The batch granularity and free-to-use + * latency is dynamic and can be significantly smaller than in more + * strict systems. + * + * See FreeBSD's kern/subr_smr.c for the full UMA integration + * description, which is not applicable to the NetBSD port. + * + * If the read overhead of accessing the shared cacheline becomes + * especially burdensome an invariant TSC could be used in place of the + * sequence. The algorithm would then only need to maintain the minimum + * observed tsc. This would trade potential cache synchronization + * overhead for local serialization and cpu timestamp overhead. + */ + +/* + * A simplified diagram: + * + * 0 UINT_MAX + * | -------------------- sequence number space -------------------- | + * ^ rd seq ^ wr seq + * | ----- valid sequence numbers ---- | + * ^cpuA ^cpuC + * | -- free -- | --------- deferred frees -------- | ---- free ---- | + * + * + * In this example cpuA has the lowest sequence number and poll can + * advance rd seq. cpuB is not running and is considered to observe + * wr seq. + * + * Freed memory that is tagged with a sequence number between rd seq and + * wr seq can not be safely reclaimed because cpuA may hold a reference to + * it. Any other memory is guaranteed to be unreferenced. + * + * Any writer is free to advance wr seq at any time however it may busy + * poll in pathological cases. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#include +#include +#include +#include +#include +#include +#include /* SPINLOCK_BACKOFF_HOOK */ +#include +#include + +#ifndef DIAGNOSTIC +#define SMR_SEQ_INIT 1 /* All valid sequence numbers are odd. */ +#define SMR_SEQ_INCR 2 + +/* + * SMR_SEQ_MAX_DELTA is the maximum distance allowed between rd_seq and + * wr_seq. 
For the modular arithmetic to work a value of UINT_MAX / 2
+ * would be possible but it is checked after we increment the wr_seq so
+ * a safety margin is left to prevent overflow.
+ *
+ * We will block until SMR_SEQ_MAX_ADVANCE sequence numbers have progressed
+ * to prevent integer wrapping.  See smr_advance() for more details.
+ */
+#define	SMR_SEQ_MAX_DELTA	(UINT_MAX / 4)
+#define	SMR_SEQ_MAX_ADVANCE	(SMR_SEQ_MAX_DELTA - 1024)
+#else
+/* We want to test the wrapping feature in DIAGNOSTIC kernels. */
+#define	SMR_SEQ_INCR	(UINT_MAX / 10000)
+#define	SMR_SEQ_INIT	(UINT_MAX - 100000)
+/* Force extra polls to test the integer overflow detection. */
+#define	SMR_SEQ_MAX_DELTA	(SMR_SEQ_INCR * 32)
+#define	SMR_SEQ_MAX_ADVANCE	SMR_SEQ_MAX_DELTA / 2
+#endif
+
+/*
+ * The grace period for lazy (tick based) SMR.
+ *
+ * Hardclock is responsible for advancing ticks on a single CPU while every
+ * CPU receives a regular clock interrupt.  The clock interrupts are flushing
+ * the store buffers and any speculative loads that may violate our invariants.
+ * Because these interrupts are not synchronized we must wait one additional
+ * tick in the future to be certain that all processors have had their state
+ * synchronized by an interrupt.
+ *
+ * This assumes that the clock interrupt will only be delayed by other causes
+ * that will flush the store buffer or prevent access to the section protected
+ * data.  For example, an idle processor, or a system management interrupt,
+ * or a vm exit.
+ */
+#define	SMR_LAZY_GRACE		2
+#define	SMR_LAZY_INCR		(SMR_LAZY_GRACE * SMR_SEQ_INCR)
+
+/*
+ * The maximum sequence number ahead of wr_seq that may still be valid.  The
+ * sequence may not be advanced on write for lazy or deferred SMRs.  In this
+ * case poll needs to attempt to forward the sequence number if the goal is
+ * within wr_seq + SMR_SEQ_ADVANCE.
+ */
+#define	SMR_SEQ_ADVANCE		SMR_LAZY_INCR
+
+/* Statistics (debug).
*/ +static struct evcnt smr_ev_advance = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "smr", "advance"); +static struct evcnt smr_ev_advance_wait = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "smr", "advance_wait"); +static struct evcnt smr_ev_poll = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "smr", "poll"); +static struct evcnt smr_ev_poll_scan = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "smr", "poll_scan"); +static struct evcnt smr_ev_poll_fail = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "smr", "poll_fail"); + +EVCNT_ATTACH_STATIC(smr_ev_advance); +EVCNT_ATTACH_STATIC(smr_ev_advance_wait); +EVCNT_ATTACH_STATIC(smr_ev_poll); +EVCNT_ATTACH_STATIC(smr_ev_poll_scan); +EVCNT_ATTACH_STATIC(smr_ev_poll_fail); + +/* + * Advance a lazy write sequence number. These move forward at the rate of + * ticks. Grace is SMR_LAZY_INCR (2 ticks) in the future. + * + * This returns the goal write sequence number. + */ +static smr_seq_t +smr_lazy_advance(smr_t smr, smr_shared_t s) +{ + union s_wr s_wr, old; + int t, d; + + KASSERT(kpreempt_disabled()); + + /* + * Load the stored ticks value before the current one. This way the + * current value can only be the same or larger. + */ + old._pair = s_wr._pair = atomic_load_relaxed(&s->s_wr._pair); + t = getticks(); + + /* + * The most probable condition that the update already took place. + */ + d = t - s_wr.ticks; + if (__predict_true(d == 0)) + goto out; + /* Cap the rate of advancement and handle long idle periods. */ + if (d > SMR_LAZY_GRACE || d < 0) + d = SMR_LAZY_GRACE; + s_wr.ticks = t; + s_wr.seq += d * SMR_SEQ_INCR; + + /* + * This can only fail if another thread races to call advance(). + * Strong cmpset semantics mean we are guaranteed that the update + * happened. + */ + (void)atomic_cas_64(&s->s_wr._pair, old._pair, s_wr._pair); +out: + return (s_wr.seq + SMR_LAZY_INCR); +} + +/* + * Increment the shared write sequence by 2. 
Since it is initialized
+ * to 1 this means the only valid values are odd and an observed value
+ * of 0 in a particular CPU means it is not currently in a read section.
+ */
+static smr_seq_t
+smr_shared_advance(smr_shared_t s)
+{
+
+	return (atomic_add_32_nv(&s->s_wr.seq, SMR_SEQ_INCR));
+}
+
+/*
+ * Advance the write sequence number for a normal smr section.  If the
+ * write sequence is too far behind the read sequence we have to poll
+ * to advance rd_seq and prevent undetectable wraps.
+ */
+static smr_seq_t
+smr_default_advance(smr_t smr, smr_shared_t s)
+{
+	smr_seq_t goal, s_rd_seq;
+
+	KASSERT(kpreempt_disabled());
+
+	/*
+	 * Load the current read seq before incrementing the goal so
+	 * we are guaranteed it is always < goal.
+	 */
+	s_rd_seq = atomic_load_acquire(&s->s_rd_seq);
+	goal = smr_shared_advance(s);
+
+	/*
+	 * Force a synchronization here if the goal is getting too
+	 * far ahead of the read sequence number.  This keeps the
+	 * wrap detecting arithmetic working in pathological cases.
+	 */
+	if (SMR_SEQ_DELTA(goal, s_rd_seq) >= SMR_SEQ_MAX_DELTA) {
+		smr_ev_advance_wait.ev_count++;
+		smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);
+	}
+	smr_ev_advance.ev_count++;
+
+	return (goal);
+}
+
+/*
+ * Deferred SMRs conditionally update s_wr_seq based on a
+ * cpu-local interval count.
+ */
+static smr_seq_t
+smr_deferred_advance(smr_t smr, smr_shared_t s, smr_t self)
+{
+
+	if (++self->c_deferred < self->c_limit)
+		return (smr_shared_current(s) + SMR_SEQ_INCR);
+	self->c_deferred = 0;
+	return (smr_default_advance(smr, s));
+}
+
+/*
+ * Advance the write sequence and return the value for use as the
+ * wait goal.  This guarantees that any changes made by the calling
+ * thread prior to this call will be visible to all threads after
+ * rd_seq meets or exceeds the return value.
+ *
+ * This function may busy loop if the readers are roughly 1 billion
+ * sequence numbers behind the writers.
+ *
+ * Lazy SMRs will not busy loop and the wrap happens every 25 days
+ * at 1khz and 60 hours at 10khz.  Readers can block for no longer
+ * than half of this for SMR_SEQ_ macros to continue working.
+ */
+smr_seq_t
+smr_advance(smr_t smr)
+{
+	smr_t self;
+	smr_shared_t s;
+	smr_seq_t goal;
+	int flags;
+
+	/*
+	 * It is illegal to enter while in an smr section.
+	 */
+	SMR_ASSERT_NOT_ENTERED(smr);
+
+	/*
+	 * Modifications not done in a smr section need to be visible
+	 * before advancing the seq.
+	 */
+	membar_exit();
+
+	kpreempt_disable();
+	/* Try to touch the line once. */
+	self = smr_cpu_self(smr);
+	s = self->c_shared;
+	flags = self->c_flags;
+	if ((flags & (SMR_LAZY | SMR_DEFERRED)) == 0)
+		goal = smr_default_advance(smr, s);
+	else if ((flags & SMR_LAZY) != 0)
+		goal = smr_lazy_advance(smr, s);
+	else
+		goal = smr_deferred_advance(smr, s, self);
+	kpreempt_enable();
+
+	return (goal);
+}
+
+/*
+ * Poll to determine the currently observed sequence number on a cpu
+ * and spinwait if the 'wait' argument is true.
+ */
+static smr_seq_t
+smr_poll_cpu(struct smr *c, smr_seq_t s_rd_seq, smr_seq_t goal, bool wait)
+{
+	smr_seq_t c_seq;
+
+	c_seq = SMR_SEQ_INVALID;
+	for (;;) {
+		c_seq = atomic_load_relaxed(&c->c_seq);
+		if (c_seq == SMR_SEQ_INVALID)
+			break;
+
+		/*
+		 * There is a race described in smr.h:smr_enter that
+		 * can lead to a stale seq value but not stale data
+		 * access.  If we find a value out of range here we
+		 * pin it to the current min to prevent it from
+		 * advancing until that stale section has expired.
+		 *
+		 * The race is created when a cpu loads the s_wr_seq
+		 * value in a local register and then another thread
+		 * advances s_wr_seq and calls smr_poll() which will
+		 * observe no value yet in c_seq and advance s_rd_seq
+		 * up to s_wr_seq which is beyond the register
+		 * cached value.  This is only likely to happen on
+		 * hypervisor or with a system management interrupt.
+		 */
+		if (SMR_SEQ_LT(c_seq, s_rd_seq))
+			c_seq = s_rd_seq;
+
+		/*
+		 * If the sequence number meets the goal we are done
+		 * with this cpu.
+		 */
+		if (SMR_SEQ_LEQ(goal, c_seq))
+			break;
+
+		if (!wait)
+			break;
+		SPINLOCK_BACKOFF_HOOK;
+	}
+
+	return (c_seq);
+}
+
+/*
+ * Loop until all cores have observed the goal sequence or have
+ * gone inactive.  Returns the oldest sequence currently active;
+ *
+ * This function assumes a snapshot of sequence values has
+ * been obtained and validated by smr_poll().
+ */
+static smr_seq_t
+smr_poll_scan(smr_t smr, smr_shared_t s, smr_seq_t s_rd_seq,
+    smr_seq_t s_wr_seq, smr_seq_t goal, bool wait)
+{
+	smr_seq_t rd_seq, c_seq;
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+
+	KASSERT(kpreempt_disabled());
+	smr_ev_poll_scan.ev_count++;
+
+	/*
+	 * The read sequence can be no larger than the write sequence at
+	 * the start of the poll.
+	 */
+	rd_seq = s_wr_seq;
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		/*
+		 * Query the active sequence on this cpu.  If we're not
+		 * waiting and we don't meet the goal we will still scan
+		 * the rest of the cpus to update s_rd_seq before returning
+		 * failure.
+		 */
+		c_seq = smr_poll_cpu(smr_cpu_get(smr, cpu_index(ci)),
+		    s_rd_seq, goal, wait);
+
+		/*
+		 * Limit the minimum observed rd_seq whether we met the goal
+		 * or not.
+		 */
+		if (c_seq != SMR_SEQ_INVALID)
+			rd_seq = SMR_SEQ_MIN(rd_seq, c_seq);
+	}
+
+	/*
+	 * Advance the rd_seq as long as we observed a more recent value.
+	 */
+	s_rd_seq = atomic_load_relaxed(&s->s_rd_seq);
+	if (SMR_SEQ_GT(rd_seq, s_rd_seq)) {
+		(void)atomic_cas_32(&s->s_rd_seq, s_rd_seq, rd_seq);
+		s_rd_seq = rd_seq;
+	}
+
+	return (s_rd_seq);
+}
+
+/*
+ * Poll to determine whether all readers have observed the 'goal' write
+ * sequence number.
+ *
+ * If wait is true this will spin until the goal is met.
+ *
+ * This routine will update the minimum observed read sequence number in
+ * s_rd_seq if it does a scan.
It may not do a scan if another call has + * advanced s_rd_seq beyond the callers goal already. + * + * Returns true if the goal is met and false if not. + */ +bool +smr_poll(smr_t smr, smr_seq_t goal, bool wait) +{ + smr_shared_t s; + smr_t self; + smr_seq_t s_wr_seq, s_rd_seq; + smr_delta_t delta; + int flags; + bool success; + + /* + * It is illegal to enter while in an smr section. + */ + KASSERTMSG(!wait || !SMR_ENTERED(smr), + "smr_poll: Blocking not allowed in a SMR section."); + + /* + * Use a critical section so that we can avoid ABA races + * caused by long preemption sleeps. + */ + success = true; + kpreempt_disable(); + /* Attempt to load from self only once. */ + self = smr_cpu_self(smr); + s = self->c_shared; + flags = self->c_flags; + smr_ev_poll.ev_count++; + + /* + * Conditionally advance the lazy write clock on any writer + * activity. + */ + if ((flags & SMR_LAZY) != 0) + smr_lazy_advance(smr, s); + + /* + * Acquire barrier loads s_wr_seq after s_rd_seq so that we can not + * observe an updated read sequence that is larger than write. + */ + s_rd_seq = atomic_load_acquire(&s->s_rd_seq); + + /* + * If we have already observed the sequence number we can immediately + * return success. Most polls should meet this criterion. + */ + if (SMR_SEQ_LEQ(goal, s_rd_seq)) + goto out; + + /* + * wr_seq must be loaded prior to any c_seq value so that a + * stale c_seq can only reference time after this wr_seq. + */ + s_wr_seq = atomic_load_acquire(&s->s_wr.seq); + + /* + * This is the distance from s_wr_seq to goal. Positive values + * are in the future. + */ + delta = SMR_SEQ_DELTA(goal, s_wr_seq); + + /* + * Detect a stale wr_seq. + * + * This goal may have come from a deferred advance or a lazy + * smr. If we are not blocking we can not succeed but the + * sequence number is valid. 
+ */ + if (delta > 0 && delta <= SMR_SEQ_ADVANCE && + (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) { + if (!wait) { + success = false; + goto out; + } + /* Advance wr_seq until it reaches the goal. Lazy SMR + * goals can be SMR_LAZY_INCR ahead, requiring multiple + * advances of SMR_SEQ_INCR each. */ + while (SMR_SEQ_GT(goal, s_wr_seq)) + s_wr_seq = smr_shared_advance(s); + delta = 0; + } + + /* + * Detect an invalid goal. + * + * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for + * it to be valid. If it is not then the caller held on to it and + * the integer wrapped. If we wrapped back within range the caller + * will harmlessly scan. + */ + if (delta > 0) + goto out; + + /* Determine the lowest visible sequence number. */ + s_rd_seq = smr_poll_scan(smr, s, s_rd_seq, s_wr_seq, goal, wait); + success = SMR_SEQ_LEQ(goal, s_rd_seq); +out: + if (!success) + smr_ev_poll_fail.ev_count++; + kpreempt_enable(); + + /* + * Serialize with smr_advance()/smr_exit(). The caller is now free + * to modify memory as expected. + */ + membar_enter(); + + KASSERTMSG(success || !wait, "smr_poll: blocking poll failed"); + return (success); +} + +smr_t +smr_create(const char *name, int limit, int flags) +{ + smr_t smr; + smr_shared_t s; + struct smr *c; + u_int i; + size_t shared_size = roundup(sizeof(*s), COHERENCY_UNIT); + size_t smr_size = MAXCPUS * SMR_CPU_STRIDE; + + s = kmem_zalloc(shared_size, KM_SLEEP); + smr = kmem_zalloc(smr_size, KM_SLEEP); + + s->s_name = name; + s->s_rd_seq = s->s_wr.seq = SMR_SEQ_INIT; + s->s_wr.ticks = getticks(); + + /* Initialize all CPUS, not just those running. 
*/ + for (i = 0; i < MAXCPUS; i++) { + c = smr_cpu_get(smr, i); + c->c_seq = SMR_SEQ_INVALID; + c->c_shared = s; + c->c_deferred = 0; + c->c_limit = limit; + c->c_flags = flags; + } + membar_sync(); + + return (smr); +} + +void +smr_destroy(smr_t smr) +{ + smr_shared_t s; + size_t shared_size = roundup(sizeof(struct smr_shared), COHERENCY_UNIT); + size_t smr_size = MAXCPUS * SMR_CPU_STRIDE; + + smr_synchronize(smr); + s = smr_cpu_get(smr, 0)->c_shared; + kmem_free(s, shared_size); + kmem_free(smr, smr_size); +} + +/* + * Initialize the SMR subsystem. + */ +void +smr_init(void) +{ + + /* Nothing to do -- kmem is used directly. */ +} diff --git a/sys/sys/_smr.h b/sys/sys/_smr.h new file mode 100644 index 000000000000..d2f8367c9e73 --- /dev/null +++ b/sys/sys/_smr.h @@ -0,0 +1,41 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019, 2020 Jeffrey Roberson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Ported to NetBSD from FreeBSD's sys/sys/_smr.h. + */ + +#ifndef _SYS__SMR_H_ +#define _SYS__SMR_H_ + +#include + +typedef uint32_t smr_seq_t; +typedef int32_t smr_delta_t; +typedef struct smr *smr_t; + +#define SMR_SEQ_INVALID 0 + +#endif /* _SYS__SMR_H_ */ diff --git a/sys/sys/pool.h b/sys/sys/pool.h index 2342ca478214..3d20b9e2ba27 100644 --- a/sys/sys/pool.h +++ b/sys/sys/pool.h @@ -36,6 +36,7 @@ #include #include +#include struct pool_sysctl { char pr_wchan[16]; @@ -163,6 +164,7 @@ struct pool { #define PR_ZERO 0x8000 /* zero data before returning */ #define PR_USEBMAP 0x10000 /* use a bitmap to manage freed items */ #define PR_PSERIALIZE 0x20000 /* needs pserialize sync point before free */ +#define PR_SMR 0x40000 /* SMR safe memory reclamation */ /* * `pr_lock' protects the pool's data structures when removing @@ -237,13 +239,33 @@ typedef struct pool_cache_group { struct pool_cache_group *pcg_next; /* link to next group */ u_int pcg_avail; /* # available objects */ u_int pcg_size; /* max number objects */ + smr_seq_t pcg_smr_seq; /* SMR sequence at free time */ pcgpair_t pcg_objects[1]; /* the objects */ } pcg_t; -/* Pool cache CPU. Sized to 64 bytes on _LP64. */ +/* + * Pool cache CPU. + * + * For non-SMR pool_caches only cc_current / cc_previous are live; + * cc_smr_free1 / cc_smr_free2 / cc_smr_alloc are held at pcg_dummy + * and cost only the three pointers of space. 
+ * + * For SMR pool_caches the picture is inverted: + * cc_smr_free1 - primary write-staging bucket (freed, unstamped objects) + * cc_smr_free2 - secondary write-staging bucket (flush-both partner) + * cc_smr_alloc - validated per-CPU allocation bucket (smr_poll expired) + * and cc_current / cc_previous stay at pcg_dummy. The split mirrors + * FreeBSD's UMA_ZONE_SMR (uc_freebucket / uc_allocbucket). The alloc + * bucket lets the hot get path satisfy allocations without taking + * pc_smr_lock while the free buckets amortize smr_advance across + * ~2 * pc_pcgsize frees. + */ typedef struct pool_cache_cpu { - struct pool_cache_group *cc_current; - struct pool_cache_group *cc_previous; + struct pool_cache_group *cc_current; /* non-SMR: LIFO top */ + struct pool_cache_group *cc_previous; /* non-SMR: LIFO bottom */ + struct pool_cache_group *cc_smr_free1; /* SMR: primary write stage */ + struct pool_cache_group *cc_smr_free2; /* SMR: flush-both partner */ + struct pool_cache_group *cc_smr_alloc; /* SMR: validated read bucket */ pcg_t *volatile *cc_pcgcache; uint64_t cc_misses; uint64_t cc_hits; @@ -269,6 +291,7 @@ struct pool_cache { void *pc_arg; /* for ctor/dtor */ unsigned int pc_refcnt; /* ref count for pagedaemon, etc */ unsigned int pc_roflags; /* r/o cache flags */ + void *pc_smr; /* SMR context (smr_t), or NULL */ void *pc_cpus[MAXCPUS]; /* Diagnostic aides. */ @@ -281,6 +304,20 @@ struct pool_cache { __aligned(CACHE_LINE_SIZE); pcg_t *volatile pc_partgroups; /* groups for reclamation */ + /* SMR FIFO queue for deferred-reuse groups (PR_SMR only). */ + kmutex_t pc_smr_lock; /* protects FIFO head/tail */ + pcg_t *pc_smr_head; /* oldest (most likely expired) */ + pcg_t *pc_smr_tail; /* newest (most recently freed) */ + + /* + * Cached seq of FIFO head (SMR_SEQ_INVALID when empty); + * atomic_load_relaxed readable for lock-free alloc fast-reject, + * written under pc_smr_lock. 
Isolated on its own cacheline to + * avoid false sharing with the FIFO writers on pc_smr_lock / + * head / tail above. + */ + smr_seq_t pc_smr_seq_oldest __aligned(CACHE_LINE_SIZE); + /* Boot cpu. */ pool_cache_cpu_t pc_cpu0 __aligned(CACHE_LINE_SIZE); }; @@ -357,6 +394,8 @@ void pool_cache_sethiwat(pool_cache_t, int); void pool_cache_sethardlimit(pool_cache_t, int, const char *, int); void pool_cache_prime(pool_cache_t, int); void pool_cache_cpu_init(struct cpu_info *); +void pool_cache_set_smr(pool_cache_t, void *); +void *pool_cache_get_smr(pool_cache_t); unsigned int pool_cache_nget(pool_cache_t); unsigned int pool_cache_nput(pool_cache_t); diff --git a/sys/sys/smr.h b/sys/sys/smr.h new file mode 100644 index 000000000000..f5857fdf1027 --- /dev/null +++ b/sys/sys/smr.h @@ -0,0 +1,359 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019, 2020 Jeffrey Roberson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Ported to NetBSD from FreeBSD's sys/sys/smr.h. + */ + +#ifndef _SYS_SMR_H_ +#define _SYS_SMR_H_ + +#include +#include +#include +#include /* kpreempt_disable/enable */ + +/* + * Safe memory reclamation. See subr_smr.c for a description of the + * algorithm, and smr_types.h for macros to define and access SMR-protected + * data structures. + * + * Readers synchronize with smr_enter()/exit() and writers may either + * free directly to a SMR-aware allocator or use smr_synchronize or wait. + */ + +/* + * Modular arithmetic for comparing sequence numbers that have + * potentially wrapped. Copied from tcp_seq.h. + */ +#define SMR_SEQ_LT(a, b) ((smr_delta_t)((a)-(b)) < 0) +#define SMR_SEQ_LEQ(a, b) ((smr_delta_t)((a)-(b)) <= 0) +#define SMR_SEQ_GT(a, b) ((smr_delta_t)((a)-(b)) > 0) +#define SMR_SEQ_GEQ(a, b) ((smr_delta_t)((a)-(b)) >= 0) +#define SMR_SEQ_DELTA(a, b) ((smr_delta_t)((a)-(b))) +#define SMR_SEQ_MIN(a, b) (SMR_SEQ_LT((a), (b)) ? (a) : (b)) +#define SMR_SEQ_MAX(a, b) (SMR_SEQ_GT((a), (b)) ? (a) : (b)) + +/* Shared SMR state. */ +union s_wr { + struct { + smr_seq_t seq; /* Current write sequence #. */ + int ticks; /* tick of last update (LAZY) */ + }; + uint64_t _pair; +}; +struct smr_shared { + const char *s_name; /* Name for debugging/reporting. */ + union s_wr s_wr; /* Write sequence */ + smr_seq_t s_rd_seq; /* Minimum observed read sequence. */ +}; +typedef struct smr_shared *smr_shared_t; + +/* Per-cpu SMR state. 
*/ +struct smr { + smr_seq_t c_seq; /* Current observed sequence. */ + smr_shared_t c_shared; /* Shared SMR state. */ + int c_deferred; /* Deferred advance counter. */ + int c_limit; /* Deferred advance limit. */ + int c_flags; /* SMR Configuration */ +}; + +#define SMR_LAZY 0x0001 /* Higher latency write, fast read. */ +#define SMR_DEFERRED 0x0002 /* Aggregate updates to wr_seq. */ + +/* + * Per-CPU access. SMR per-CPU data is allocated as a contiguous + * array indexed by cpu_index(), with each entry padded to a cache + * line to avoid false sharing. + */ +#define SMR_CPU_STRIDE roundup(sizeof(struct smr), COHERENCY_UNIT) + +static __inline struct smr * +smr_cpu_get(smr_t smr, u_int cpuid) +{ + return (struct smr *)((char *)smr + cpuid * SMR_CPU_STRIDE); +} + +static __inline struct smr * +smr_cpu_self(smr_t smr) +{ + return smr_cpu_get(smr, cpu_index(curcpu())); +} + +/* + * SMR_ENTERED: true if the current CPU is in an SMR read section. + */ +#define SMR_ENTERED(smr) \ + (kpreempt_disabled() && \ + smr_cpu_self(smr)->c_seq != SMR_SEQ_INVALID) + +#define SMR_ASSERT_ENTERED(smr) \ + KASSERT(SMR_ENTERED(smr)) + +#define SMR_ASSERT_NOT_ENTERED(smr) \ + KASSERT(!SMR_ENTERED(smr)) + +/* + * Return the current write sequence number. This is not the same as the + * current goal which may be in the future. + */ +static __inline smr_seq_t +smr_shared_current(smr_shared_t s) +{ + + return (atomic_load_relaxed(&s->s_wr.seq)); +} + +static __inline smr_seq_t +smr_current(smr_t smr) +{ + + return (smr_shared_current(smr_cpu_self(smr)->c_shared)); +} + +/* + * Enter a read section. + * + * IMPORTANT CALLER CONTRACT (same as smr_lazy_enter, see comment above + * smr_lazy_enter below): smr_enter uses only kpreempt_disable, it does + * NOT raise IPL. The recursion KASSERT below fires if the same smr + * context is re-entered on the same CPU, e.g. a softint dispatched + * while a user thread was in an smr_enter section. 
+ * + * User-context callers MUST wrap smr_enter / smr_exit pairs with + * splsoftnet() / splx() (or a higher spl matching the softints that + * use this smr context) to prevent softint dispatch on the same CPU. + * Softint-context callers are safe by virtue of their own IPL. + */ +static __inline void +smr_enter(smr_t smr) +{ + struct smr *self; + + kpreempt_disable(); + self = smr_cpu_self(smr); + KASSERTMSG((self->c_flags & SMR_LAZY) == 0, + "smr_enter(%s) lazy smr.", self->c_shared->s_name); + KASSERTMSG(cpu_softintr_p() || cpu_intr_p() || + curcpu()->ci_cpl != IPL_NONE, + "smr_enter(%s) from user context at IPL_NONE", + self->c_shared->s_name); + KASSERTMSG(self->c_seq == SMR_SEQ_INVALID, + "smr_enter(%s) does not support recursion.", + self->c_shared->s_name); + + /* + * Store the current observed write sequence number in our + * per-cpu state so that it can be queried via smr_poll(). + * Frees that are newer than this stored value will be + * deferred until we call smr_exit(). + * + * Subsequent loads must not be re-ordered with the store. + * A full fence (seq_cst) is required to ensure the c_seq store + * is globally visible before subsequent loads (so smr_poll on + * another CPU sees us as an active reader). + * + * It is possible that a long delay between loading the wr_seq + * and storing the c_seq could create a situation where the + * rd_seq advances beyond our stored c_seq. In this situation + * only the observed wr_seq is stale, the fence still orders + * the load. See smr_poll() for details on how this condition + * is detected and handled there. + */ + atomic_store_relaxed(&self->c_seq, + smr_shared_current(self->c_shared)); + membar_sync(); +} + +/* + * Exit a read section. 
+ */ +static __inline void +smr_exit(smr_t smr) +{ + struct smr *self; + + self = smr_cpu_self(smr); + KASSERTMSG(kpreempt_disabled(), "smr_exit(%s): preemption enabled", + self->c_shared->s_name); + KASSERTMSG((self->c_flags & SMR_LAZY) == 0, + "smr_exit(%s) lazy smr.", self->c_shared->s_name); + KASSERTMSG(self->c_seq != SMR_SEQ_INVALID, + "smr_exit(%s) not in a smr section.", self->c_shared->s_name); + + /* + * Clear the recorded sequence number. This allows poll() to + * detect CPUs not in read sections. + * + * Use release semantics to retire any stores before the sequence + * number is cleared. + */ + atomic_store_release(&self->c_seq, SMR_SEQ_INVALID); + kpreempt_enable(); +} + +/* + * Enter a lazy smr section. This is used for read-mostly state that + * can tolerate a high free latency. + * + * IMPORTANT CALLER CONTRACT: smr_lazy_enter uses only kpreempt_disable, + * it does NOT raise the IPL. It must therefore NOT be called from a + * context where an interrupt (hardware or softint) can preempt the + * current thread AND call smr_lazy_enter on the same smr context. The + * KASSERT below would fire ("does not support recursion") because + * c_seq is per-CPU state. + * + * In practice, this means: + * - Softint-context callers (e.g. tcp_input, ipintr, icmp processing) + * are already at their softint IPL and cannot be preempted by a + * same-IPL softint, so no extra protection is needed. + * - User-context callers (e.g. bind/connect, ifnet detach) MUST wrap + * the smr_lazy_enter/exit pair in splsoftnet()/splx() (or higher) + * so that hardware interrupts do not trigger softint_fast_dispatch + * of a softint that would re-enter the same smr context on this + * CPU. See inpcb_lookup_local() in sys/netinet/in_pcb.c for an + * example. 
+ */
+static __inline void
+smr_lazy_enter(smr_t smr)
+{
+	struct smr *self;
+
+	kpreempt_disable();
+	self = smr_cpu_self(smr);
+	KASSERTMSG((self->c_flags & SMR_LAZY) != 0,
+	    "smr_lazy_enter(%s) non-lazy smr.", self->c_shared->s_name);
+	/*
+	 * Recursion-prevention KASSERT.  See the comment above this
+	 * function for the caller IPL contract.
+	 *
+	 * Any elevated IPL (ci_cpl != IPL_NONE) prevents softint
+	 * dispatch on the current CPU, which is what closes the
+	 * recursion race.  cpu_softintr_p() / cpu_intr_p() are a
+	 * defensive belt-and-suspenders for arch ports whose softint
+	 * machinery might not always raise ci_cpl to the softint's
+	 * IPL before calling into C.
+	 *
+	 * This catches the typical violation (user context at
+	 * IPL_NONE) immediately rather than waiting for a rare softint
+	 * preemption to trip the c_seq check.  The generic IPL check
+	 * is chosen instead of a specific level (e.g. IPL_SOFTNET) so
+	 * future non-network SMR contexts at other softint levels
+	 * (IPL_SOFTCLOCK, IPL_SOFTBIO, IPL_SOFTSERIAL) are also
+	 * covered without modification.
+	 */
+	KASSERTMSG(cpu_softintr_p() || cpu_intr_p() ||
+	    curcpu()->ci_cpl != IPL_NONE,
+	    "smr_lazy_enter(%s) from user context at IPL_NONE",
+	    self->c_shared->s_name);
+	KASSERTMSG(self->c_seq == SMR_SEQ_INVALID,
+	    "smr_lazy_enter(%s) does not support recursion.",
+	    self->c_shared->s_name);
+
+	/*
+	 * This needs no serialization.  If an interrupt occurs before we
+	 * assign wr_seq to c_seq any speculative loads will be discarded.
+	 * If we assign a stale wr_seq value due to interrupt we use the
+	 * same algorithm that renders smr_enter() safe.
+	 */
+	atomic_store_relaxed(&self->c_seq,
+	    smr_shared_current(self->c_shared));
+}
+
+/*
+ * Exit a lazy smr section.  This is used for read-mostly state that
+ * can tolerate a high free latency.
+ */ +static __inline void +smr_lazy_exit(smr_t smr) +{ + struct smr *self; + + self = smr_cpu_self(smr); + KASSERTMSG(kpreempt_disabled(), "smr_lazy_exit(%s): preemption enabled", + self->c_shared->s_name); + KASSERTMSG((self->c_flags & SMR_LAZY) != 0, + "smr_lazy_enter(%s) non-lazy smr.", self->c_shared->s_name); + KASSERTMSG(self->c_seq != SMR_SEQ_INVALID, + "smr_lazy_exit(%s) not in a smr section.", + self->c_shared->s_name); + + /* + * All loads/stores must be retired before the sequence becomes + * visible. Another alternative would be to omit the fence but + * store the exit time and wait 1 tick longer. + */ + membar_exit(); + atomic_store_relaxed(&self->c_seq, SMR_SEQ_INVALID); + kpreempt_enable(); +} + +/* + * Advances the write sequence number. Returns the sequence number + * required to ensure that all modifications are visible to readers. + */ +smr_seq_t smr_advance(smr_t smr); + +/* + * Returns true if a goal sequence has been reached. If + * wait is true this will busy loop until success. + */ +bool smr_poll(smr_t smr, smr_seq_t goal, bool wait); + +/* Create a new SMR context. */ +smr_t smr_create(const char *name, int limit, int flags); + +/* Destroy the context. */ +void smr_destroy(smr_t smr); + +/* + * Blocking wait for all readers to observe 'goal'. + */ +static __inline void +smr_wait(smr_t smr, smr_seq_t goal) +{ + + (void)smr_poll(smr, goal, true); +} + +/* + * Synchronize advances the write sequence and returns when all + * readers have observed it. + * + * If your application can cache a sequence number returned from + * smr_advance() and poll or wait at a later time there will + * be less chance of busy looping while waiting for readers. + */ +static __inline void +smr_synchronize(smr_t smr) +{ + + smr_wait(smr, smr_advance(smr)); +} + +/* Only at startup. */ +void smr_init(void); + +#endif /* _SYS_SMR_H_ */