Data Structures |
struct | arcs_lock |
struct | arc_state |
struct | arc_stats |
struct | arc_callback |
struct | arc_write_callback |
struct | arc_buf_hdr |
struct | ht_lock |
struct | buf_hash_table |
struct | l2arc_dev |
struct | l2arc_read_callback |
struct | l2arc_write_callback |
struct | l2arc_buf_hdr |
struct | l2arc_data_free |
Defines |
#define | ARC_REDUCE_DNLC_PERCENT 3 |
#define | ARCS_LOCK_PAD CACHE_LINE_SIZE |
#define | ARC_BUFC_NUMDATALISTS 16 |
| must be a power of two for mask use to work
|
#define | ARC_BUFC_NUMMETADATALISTS 16 |
#define | ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) |
#define | ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) |
#define | ARCSTAT(stat) (arc_stats.stat.value.ui64) |
#define | ARCSTAT_INCR(stat, val) atomic_add_64(&arc_stats.stat.value.ui64, (val)); |
#define | ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) |
#define | ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) |
#define | ARCSTAT_MAX(stat, val) |
#define | ARCSTAT_MAXSTAT(stat) ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) |
#define | ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) |
| We define a macro to allow ARC hits/misses to be easily broken down by two separate conditions, giving a total of four different subtypes for each of hits and misses (so eight statistics total).
|
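To illustrate, here is a hedged sketch of how such a conditional-stat macro can be defined and invoked. This is a simplified reconstruction, not the verbatim macro; the token-pasted counter names and the header fields (b_flags, b_type) used in the call site are assumptions consistent with the flags and types listed on this page.

/*
 * Sketch (assumption: simplified reconstruction). Two boolean conditions
 * select one of four counters via token pasting, producing names of the
 * form arcstat_<demand|prefetch>_<data|metadata>_<hits|misses>.
 */
#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
        if (cond1) {                                                        \
                if (cond2) {                                                \
                        ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat);   \
                } else {                                                    \
                        ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat);\
                }                                                           \
        } else {                                                            \
                if (cond2) {                                                \
                        ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat);\
                } else {                                                    \
                        ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
                }                                                           \
        }

/* Typical call site: count a hit, broken down by demand/prefetch and
 * data/metadata (hdr is an arc_buf_hdr_t *; field names are assumptions). */
ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch,
    hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits);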
#define | arc_size ARCSTAT(arcstat_size) |
| actual total arc size
|
#define | arc_p ARCSTAT(arcstat_p) |
| target size of MRU
|
#define | arc_c ARCSTAT(arcstat_c) |
| target size of cache
|
#define | arc_c_min ARCSTAT(arcstat_c_min) |
| min target cache size
|
#define | arc_c_max ARCSTAT(arcstat_c_max) |
| max target cache size
|
#define | GHOST_STATE(state) |
#define | ARC_IN_HASH_TABLE (1 << 9) |
| this buffer is hashed
|
#define | ARC_IO_IN_PROGRESS (1 << 10) |
| I/O in progress for buf.
|
#define | ARC_IO_ERROR (1 << 11) |
| I/O failed for buf.
|
#define | ARC_FREED_IN_READ (1 << 12) |
| buf freed while in read
|
#define | ARC_BUF_AVAILABLE (1 << 13) |
| block not in active use
|
#define | ARC_INDIRECT (1 << 14) |
| this is an indirect block
|
#define | ARC_FREE_IN_PROGRESS (1 << 15) |
| hdr about to be freed
|
#define | ARC_L2_WRITING (1 << 16) |
| L2ARC write in progress.
|
#define | ARC_L2_EVICTED (1 << 17) |
| evicted during I/O
|
#define | ARC_L2_WRITE_HEAD (1 << 18) |
| head of write list
|
#define | HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) |
#define | HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) |
#define | HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) |
#define | HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) |
#define | HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) |
#define | HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) |
#define | HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) |
#define | HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) |
#define | HDR_L2_READING(hdr) |
#define | HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) |
#define | HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) |
#define | HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) |
#define | HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) |
#define | L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) |
#define | HT_LOCK_PAD CACHE_LINE_SIZE |
#define | BUF_LOCKS 256 |
#define | BUF_HASH_INDEX(spa, dva, birth) (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) |
#define | BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) |
#define | BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) |
#define | HDR_LOCK(hdr) (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) |
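These macros implement lock striping: BUF_LOCKS (256) mutexes protect the entire hash table, and a header's lock is derived from its hash index. A minimal usage sketch (hedged: ht_table as the name of the collision-chain array is an assumption based on the buf_hash_table struct above):

/* Sketch: find the striped lock for a header and hold it while walking
 * that header's collision chain. */
uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
kmutex_t *lock = BUF_HASH_LOCK(idx);    /* same lock as HDR_LOCK(hdr) */

mutex_enter(lock);
/* ... safe to walk the chain at buf_hash_table.ht_table[idx] ... */
mutex_exit(lock);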
#define | L2ARC_WRITE_SIZE (8 * 1024 * 1024) |
| initial write max
|
#define | L2ARC_HEADROOM 2 |
| num of writes
|
#define | L2ARC_FEED_SECS 1 |
| caching interval secs
|
#define | L2ARC_FEED_MIN_MS 200 |
| min caching interval ms
|
#define | l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) |
#define | l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) |
#define | BUF_EMPTY(buf) |
#define | BUF_EQUAL(spa, dva, birth, buf) |
#define | ARC_MINTIME (hz>>4) |
Typedefs |
typedef enum arc_reclaim_strategy | arc_reclaim_strategy_t |
typedef struct arc_state | arc_state_t |
typedef struct arc_stats | arc_stats_t |
typedef struct l2arc_buf_hdr | l2arc_buf_hdr_t |
typedef struct arc_callback | arc_callback_t |
typedef struct arc_write_callback | arc_write_callback_t |
typedef struct buf_hash_table | buf_hash_table_t |
typedef struct l2arc_dev | l2arc_dev_t |
typedef struct l2arc_read_callback | l2arc_read_callback_t |
typedef struct l2arc_write_callback | l2arc_write_callback_t |
typedef struct l2arc_data_free | l2arc_data_free_t |
Enumerations |
enum | arc_reclaim_strategy { ARC_RECLAIM_AGGR, ARC_RECLAIM_CONS } |
Functions |
| TUNABLE_QUAD ("vfs.zfs.arc_max",&zfs_arc_max) |
| TUNABLE_QUAD ("vfs.zfs.arc_min",&zfs_arc_min) |
| TUNABLE_QUAD ("vfs.zfs.arc_meta_limit",&zfs_arc_meta_limit) |
| SYSCTL_DECL (_vfs_zfs) |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN,&zfs_arc_max, 0,"Maximum ARC size") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN,&zfs_arc_min, 0,"Minimum ARC size") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD,&arc_meta_used, 0,"ARC metadata used") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW,&arc_meta_limit, 0,"ARC metadata limit") |
static void | arc_get_data_buf (arc_buf_t *buf) |
| The buffer, supplied as the first argument, needs a data block.
|
static void | arc_access (arc_buf_hdr_t *buf, kmutex_t *hash_lock) |
| This routine is called whenever a buffer is accessed.
|
static int | arc_evict_needed (arc_buf_contents_t type) |
| Check if the cache has reached its limits and eviction is required prior to insert.
|
static void | arc_evict_ghost (arc_state_t *state, uint64_t spa, int64_t bytes) |
| Remove buffers from list until we've removed the specified number of bytes.
|
static boolean_t | l2arc_write_eligible (uint64_t spa_guid, arc_buf_hdr_t *ab) |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,&l2arc_write_max, 0,"max write size") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,&l2arc_write_boost, 0,"extra write during warmup") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,&l2arc_headroom, 0,"number of dev writes") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,&l2arc_feed_secs, 0,"interval seconds") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,&l2arc_feed_min_ms, 0,"min interval milliseconds") |
| SYSCTL_INT (_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,&l2arc_noprefetch, 0,"don't cache prefetch bufs") |
| SYSCTL_INT (_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,&l2arc_feed_again, 0,"turbo warmup") |
| SYSCTL_INT (_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,&l2arc_norw, 0,"no reads during writes") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,&ARC_anon.arcs_size, 0,"size of anonymous state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,&ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0,"size of anonymous state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,&ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0,"size of anonymous state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,&ARC_mru.arcs_size, 0,"size of mru state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,&ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0,"size of metadata in mru state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,&ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0,"size of data in mru state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,&ARC_mru_ghost.arcs_size, 0,"size of mru ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,&ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,"size of metadata in mru ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,&ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,"size of data in mru ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,&ARC_mfu.arcs_size, 0,"size of mfu state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,&ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0,"size of metadata in mfu state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,&ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0,"size of data in mfu state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,&ARC_mfu_ghost.arcs_size, 0,"size of mfu ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,&ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,"size of metadata in mfu ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,&ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,"size of data in mfu ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,&ARC_l2c_only.arcs_size, 0,"size of mru state") |
static void | l2arc_read_done (zio_t *zio) |
| A read to a cache device completed.
|
static void | l2arc_hdr_stat_add (void) |
static void | l2arc_hdr_stat_remove (void) |
static uint64_t | buf_hash (uint64_t spa, const dva_t *dva, uint64_t birth) |
static void | buf_discard_identity (arc_buf_hdr_t *hdr) |
static arc_buf_hdr_t * | buf_hash_find (uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) |
static arc_buf_hdr_t * | buf_hash_insert (arc_buf_hdr_t *buf, kmutex_t **lockp) |
| Insert an entry into the hash table.
|
static void | buf_hash_remove (arc_buf_hdr_t *buf) |
static void | buf_fini (void) |
static int | hdr_cons (void *vbuf, void *unused, int kmflag) |
| Constructor callback - called when the cache is empty and a new buf is requested.
|
static int | buf_cons (void *vbuf, void *unused, int kmflag) |
static void | hdr_dest (void *vbuf, void *unused) |
| Destructor callback - called when a cached buf is no longer required.
|
static void | buf_dest (void *vbuf, void *unused) |
static void | hdr_recl (void *unused) |
| Reclaim callback -- invoked when memory is low.
|
static void | buf_init (void) |
static void | arc_cksum_verify (arc_buf_t *buf) |
static int | arc_cksum_equal (arc_buf_t *buf) |
static void | arc_cksum_compute (arc_buf_t *buf, boolean_t force) |
void | arc_buf_thaw (arc_buf_t *buf) |
void | arc_buf_freeze (arc_buf_t *buf) |
static void | get_buf_info (arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock) |
static void | add_reference (arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) |
static int | remove_reference (arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) |
static void | arc_change_state (arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) |
| Move the supplied buffer to the indicated state.
|
void | arc_space_consume (uint64_t space, arc_space_type_t type) |
void | arc_space_return (uint64_t space, arc_space_type_t type) |
void * | arc_data_buf_alloc (uint64_t size) |
void | arc_data_buf_free (void *buf, uint64_t size) |
arc_buf_t * | arc_buf_alloc (spa_t *spa, int size, void *tag, arc_buf_contents_t type) |
arc_buf_t * | arc_loan_buf (spa_t *spa, int size) |
| Loan out an anonymous arc buffer.
|
void | arc_return_buf (arc_buf_t *buf, void *tag) |
| Return a loaned arc buffer to the arc.
|
void | arc_loan_inuse_buf (arc_buf_t *buf, void *tag) |
| Detach an arc_buf from a dbuf (tag)
|
static arc_buf_t * | arc_buf_clone (arc_buf_t *from) |
void | arc_buf_add_ref (arc_buf_t *buf, void *tag) |
static void | arc_buf_data_free (arc_buf_t *buf, void(*free_func)(void *, size_t)) |
| Free the arc data buffer.
|
static void | arc_buf_destroy (arc_buf_t *buf, boolean_t recycle, boolean_t all) |
static void | arc_hdr_destroy (arc_buf_hdr_t *hdr) |
void | arc_buf_free (arc_buf_t *buf, void *tag) |
int | arc_buf_remove_ref (arc_buf_t *buf, void *tag) |
int | arc_buf_size (arc_buf_t *buf) |
boolean_t | arc_buf_eviction_needed (arc_buf_t *buf) |
| Called from the DMU to determine if the current buffer should be evicted.
|
static void * | arc_evict (arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) |
| Evict buffers from list until we've removed the specified number of bytes.
|
static void | arc_adjust (void) |
static void | arc_do_user_evicts (void) |
void | arc_flush (spa_t *spa) |
| Flush all *evictable* data from the cache for the given spa.
|
void | arc_shrink (void) |
static int | arc_reclaim_needed (void) |
static void | arc_kmem_reap_now (arc_reclaim_strategy_t strat) |
static void | arc_reclaim_thread (void *dummy __unused) |
static void | arc_adapt (int bytes, arc_state_t *state) |
| Adapt arc info given the number of bytes we are trying to add and the state that we are coming from.
|
void | arc_bcopy_func (zio_t *zio, arc_buf_t *buf, void *arg) |
| a generic arc_done_func_t which you can use
|
void | arc_getbuf_func (zio_t *zio, arc_buf_t *buf, void *arg) |
| a generic arc_done_func_t
|
static void | arc_read_done (zio_t *zio) |
int | arc_read (zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) |
| "Read" the block block at the specified DVA (in bp) via the cache.
|
int | arc_read_nolock (zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) |
void | arc_set_callback (arc_buf_t *buf, arc_evict_func_t *func, void *private) |
int | arc_buf_evict (arc_buf_t *buf) |
| This is used by the DMU to let the ARC know that a buffer is being evicted, so the ARC should clean up.
|
void | arc_release (arc_buf_t *buf, void *tag) |
| Release this buffer from the cache.
|
int | arc_release_bp (arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, zbookmark_t *zb) |
| Release this buffer.
|
int | arc_released (arc_buf_t *buf) |
int | arc_has_callback (arc_buf_t *buf) |
static void | arc_write_ready (zio_t *zio) |
static void | arc_write_done (zio_t *zio) |
zio_t * | arc_write (zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, int zio_flags, const zbookmark_t *zb) |
static int | arc_memory_throttle (uint64_t reserve, uint64_t inflight_data, uint64_t txg) |
void | arc_tempreserve_clear (uint64_t reserve) |
int | arc_tempreserve_space (uint64_t reserve, uint64_t txg) |
static void | arc_lowmem (void *arg __unused, int howto __unused) |
void | arc_init (void) |
void | arc_fini (void) |
static uint64_t | l2arc_write_size (l2arc_dev_t *dev) |
static clock_t | l2arc_write_interval (clock_t began, uint64_t wanted, uint64_t wrote) |
static l2arc_dev_t * | l2arc_dev_get_next (void) |
| Cycle through L2ARC devices.
|
static void | l2arc_do_free_on_write () |
| Free buffers that were tagged for destruction.
|
static void | l2arc_write_done (zio_t *zio) |
| A write to a cache device has completed.
|
static list_t * | l2arc_list_locked (int list_num, kmutex_t **lock) |
| This is the list priority from which the L2ARC will search for pages to cache.
|
static void | l2arc_evict (l2arc_dev_t *dev, uint64_t distance, boolean_t all) |
| Evict buffers from the device write hand to the distance specified in bytes.
|
static uint64_t | l2arc_write_buffers (spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) |
| Find and write ARC buffers to the L2ARC device.
|
static void | l2arc_feed_thread (void *dummy __unused) |
| Feed the L2ARC with buffers from the ARC at regular intervals.
|
boolean_t | l2arc_vdev_present (vdev_t *vd) |
void | l2arc_add_vdev (spa_t *spa, vdev_t *vd) |
| Add a vdev for use by the L2ARC.
|
void | l2arc_remove_vdev (vdev_t *vd) |
| Remove a vdev from the L2ARC.
|
void | l2arc_init (void) |
void | l2arc_fini (void) |
void | l2arc_start (void) |
void | l2arc_stop (void) |
Variables |
static kmutex_t | arc_reclaim_thr_lock |
static kcondvar_t | arc_reclaim_thr_cv |
| used to signal reclaim thr
|
static uint8_t | arc_thread_exit |
int | zfs_write_limit_shift |
| 1/8th of physical memory
|
uint64_t | zfs_write_limit_max |
| max data payload per txg
|
kmutex_t | zfs_write_limit_lock |
uint_t | arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT |
static int | arc_grow_retry = 60 |
| number of seconds before growing cache again
|
static int | arc_p_min_shift = 4 |
| shift of arc_c for calculating both min and max arc_p
|
static int | arc_shrink_shift = 5 |
| log2(fraction of arc to reclaim)
|
static int | arc_min_prefetch_lifespan |
| minimum lifespan of a prefetch block in clock ticks (initialized in arc_init())
|
static int | arc_dead |
int | zfs_prefetch_disable |
static boolean_t | arc_warm |
| The arc has filled available memory and has now warmed up.
|
uint64_t | zfs_arc_max |
uint64_t | zfs_arc_min |
uint64_t | zfs_arc_meta_limit = 0 |
int | zfs_arc_grow_retry = 0 |
int | zfs_arc_shrink_shift = 0 |
int | zfs_arc_p_min_shift = 0 |
int | zfs_disable_dup_eviction = 0 |
static arc_state_t | ARC_anon |
static arc_state_t | ARC_mru |
static arc_state_t | ARC_mru_ghost |
static arc_state_t | ARC_mfu |
static arc_state_t | ARC_mfu_ghost |
static arc_state_t | ARC_l2c_only |
static arc_stats_t | arc_stats |
kstat_t * | arc_ksp |
static arc_state_t * | arc_anon |
static arc_state_t * | arc_mru |
static arc_state_t * | arc_mru_ghost |
static arc_state_t * | arc_mfu |
static arc_state_t * | arc_mfu_ghost |
static arc_state_t * | arc_l2c_only |
static int | arc_no_grow |
| Don't try to grow cache size.
|
static uint64_t | arc_tempreserve |
static uint64_t | arc_loaned_bytes |
static uint64_t | arc_meta_used |
static uint64_t | arc_meta_limit |
static uint64_t | arc_meta_max = 0 |
static arc_buf_t * | arc_eviction_list |
static kmutex_t | arc_eviction_mtx |
static arc_buf_hdr_t | arc_eviction_hdr |
static buf_hash_table_t | buf_hash_table |
uint64_t | zfs_crc64_table [256] |
uint64_t | l2arc_write_max = L2ARC_WRITE_SIZE |
| default max write size
|
uint64_t | l2arc_write_boost = L2ARC_WRITE_SIZE |
| extra write during warmup
|
uint64_t | l2arc_headroom = L2ARC_HEADROOM |
| number of dev writes
|
uint64_t | l2arc_feed_secs = L2ARC_FEED_SECS |
| interval seconds
|
uint64_t | l2arc_feed_min_ms = L2ARC_FEED_MIN_MS |
| min interval milliseconds
|
boolean_t | l2arc_noprefetch = B_TRUE |
| don't cache prefetch bufs
|
boolean_t | l2arc_feed_again = B_TRUE |
| turbo warmup
|
boolean_t | l2arc_norw = B_TRUE |
| no reads during writes
|
static list_t | L2ARC_dev_list |
| device list
|
static list_t * | l2arc_dev_list |
| device list pointer
|
static kmutex_t | l2arc_dev_mtx |
| device list mutex
|
static l2arc_dev_t * | l2arc_dev_last |
| last device used
|
static kmutex_t | l2arc_buflist_mtx |
| mutex for all buflists
|
static list_t | L2ARC_free_on_write |
| free after write buf list
|
static list_t * | l2arc_free_on_write |
| free after write list ptr
|
static kmutex_t | l2arc_free_on_write_mtx |
| mutex for list
|
static uint64_t | l2arc_ndev |
| number of devices
|
static kmutex_t | l2arc_feed_thr_lock |
static kcondvar_t | l2arc_feed_thr_cv |
static uint8_t | l2arc_thread_exit |
static kmem_cache_t * | hdr_cache |
static kmem_cache_t * | buf_cache |
static char * | arc_onloan_tag = "onloan" |
static int | needfree = 0 |
kmem_cache_t * | zio_buf_cache [] |
kmem_cache_t * | zio_data_buf_cache [] |
static kmutex_t | arc_lowmem_lock |
static eventhandler_tag | arc_event_lowmem = NULL |
DVA-based Adaptive Replacement Cache.
Megiddo and Modha's Adaptive Replacement Cache
While much of the theory of operation used here is based on the self-tuning, low overhead replacement cache presented by Megiddo and Modha at FAST 2003, there are some significant differences:
- The Megiddo and Modha model assumes any page is evictable. Pages in its cache cannot be "locked" into memory. This makes the eviction algorithm simple: evict the last page in the list. This also makes the performance characteristics easy to reason about. Our cache is not so simple. At any given moment, some subset of the blocks in the cache are un-evictable because we have handed out a reference to them. Blocks are only evictable when there are no external references active. This makes eviction far more problematic: we choose to evict the evictable blocks that are the "lowest" in the list.
There are times when it is not possible to evict the requested space. In these circumstances we are unable to adjust the cache size. To prevent the cache growing unbounded at these times we implement a "cache throttle" that slows the flow of new data into the cache until we can make space available.
- The Megiddo and Modha model assumes a fixed cache size. Pages are evicted when the cache is full and there is a cache miss. Our model has a variable sized cache. It grows with high use, but also tries to react to memory pressure from the operating system: decreasing its size when system memory is tight.
- The Megiddo and Modha model assumes a fixed page size. All elements of the cache are therefore exactly the same size. So when adjusting the cache size following a cache miss, it's simply a matter of choosing a single page to evict. In our model, we have variable sized cache blocks (ranging from 512 bytes to 128K bytes). We therefore choose a set of blocks to evict to make space for a cache miss that approximates as closely as possible the space used by the new block.
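A minimal sketch of the byte-driven eviction this implies, simplified from what arc_evict() is documented to do in the function list above (hedged: recycling, locking, and the move to a ghost state are omitted; b_refcnt and b_size are the header's reference count and buffer size):

/* Evict un-referenced buffers from the tail of a state's list until at
 * least `bytes` have been freed (or the list is exhausted). */
static int64_t
evict_bytes_sketch(list_t *list, int64_t bytes)
{
        arc_buf_hdr_t *ab, *prev;
        int64_t freed = 0;

        for (ab = list_tail(list); ab != NULL && freed < bytes; ab = prev) {
                prev = list_prev(list, ab);
                if (refcount_count(&ab->b_refcnt) != 0)
                        continue;       /* referenced: not evictable */
                freed += ab->b_size;
                /* ... free the data, move the header to a ghost state ... */
        }
        return (freed);
}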
See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" by N. Megiddo & D. Modha, FAST 2003
Locking Model
A new reference to a cache buffer can be obtained in two ways:
- via a hash table lookup using the DVA as a key
- via one of the ARC lists
The arc_read() interface uses method 1, while the internal arc algorithms for adjusting the cache use method 2. We therefore provide two types of locks:
- the hash table lock array
- the arc list locks
Buffers do not have their own mutexes, rather they rely on the hash table mutexes for the bulk of their protection (i.e. most fields in the arc_buf_hdr_t are protected by these mutexes).
buf_hash_find() returns the appropriate mutex (held) when it locates the requested buffer in the hash table. It returns NULL for the mutex if the buffer was not in the table.
buf_hash_remove() expects the appropriate hash mutex to be already held before it is invoked.
Each arc state also has a mutex which is used to protect the buffer list associated with the state. When attempting to obtain a hash table lock while holding an arc list lock you must use mutex_tryenter() to avoid deadlock. Also note that the active state mutex must be held before the ghost state mutex.
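A sketch of the two locking patterns under these rules (hedged: simplified, error paths and state details omitted; `list` and `list_lock` stand in for one arc state's buffer list and its lock):

static void
locking_patterns_sketch(uint64_t spa, const dva_t *dva, uint64_t birth,
    list_t *list, kmutex_t *list_lock)
{
        kmutex_t *hash_lock;
        arc_buf_hdr_t *hdr;

        /* Method 1: hash lookup. buf_hash_find() returns with the
         * appropriate hash mutex held when it finds the header. */
        hdr = buf_hash_find(spa, dva, birth, &hash_lock);
        if (hdr != NULL) {
                /* hdr fields are protected by hash_lock here. */
                mutex_exit(hash_lock);
        }

        /* Method 2: walking an arc list. The list lock is already held,
         * so hash locks may only be taken with mutex_tryenter(). */
        mutex_enter(list_lock);
        for (hdr = list_tail(list); hdr != NULL; hdr = list_prev(list, hdr)) {
                kmutex_t *hl = HDR_LOCK(hdr);

                if (!mutex_tryenter(hl))
                        continue;       /* contended: skip, never block */
                /* ... examine or evict the buffer ... */
                mutex_exit(hl);
        }
        mutex_exit(list_lock);
}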
Arc buffers may have an associated eviction callback function. This function will be invoked prior to removing the buffer (e.g. in arc_do_user_evicts()). Note however that the data associated with the buffer may be evicted prior to the callback. The callback must be made with *no locks held* (to prevent deadlock). Additionally, the users of callbacks must ensure that their private data is protected from simultaneous callbacks from arc_buf_evict() and arc_do_user_evicts().
Note that the majority of the performance stats are manipulated with atomic operations.
The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
- L2ARC buflist creation
- L2ARC buflist eviction
- L2ARC write completion, which walks L2ARC buflists
- ARC header destruction, as it removes from L2ARC buflists
- ARC header release, as it removes from L2ARC buflists
Arc Buffer States
Buffers can be in one of 6 states:
- ARC_anon - anonymous (discussed below)
- ARC_mru - recently used, currently cached
- ARC_mru_ghost - recently used, no longer in cache
- ARC_mfu - frequently used, currently cached
- ARC_mfu_ghost - frequently used, no longer in cache
- ARC_l2c_only - exists in L2ARC but not other states
When there are no active references to a buffer, it is linked onto a list in one of these arc states. These are the only buffers that can be evicted or deleted. Within each state there are multiple lists, one for meta-data and one for non-meta-data. Meta-data (indirect blocks, blocks of dnodes, etc.) is tracked separately so that it can be managed more explicitly: favored over data, limited explicitly.
Anonymous buffers are buffers that are not associated with a DVA. These are buffers that hold dirty block copies before they are written to stable storage. By definition, they are "ref'd" and are considered part of arc_mru that cannot be freed. Generally, they will acquire a DVA as they are written and migrate onto the arc_mru list.
The ARC_l2c_only state is for buffers that are in the second level ARC but no longer in any of the ARC_m* lists. The second level ARC itself may also contain buffers that are in any of the ARC_m* states - meaning that a buffer can exist in two places. The reason for the ARC_l2c_only state is to keep the buffer header in the hash table, so that reads that hit the second level ARC benefit from these fast lookups.
Level 2 ARC
The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. It uses dedicated storage devices to hold cached data, which are populated using large infrequent writes. The main role of this cache is to boost the performance of random read workloads. The intended L2ARC devices include short-stroked disks, solid state disks, and other media with substantially faster read latency than disk.
+-----------------------+
|         ARC           |
+-----------------------+
   |         ^     ^
   |         |     |
l2arc_feed_thread()    arc_read()
   |         |     |
   |  l2arc read   |
   V         |     |
+---------------+  |
|     L2ARC     |  |
+---------------+  |
    |    ^         |
    |    |         |
l2arc_write()      |
    |    |         |
    V    |         |
  +-------+      +-------+
  | vdev  |      | vdev  |
  | cache |      | cache |
  +-------+      +-------+
  +=========+     .-----.
  :  L2ARC  :    |-_____-|
  : devices :    | Disks |
  +=========+    `-_____-'
Read requests are satisfied from the following sources, in order:
- ARC
- vdev cache of L2ARC devices
- L2ARC devices
- vdev cache of disks
- disks
Some L2ARC device types exhibit extremely slow write performance. To accommodate this there are some significant differences between the L2ARC and traditional cache design:
- There is no eviction path from the ARC to the L2ARC. Evictions from the ARC behave as usual, freeing buffers and placing headers on ghost lists. The ARC does not send buffers to the L2ARC during eviction as this would add inflated write latencies for all ARC memory pressure.
- The L2ARC attempts to cache data from the ARC before it is evicted. It does this by periodically scanning buffers from the eviction-end of the MFU and MRU ARC lists, copying them to the L2ARC devices if they are not already there. It scans until a headroom of buffers is satisfied, which itself is a buffer for ARC eviction. The thread that does this is l2arc_feed_thread(), illustrated below; example sizes are included to provide a better sense of ratio than this diagram:
           head -->                        tail
            +---------------------+----------+
    ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
            +---------------------+----------+   |   o L2ARC eligible
    ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
            +---------------------+----------+   |
                 15.9 Gbytes      ^ 32 Mbytes    |
                              headroom           |
                                         l2arc_feed_thread()
                                                 |
                     l2arc write hand <--[oooo]--'
                             |           8 Mbyte
                             |          write max
                             V
          +==============================+
L2ARC dev |####|#|###|###|    |####| ... |
          +==============================+
                     32 Gbytes
- If an ARC buffer is copied to the L2ARC but then hit instead of evicted, then the L2ARC has cached a buffer much sooner than it probably needed to, potentially wasting L2ARC device bandwidth and storage. It is safe to say that this is an uncommon case, since buffers at the end of the ARC lists have moved there due to inactivity.
- If the ARC evicts faster than the L2ARC can maintain a headroom, then the L2ARC simply misses copying some buffers. This serves as a pressure valve to prevent heavy read workloads from both stalling the ARC with waits and clogging the L2ARC with writes. This also helps prevent the potential for the L2ARC to churn if it attempts to cache content too quickly, such as during backups of the entire pool.
- After system boot and before the ARC has filled main memory, there are no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru lists can remain mostly static. Instead of searching from the tail of these lists as pictured, the l2arc_feed_thread() will search from the list heads for eligible buffers, greatly increasing its chance of finding them.
The L2ARC device write speed is also boosted during this time so that the L2ARC warms up faster. Since there have been no ARC evictions yet, there are no L2ARC reads, and no fear of degrading read performance through increased writes.
- Writes to the L2ARC devices are grouped and sent in-sequence, so that the vdev queue can aggregate them into larger and fewer writes. Each device is written to in a rotor fashion, sweeping writes through available space then repeating (a simplified sketch of this feed loop follows the list).
- The L2ARC does not store dirty content. It never needs to flush write buffers back to disk-based storage.
- If an ARC buffer is written (and dirtied) which also exists in the L2ARC, the now stale L2ARC buffer is immediately dropped.
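Taken together, one pass of the feed cycle reduces to a loop along these lines. This is a heavily simplified sketch, not the real l2arc_feed_thread(): locking, shutdown handling, spa references, and the no-device case are omitted, and l2ad_spa as the device's back-pointer to its pool is an assumption.

static void
feed_cycle_sketch(void)
{
        clock_t begin, next;
        l2arc_dev_t *dev;
        uint64_t size, wrote;

        begin = ddi_get_lbolt();
        next = begin + hz * l2arc_feed_secs;

        dev = l2arc_dev_get_next();             /* rotor over cache devices */
        if (dev != NULL) {
                size = l2arc_write_size(dev);   /* how much this interval */

                /* Evict ahead of the write hand to make room. */
                l2arc_evict(dev, size, B_FALSE);

                /* Copy eligible ARC buffers to the device. */
                wrote = l2arc_write_buffers(dev->l2ad_spa, dev, size);

                /* Feed again sooner or later, depending on what was written. */
                next = l2arc_write_interval(begin, size, wrote);
        }
        /* ... cv_timedwait(&l2arc_feed_thr_cv, ..., next) until next pass ... */
}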
The performance of the L2ARC can be tweaked by a number of tunables, which may be necessary for different workloads:
- l2arc_write_max max write bytes per interval
- l2arc_write_boost extra write bytes during device warmup
- l2arc_noprefetch skip caching prefetched buffers
- l2arc_headroom number of max device writes to precache
- l2arc_feed_secs seconds between L2ARC writing
Tunables may be removed or added as future performance improvements are integrated, and also may become zpool properties.
There are three key functions that control how the L2ARC warms up: l2arc_write_eligible() checks whether a buffer may be cached, l2arc_write_size() calculates how much to write, and l2arc_write_interval() calculates the delay between writes. Together they determine what to write, how much, and how quickly to send writes.
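For illustration, the last two plausibly reduce to logic like the following sketch, inferred from the tunables and flags listed above (l2arc_write_max, l2arc_write_boost, arc_warm, l2arc_feed_again, l2arc_feed_min_ms, l2arc_feed_secs). This is not the verbatim implementation.

/* Sketch: how much to write each interval, boosted until the ARC has
 * warmed up (see l2arc_write_boost above). */
static uint64_t
write_size_sketch(void)
{
        uint64_t size = l2arc_write_max;        /* default max write size */

        if (arc_warm == B_FALSE)                /* no ARC evictions yet */
                size += l2arc_write_boost;      /* extra write during warmup */
        return (size);
}

/* Sketch: when to feed again. If most of the wanted bytes were written
 * and turbo warmup is enabled, use the shorter minimum interval. */
static clock_t
write_interval_sketch(clock_t began, uint64_t wanted, uint64_t wrote)
{
        clock_t interval;

        if (l2arc_feed_again && wrote > (wanted / 2))
                interval = (hz * l2arc_feed_min_ms) / 1000;
        else
                interval = hz * l2arc_feed_secs;
        return (began + interval);              /* next wakeup, in ticks */
}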
Definition in file arc.c.