Data Structures |
struct | arcs_lock |
struct | arc_state |
struct | arc_stats |
struct | arc_callback |
struct | arc_write_callback |
struct | arc_buf_hdr |
struct | ht_lock |
struct | buf_hash_table |
struct | l2arc_dev |
struct | l2arc_read_callback |
struct | l2arc_write_callback |
struct | l2arc_buf_hdr |
struct | l2arc_data_free |
Defines |
#define | ARC_REDUCE_DNLC_PERCENT 3 |
#define | ARCS_LOCK_PAD CACHE_LINE_SIZE |
#define | ARC_BUFC_NUMDATALISTS 16 |
| must be a power of two for mask use to work
|
#define | ARC_BUFC_NUMMETADATALISTS 16 |
#define | ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) |
#define | ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) |
#define | ARCSTAT(stat) (arc_stats.stat.value.ui64) |
#define | ARCSTAT_INCR(stat, val) atomic_add_64(&arc_stats.stat.value.ui64, (val)); |
#define | ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) |
#define | ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) |
#define | ARCSTAT_MAX(stat, val) |
#define | ARCSTAT_MAXSTAT(stat) ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) |
#define | ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) |
| We define a macro to allow ARC hits/misses to be easily broken down by two separate conditions, giving a total of four different subtypes for each of hits and misses (so eight statistics total).
|
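To illustrate, here is a hedged sketch of how such a conditional-stat macro can be defined and invoked. This is a simplified reconstruction, not the verbatim macro; the token-pasted counter names and the header fields (b_flags, b_type) used in the call site are assumptions consistent with the flags and types listed on this page.

/*
 * Sketch (assumption: simplified reconstruction). Two boolean conditions
 * select one of four counters via token pasting, producing names of the
 * form arcstat_<demand|prefetch>_<data|metadata>_<hits|misses>.
 */
#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
        if (cond1) {                                                        \
                if (cond2) {                                                \
                        ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat);   \
                } else {                                                    \
                        ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat);\
                }                                                           \
        } else {                                                            \
                if (cond2) {                                                \
                        ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat);\
                } else {                                                    \
                        ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
                }                                                           \
        }

/* Typical call site: count a hit, broken down by demand/prefetch and
 * data/metadata (hdr is an arc_buf_hdr_t *; field names are assumptions). */
ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch,
    hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits);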
#define | arc_size ARCSTAT(arcstat_size) |
| actual total arc size
|
#define | arc_p ARCSTAT(arcstat_p) |
| target size of MRU
|
#define | arc_c ARCSTAT(arcstat_c) |
| target size of cache
|
#define | arc_c_min ARCSTAT(arcstat_c_min) |
| min target cache size
|
#define | arc_c_max ARCSTAT(arcstat_c_max) |
| max target cache size
|
#define | GHOST_STATE(state) |
#define | ARC_IN_HASH_TABLE (1 << 9) |
| this buffer is hashed
|
#define | ARC_IO_IN_PROGRESS (1 << 10) |
| I/O in progress for buf.
|
#define | ARC_IO_ERROR (1 << 11) |
| I/O failed for buf.
|
#define | ARC_FREED_IN_READ (1 << 12) |
| buf freed while in read
|
#define | ARC_BUF_AVAILABLE (1 << 13) |
| block not in active use
|
#define | ARC_INDIRECT (1 << 14) |
| this is an indirect block
|
#define | ARC_FREE_IN_PROGRESS (1 << 15) |
| hdr about to be freed
|
#define | ARC_L2_WRITING (1 << 16) |
| L2ARC write in progress.
|
#define | ARC_L2_EVICTED (1 << 17) |
| evicted during I/O
|
#define | ARC_L2_WRITE_HEAD (1 << 18) |
| head of write list
|
#define | HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) |
#define | HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) |
#define | HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) |
#define | HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) |
#define | HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) |
#define | HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) |
#define | HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) |
#define | HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) |
#define | HDR_L2_READING(hdr) |
#define | HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) |
#define | HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) |
#define | HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) |
#define | HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) |
#define | L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) |
#define | HT_LOCK_PAD CACHE_LINE_SIZE |
#define | BUF_LOCKS 256 |
#define | BUF_HASH_INDEX(spa, dva, birth) (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) |
#define | BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) |
#define | BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) |
#define | HDR_LOCK(hdr) (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) |
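These macros implement lock striping: BUF_LOCKS (256) mutexes protect the entire hash table, and a header's lock is derived from its hash index. A minimal usage sketch (hedged: ht_table as the name of the collision-chain array is an assumption based on the buf_hash_table struct above):

/* Sketch: find the striped lock for a header and hold it while walking
 * that header's collision chain. */
uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
kmutex_t *lock = BUF_HASH_LOCK(idx);    /* same lock as HDR_LOCK(hdr) */

mutex_enter(lock);
/* ... safe to walk the chain at buf_hash_table.ht_table[idx] ... */
mutex_exit(lock);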
#define | L2ARC_WRITE_SIZE (8 * 1024 * 1024) |
| initial write max
|
#define | L2ARC_HEADROOM 2 |
| num of writes
|
#define | L2ARC_FEED_SECS 1 |
| caching interval secs
|
#define | L2ARC_FEED_MIN_MS 200 |
| min caching interval ms
|
#define | l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) |
#define | l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) |
#define | BUF_EMPTY(buf) |
#define | BUF_EQUAL(spa, dva, birth, buf) |
#define | ARC_MINTIME (hz>>4) |
Typedefs |
typedef enum arc_reclaim_strategy | arc_reclaim_strategy_t |
typedef struct arc_state | arc_state_t |
typedef struct arc_stats | arc_stats_t |
typedef struct l2arc_buf_hdr | l2arc_buf_hdr_t |
typedef struct arc_callback | arc_callback_t |
typedef struct arc_write_callback | arc_write_callback_t |
typedef struct buf_hash_table | buf_hash_table_t |
typedef struct l2arc_dev | l2arc_dev_t |
typedef struct l2arc_read_callback | l2arc_read_callback_t |
typedef struct l2arc_write_callback | l2arc_write_callback_t |
typedef struct l2arc_data_free | l2arc_data_free_t |
Enumerations |
enum | arc_reclaim_strategy { ARC_RECLAIM_AGGR, ARC_RECLAIM_CONS } |
Functions |
| TUNABLE_QUAD ("vfs.zfs.arc_max",&zfs_arc_max) |
| TUNABLE_QUAD ("vfs.zfs.arc_min",&zfs_arc_min) |
| TUNABLE_QUAD ("vfs.zfs.arc_meta_limit",&zfs_arc_meta_limit) |
| SYSCTL_DECL (_vfs_zfs) |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN,&zfs_arc_max, 0,"Maximum ARC size") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN,&zfs_arc_min, 0,"Minimum ARC size") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD,&arc_meta_used, 0,"ARC metadata used") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW,&arc_meta_limit, 0,"ARC metadata limit") |
static void | arc_get_data_buf (arc_buf_t *buf) |
| The buffer, supplied as the first argument, needs a data block.
|
static void | arc_access (arc_buf_hdr_t *buf, kmutex_t *hash_lock) |
| This routine is called whenever a buffer is accessed.
|
static int | arc_evict_needed (arc_buf_contents_t type) |
| Check if the cache has reached its limits and eviction is required prior to insert.
|
static void | arc_evict_ghost (arc_state_t *state, uint64_t spa, int64_t bytes) |
| Remove buffers from list until we've removed the specified number of bytes.
|
static boolean_t | l2arc_write_eligible (uint64_t spa_guid, arc_buf_hdr_t *ab) |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,&l2arc_write_max, 0,"max write size") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,&l2arc_write_boost, 0,"extra write during warmup") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,&l2arc_headroom, 0,"number of dev writes") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,&l2arc_feed_secs, 0,"interval seconds") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,&l2arc_feed_min_ms, 0,"min interval milliseconds") |
| SYSCTL_INT (_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,&l2arc_noprefetch, 0,"don't cache prefetch bufs") |
| SYSCTL_INT (_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,&l2arc_feed_again, 0,"turbo warmup") |
| SYSCTL_INT (_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,&l2arc_norw, 0,"no reads during writes") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,&ARC_anon.arcs_size, 0,"size of anonymous state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,&ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0,"size of anonymous state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,&ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0,"size of anonymous state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,&ARC_mru.arcs_size, 0,"size of mru state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,&ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0,"size of metadata in mru state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,&ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0,"size of data in mru state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,&ARC_mru_ghost.arcs_size, 0,"size of mru ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,&ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,"size of metadata in mru ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,&ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,"size of data in mru ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,&ARC_mfu.arcs_size, 0,"size of mfu state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,&ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0,"size of metadata in mfu state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,&ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0,"size of data in mfu state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,&ARC_mfu_ghost.arcs_size, 0,"size of mfu ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,&ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,"size of metadata in mfu ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,&ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,"size of data in mfu ghost state") |
| SYSCTL_UQUAD (_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,&ARC_l2c_only.arcs_size, 0,"size of mru state") |
static void | l2arc_read_done (zio_t *zio) |
| A read to a cache device completed.
|
static void | l2arc_hdr_stat_add (void) |
static void | l2arc_hdr_stat_remove (void) |
static uint64_t | buf_hash (uint64_t spa, const dva_t *dva, uint64_t birth) |
static void | buf_discard_identity (arc_buf_hdr_t *hdr) |
static arc_buf_hdr_t * | buf_hash_find (uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) |
static arc_buf_hdr_t * | buf_hash_insert (arc_buf_hdr_t *buf, kmutex_t **lockp) |
| Insert an entry into the hash table.
|
static void | buf_hash_remove (arc_buf_hdr_t *buf) |
static void | buf_fini (void) |
static int | hdr_cons (void *vbuf, void *unused, int kmflag) |
| Constructor callback - called when the cache is empty and a new buf is requested.
|
static int | buf_cons (void *vbuf, void *unused, int kmflag) |
static void | hdr_dest (void *vbuf, void *unused) |
| Destructor callback - called when a cached buf is no longer required.
|
static void | buf_dest (void *vbuf, void *unused) |
static void | hdr_recl (void *unused) |
| Reclaim callback -- invoked when memory is low.
|
static void | buf_init (void) |
static void | arc_cksum_verify (arc_buf_t *buf) |
static int | arc_cksum_equal (arc_buf_t *buf) |
static void | arc_cksum_compute (arc_buf_t *buf, boolean_t force) |
void | arc_buf_thaw (arc_buf_t *buf) |
void | arc_buf_freeze (arc_buf_t *buf) |
static void | get_buf_info (arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock) |
static void | add_reference (arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) |
static int | remove_reference (arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) |
static void | arc_change_state (arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) |
| Move the supplied buffer to the indicated state.
|
void | arc_space_consume (uint64_t space, arc_space_type_t type) |
void | arc_space_return (uint64_t space, arc_space_type_t type) |
void * | arc_data_buf_alloc (uint64_t size) |
void | arc_data_buf_free (void *buf, uint64_t size) |
arc_buf_t * | arc_buf_alloc (spa_t *spa, int size, void *tag, arc_buf_contents_t type) |
arc_buf_t * | arc_loan_buf (spa_t *spa, int size) |
| Loan out an anonymous arc buffer.
|
void | arc_return_buf (arc_buf_t *buf, void *tag) |
| Return a loaned arc buffer to the arc.
|
void | arc_loan_inuse_buf (arc_buf_t *buf, void *tag) |
| Detach an arc_buf from a dbuf (tag)
|
static arc_buf_t * | arc_buf_clone (arc_buf_t *from) |
void | arc_buf_add_ref (arc_buf_t *buf, void *tag) |
static void | arc_buf_data_free (arc_buf_t *buf, void(*free_func)(void *, size_t)) |
| Free the arc data buffer.
|
static void | arc_buf_destroy (arc_buf_t *buf, boolean_t recycle, boolean_t all) |
static void | arc_hdr_destroy (arc_buf_hdr_t *hdr) |
void | arc_buf_free (arc_buf_t *buf, void *tag) |
int | arc_buf_remove_ref (arc_buf_t *buf, void *tag) |
int | arc_buf_size (arc_buf_t *buf) |
boolean_t | arc_buf_eviction_needed (arc_buf_t *buf) |
| Called from the DMU to determine if the current buffer should be evicted.
|
static void * | arc_evict (arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) |
| Evict buffers from list until we've removed the specified number of bytes.
|
static void | arc_adjust (void) |
static void | arc_do_user_evicts (void) |
void | arc_flush (spa_t *spa) |
| Flush all *evictable* data from the cache for the given spa.
|
void | arc_shrink (void) |
static int | arc_reclaim_needed (void) |
static void | arc_kmem_reap_now (arc_reclaim_strategy_t strat) |
static void | arc_reclaim_thread (void *dummy __unused) |
static void | arc_adapt (int bytes, arc_state_t *state) |
| Adapt arc info given the number of bytes we are trying to add and the state that we are coming from.
|
void | arc_bcopy_func (zio_t *zio, arc_buf_t *buf, void *arg) |
| a generic arc_done_func_t which you can use
|
void | arc_getbuf_func (zio_t *zio, arc_buf_t *buf, void *arg) |
| a generic arc_done_func_t
|
static void | arc_read_done (zio_t *zio) |
int | arc_read (zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) |
| "Read" the block block at the specified DVA (in bp) via the cache.
|
int | arc_read_nolock (zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) |
void | arc_set_callback (arc_buf_t *buf, arc_evict_func_t *func, void *private) |
int | arc_buf_evict (arc_buf_t *buf) |
| This is used by the DMU to let the ARC know that a buffer is being evicted, so the ARC should clean up.
|
void | arc_release (arc_buf_t *buf, void *tag) |
| Release this buffer from the cache.
|
int | arc_release_bp (arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, zbookmark_t *zb) |
| Release this buffer.
|
int | arc_released (arc_buf_t *buf) |
int | arc_has_callback (arc_buf_t *buf) |
static void | arc_write_ready (zio_t *zio) |
static void | arc_write_done (zio_t *zio) |
zio_t * | arc_write (zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, int zio_flags, const zbookmark_t *zb) |
static int | arc_memory_throttle (uint64_t reserve, uint64_t inflight_data, uint64_t txg) |
void | arc_tempreserve_clear (uint64_t reserve) |
int | arc_tempreserve_space (uint64_t reserve, uint64_t txg) |
static void | arc_lowmem (void *arg __unused, int howto __unused) |
void | arc_init (void) |
void | arc_fini (void) |
static uint64_t | l2arc_write_size (l2arc_dev_t *dev) |
static clock_t | l2arc_write_interval (clock_t began, uint64_t wanted, uint64_t wrote) |
static l2arc_dev_t * | l2arc_dev_get_next (void) |
| Cycle through L2ARC devices.
|
static void | l2arc_do_free_on_write () |
| Free buffers that were tagged for destruction.
|
static void | l2arc_write_done (zio_t *zio) |
| A write to a cache device has completed.
|
static list_t * | l2arc_list_locked (int list_num, kmutex_t **lock) |
| This is the list priority from which the L2ARC will search for pages to cache.
|
static void | l2arc_evict (l2arc_dev_t *dev, uint64_t distance, boolean_t all) |
| Evict buffers from the device write hand to the distance specified in bytes.
|
static uint64_t | l2arc_write_buffers (spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) |
| Find and write ARC buffers to the L2ARC device.
|
static void | l2arc_feed_thread (void *dummy __unused) |
| Feed the L2ARC with buffers from the ARC at regular intervals.
|
boolean_t | l2arc_vdev_present (vdev_t *vd) |
void | l2arc_add_vdev (spa_t *spa, vdev_t *vd) |
| Add a vdev for use by the L2ARC.
|
void | l2arc_remove_vdev (vdev_t *vd) |
| Remove a vdev from the L2ARC.
|
void | l2arc_init (void) |
void | l2arc_fini (void) |
void | l2arc_start (void) |
void | l2arc_stop (void) |
Variables |
static kmutex_t | arc_reclaim_thr_lock |
static kcondvar_t | arc_reclaim_thr_cv |
| used to signal reclaim thr
|
static uint8_t | arc_thread_exit |
int | zfs_write_limit_shift |
| 1/8th of physical memory
|
uint64_t | zfs_write_limit_max |
| max data payload per txg
|
kmutex_t | zfs_write_limit_lock |
uint_t | arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT |
static int | arc_grow_retry = 60 |
| number of seconds before growing cache again
|
static int | arc_p_min_shift = 4 |
| shift of arc_c for calculating both min and max arc_p
|
static int | arc_shrink_shift = 5 |
| log2(fraction of arc to reclaim)
|
static int | arc_min_prefetch_lifespan |
| minimum lifespan of a prefetch block in clock ticks (initialized in arc_init())
|
static int | arc_dead |
int | zfs_prefetch_disable |
static boolean_t | arc_warm |
| The arc has filled available memory and has now warmed up.
|
uint64_t | zfs_arc_max |
uint64_t | zfs_arc_min |
uint64_t | zfs_arc_meta_limit = 0 |
int | zfs_arc_grow_retry = 0 |
int | zfs_arc_shrink_shift = 0 |
int | zfs_arc_p_min_shift = 0 |
int | zfs_disable_dup_eviction = 0 |
static arc_state_t | ARC_anon |
static arc_state_t | ARC_mru |
static arc_state_t | ARC_mru_ghost |
static arc_state_t | ARC_mfu |
static arc_state_t | ARC_mfu_ghost |
static arc_state_t | ARC_l2c_only |
static arc_stats_t | arc_stats |
kstat_t * | arc_ksp |
static arc_state_t * | arc_anon |
static arc_state_t * | arc_mru |
static arc_state_t * | arc_mru_ghost |
static arc_state_t * | arc_mfu |
static arc_state_t * | arc_mfu_ghost |
static arc_state_t * | arc_l2c_only |
static int | arc_no_grow |
| Don't try to grow cache size.
|
static uint64_t | arc_tempreserve |
static uint64_t | arc_loaned_bytes |
static uint64_t | arc_meta_used |
static uint64_t | arc_meta_limit |
static uint64_t | arc_meta_max = 0 |
static arc_buf_t * | arc_eviction_list |
static kmutex_t | arc_eviction_mtx |
static arc_buf_hdr_t | arc_eviction_hdr |
static buf_hash_table_t | buf_hash_table |
uint64_t | zfs_crc64_table [256] |
uint64_t | l2arc_write_max = L2ARC_WRITE_SIZE |
| default max write size
|
uint64_t | l2arc_write_boost = L2ARC_WRITE_SIZE |
| extra write during warmup
|
uint64_t | l2arc_headroom = L2ARC_HEADROOM |
| number of dev writes
|
uint64_t | l2arc_feed_secs = L2ARC_FEED_SECS |
| interval seconds
|
uint64_t | l2arc_feed_min_ms = L2ARC_FEED_MIN_MS |
| min interval milliseconds
|
boolean_t | l2arc_noprefetch = B_TRUE |
| don't cache prefetch bufs
|
boolean_t | l2arc_feed_again = B_TRUE |
| turbo warmup
|
boolean_t | l2arc_norw = B_TRUE |
| no reads during writes
|
static list_t | L2ARC_dev_list |
| device list
|
static list_t * | l2arc_dev_list |
| device list pointer
|
static kmutex_t | l2arc_dev_mtx |
| device list mutex
|
static l2arc_dev_t * | l2arc_dev_last |
| last device used
|
static kmutex_t | l2arc_buflist_mtx |
| mutex for all buflists
|
static list_t | L2ARC_free_on_write |
| free after write buf list
|
static list_t * | l2arc_free_on_write |
| free after write list ptr
|
static kmutex_t | l2arc_free_on_write_mtx |
| mutex for list
|
static uint64_t | l2arc_ndev |
| number of devices
|
static kmutex_t | l2arc_feed_thr_lock |
static kcondvar_t | l2arc_feed_thr_cv |
static uint8_t | l2arc_thread_exit |
static kmem_cache_t * | hdr_cache |
static kmem_cache_t * | buf_cache |
static char * | arc_onloan_tag = "onloan" |
static int | needfree = 0 |
kmem_cache_t * | zio_buf_cache [] |
kmem_cache_t * | zio_data_buf_cache [] |
static kmutex_t | arc_lowmem_lock |
static eventhandler_tag | arc_event_lowmem = NULL |
DVA-based Adaptive Replacement Cache.
Megiddo and Modha's Adaptive Replacement Cache
While much of the theory of operation used here is based on the self-tuning, low overhead replacement cache presented by Megiddo and Modha at FAST 2003, there are some significant differences:
- The Megiddo and Modha model assumes any page is evictable. Pages in its cache cannot be "locked" into memory. This makes the eviction algorithm simple: evict the last page in the list. This also makes the performance characteristics easy to reason about. Our cache is not so simple. At any given moment, some subset of the blocks in the cache are un-evictable because we have handed out a reference to them. Blocks are only evictable when there are no external references active. This makes eviction far more problematic: we choose to evict the evictable blocks that are the "lowest" in the list.
There are times when it is not possible to evict the requested space. In these circumstances we are unable to adjust the cache size. To prevent the cache growing unbounded at these times we implement a "cache throttle" that slows the flow of new data into the cache until we can make space available.
- The Megiddo and Modha model assumes a fixed cache size. Pages are evicted when the cache is full and there is a cache miss. Our model has a variable sized cache. It grows with high use, but also tries to react to memory pressure from the operating system: decreasing its size when system memory is tight.
- The Megiddo and Modha model assumes a fixed page size. All elements of the cache are therefore exactly the same size. So when adjusting the cache size following a cache miss, it's simply a matter of choosing a single page to evict. In our model, we have variable sized cache blocks (ranging from 512 bytes to 128K bytes). We therefore choose a set of blocks to evict to make space for a cache miss that approximates as closely as possible the space used by the new block.
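A minimal sketch of the byte-driven eviction this implies, simplified from what arc_evict() is documented to do in the function list above (hedged: recycling, locking, and the move to a ghost state are omitted; b_refcnt and b_size are the header's reference count and buffer size):

/* Evict un-referenced buffers from the tail of a state's list until at
 * least `bytes` have been freed (or the list is exhausted). */
static int64_t
evict_bytes_sketch(list_t *list, int64_t bytes)
{
        arc_buf_hdr_t *ab, *prev;
        int64_t freed = 0;

        for (ab = list_tail(list); ab != NULL && freed < bytes; ab = prev) {
                prev = list_prev(list, ab);
                if (refcount_count(&ab->b_refcnt) != 0)
                        continue;       /* referenced: not evictable */
                freed += ab->b_size;
                /* ... free the data, move the header to a ghost state ... */
        }
        return (freed);
}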
See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" by N. Megiddo & D. Modha, FAST 2003
Locking Model
A new reference to a cache buffer can be obtained in two ways:
- via a hash table lookup using the DVA as a key
- via one of the ARC lists
The arc_read() interface uses method 1, while the internal arc algorithms for adjusting the cache use method 2. We therefore provide two types of locks:
- the hash table lock array
- the arc list locks
Buffers do not have their own mutexes, rather they rely on the hash table mutexes for the bulk of their protection (i.e. most fields in the arc_buf_hdr_t are protected by these mutexes).
buf_hash_find() returns the appropriate mutex (held) when it locates the requested buffer in the hash table. It returns NULL for the mutex if the buffer was not in the table.
buf_hash_remove() expects the appropriate hash mutex to be already held before it is invoked.
Each arc state also has a mutex which is used to protect the buffer list associated with the state. When attempting to obtain a hash table lock while holding an arc list lock you must use mutex_tryenter() to avoid deadlock. Also note that the active state mutex must be held before the ghost state mutex.
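A sketch of the two locking patterns under these rules (hedged: simplified, error paths and state details omitted; `list` and `list_lock` stand in for one arc state's buffer list and its lock):

static void
locking_patterns_sketch(uint64_t spa, const dva_t *dva, uint64_t birth,
    list_t *list, kmutex_t *list_lock)
{
        kmutex_t *hash_lock;
        arc_buf_hdr_t *hdr;

        /* Method 1: hash lookup. buf_hash_find() returns with the
         * appropriate hash mutex held when it finds the header. */
        hdr = buf_hash_find(spa, dva, birth, &hash_lock);
        if (hdr != NULL) {
                /* hdr fields are protected by hash_lock here. */
                mutex_exit(hash_lock);
        }

        /* Method 2: walking an arc list. The list lock is already held,
         * so hash locks may only be taken with mutex_tryenter(). */
        mutex_enter(list_lock);
        for (hdr = list_tail(list); hdr != NULL; hdr = list_prev(list, hdr)) {
                kmutex_t *hl = HDR_LOCK(hdr);

                if (!mutex_tryenter(hl))
                        continue;       /* contended: skip, never block */
                /* ... examine or evict the buffer ... */
                mutex_exit(hl);
        }
        mutex_exit(list_lock);
}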
Arc buffers may have an associated eviction callback function. This function will be invoked prior to removing the buffer (e.g. in arc_do_user_evicts()). Note however that the data associated with the buffer may be evicted prior to the callback. The callback must be made with *no locks held* (to prevent deadlock). Additionally, the users of callbacks must ensure that their private data is protected from simultaneous callbacks from arc_buf_evict() and arc_do_user_evicts().
Note that the majority of the performance stats are manipulated with atomic operations.
The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
- L2ARC buflist creation
- L2ARC buflist eviction
- L2ARC write completion, which walks L2ARC buflists
- ARC header destruction, as it removes from L2ARC buflists
- ARC header release, as it removes from L2ARC buflists
Arc Buffer States
Buffers can be in one of 6 states:
- ARC_anon - anonymous (discussed below)
- ARC_mru - recently used, currently cached
- ARC_mru_ghost - recently used, no longer in cache
- ARC_mfu - frequently used, currently cached
- ARC_mfu_ghost - frequently used, no longer in cache
- ARC_l2c_only - exists in L2ARC but not other states
When there are no active references to a buffer, it is linked onto a list in one of these arc states. These are the only buffers that can be evicted or deleted. Within each state there are multiple lists, one for meta-data and one for non-meta-data. Meta-data (indirect blocks, blocks of dnodes, etc.) is tracked separately so that it can be managed more explicitly: favored over data, limited explicitly.
Anonymous buffers are buffers that are not associated with a DVA. These are buffers that hold dirty block copies before they are written to stable storage. By definition, they are "ref'd" and are considered part of arc_mru that cannot be freed. Generally, they will acquire a DVA as they are written and migrate onto the arc_mru list.
The ARC_l2c_only state is for buffers that are in the second level ARC but no longer in any of the ARC_m* lists. The second level ARC itself may also contain buffers that are in any of the ARC_m* states - meaning that a buffer can exist in two places. The reason for the ARC_l2c_only state is to keep the buffer header in the hash table, so that reads that hit the second level ARC benefit from these fast lookups.
Level 2 ARC
The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. It uses dedicated storage devices to hold cached data, which are populated using large infrequent writes. The main role of this cache is to boost the performance of random read workloads. The intended L2ARC devices include short-stroked disks, solid state disks, and other media with substantially faster read latency than disk.
+-----------------------+
|         ARC           |
+-----------------------+
   |         ^     ^
   |         |     |
l2arc_feed_thread()    arc_read()
   |         |     |
   |  l2arc read   |
   V         |     |
+---------------+  |
|     L2ARC     |  |
+---------------+  |
    |    ^         |
    |    |         |
l2arc_write()      |
    |    |         |
    V    |         |
  +-------+      +-------+
  | vdev  |      | vdev  |
  | cache |      | cache |
  +-------+      +-------+
  +=========+     .-----.
  :  L2ARC  :    |-_____-|
  : devices :    | Disks |
  +=========+    `-_____-'
Read requests are satisfied from the following sources, in order:
- ARC
- vdev cache of L2ARC devices
- L2ARC devices
- vdev cache of disks
- disks
Some L2ARC device types exhibit extremely slow write performance. To accommodate this there are some significant differences between the L2ARC and traditional cache design:
- There is no eviction path from the ARC to the L2ARC. Evictions from the ARC behave as usual, freeing buffers and placing headers on ghost lists. The ARC does not send buffers to the L2ARC during eviction as this would add inflated write latencies for all ARC memory pressure.
- The L2ARC attempts to cache data from the ARC before it is evicted. It does this by periodically scanning buffers from the eviction-end of the MFU and MRU ARC lists, copying them to the L2ARC devices if they are not already there. It scans until a headroom of buffers is satisfied, which itself is a buffer for ARC eviction. The thread that does this is l2arc_feed_thread(), illustrated below; example sizes are included to provide a better sense of ratio than this diagram:
           head -->                        tail
            +---------------------+----------+
    ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
            +---------------------+----------+   |   o L2ARC eligible
    ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
            +---------------------+----------+   |
                 15.9 Gbytes      ^ 32 Mbytes    |
                              headroom           |
                                         l2arc_feed_thread()
                                                 |
                     l2arc write hand <--[oooo]--'
                             |           8 Mbyte
                             |          write max
                             V
          +==============================+
L2ARC dev |####|#|###|###|    |####| ... |
          +==============================+
                     32 Gbytes
- If an ARC buffer is copied to the L2ARC but then hit instead of evicted, then the L2ARC has cached a buffer much sooner than it probably needed to, potentially wasting L2ARC device bandwidth and storage. It is safe to say that this is an uncommon case, since buffers at the end of the ARC lists have moved there due to inactivity.
- If the ARC evicts faster than the L2ARC can maintain a headroom, then the L2ARC simply misses copying some buffers. This serves as a pressure valve to prevent heavy read workloads from both stalling the ARC with waits and clogging the L2ARC with writes. This also helps prevent the potential for the L2ARC to churn if it attempts to cache content too quickly, such as during backups of the entire pool.
- After system boot and before the ARC has filled main memory, there are no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru lists can remain mostly static. Instead of searching from the tail of these lists as pictured, the l2arc_feed_thread() will search from the list heads for eligible buffers, greatly increasing its chance of finding them.
The L2ARC device write speed is also boosted during this time so that the L2ARC warms up faster. Since there have been no ARC evictions yet, there are no L2ARC reads, and no fear of degrading read performance through increased writes.
- Writes to the L2ARC devices are grouped and sent in-sequence, so that the vdev queue can aggregate them into larger and fewer writes. Each device is written to in a rotor fashion, sweeping writes through available space then repeating (a simplified sketch of this feed loop follows the list).
- The L2ARC does not store dirty content. It never needs to flush write buffers back to disk-based storage.
- If an ARC buffer is written (and dirtied) which also exists in the L2ARC, the now stale L2ARC buffer is immediately dropped.
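Taken together, one pass of the feed cycle reduces to a loop along these lines. This is a heavily simplified sketch, not the real l2arc_feed_thread(): locking, shutdown handling, spa references, and the no-device case are omitted, and l2ad_spa as the device's back-pointer to its pool is an assumption.

static void
feed_cycle_sketch(void)
{
        clock_t begin, next;
        l2arc_dev_t *dev;
        uint64_t size, wrote;

        begin = ddi_get_lbolt();
        next = begin + hz * l2arc_feed_secs;

        dev = l2arc_dev_get_next();             /* rotor over cache devices */
        if (dev != NULL) {
                size = l2arc_write_size(dev);   /* how much this interval */

                /* Evict ahead of the write hand to make room. */
                l2arc_evict(dev, size, B_FALSE);

                /* Copy eligible ARC buffers to the device. */
                wrote = l2arc_write_buffers(dev->l2ad_spa, dev, size);

                /* Feed again sooner or later, depending on what was written. */
                next = l2arc_write_interval(begin, size, wrote);
        }
        /* ... cv_timedwait(&l2arc_feed_thr_cv, ..., next) until next pass ... */
}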
The performance of the L2ARC can be tweaked by a number of tunables, which may be necessary for different workloads:
- l2arc_write_max max write bytes per interval
- l2arc_write_boost extra write bytes during device warmup
- l2arc_noprefetch skip caching prefetched buffers
- l2arc_headroom number of max device writes to precache
- l2arc_feed_secs seconds between L2ARC writing
Tunables may be removed or added as future performance improvements are integrated, and also may become zpool properties.
There are three key functions that control how the L2ARC warms up: l2arc_write_eligible() checks whether a buffer may be cached, l2arc_write_size() calculates how much to write, and l2arc_write_interval() calculates the delay between writes. Together they determine what to write, how much, and how quickly to send writes.
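For illustration, the last two plausibly reduce to logic like the following sketch, inferred from the tunables and flags listed above (l2arc_write_max, l2arc_write_boost, arc_warm, l2arc_feed_again, l2arc_feed_min_ms, l2arc_feed_secs). This is not the verbatim implementation.

/* Sketch: how much to write each interval, boosted until the ARC has
 * warmed up (see l2arc_write_boost above). */
static uint64_t
write_size_sketch(void)
{
        uint64_t size = l2arc_write_max;        /* default max write size */

        if (arc_warm == B_FALSE)                /* no ARC evictions yet */
                size += l2arc_write_boost;      /* extra write during warmup */
        return (size);
}

/* Sketch: when to feed again. If most of the wanted bytes were written
 * and turbo warmup is enabled, use the shorter minimum interval. */
static clock_t
write_interval_sketch(clock_t began, uint64_t wanted, uint64_t wrote)
{
        clock_t interval;

        if (l2arc_feed_again && wrote > (wanted / 2))
                interval = (hz * l2arc_feed_min_ms) / 1000;
        else
                interval = hz * l2arc_feed_secs;
        return (began + interval);              /* next wakeup, in ticks */
}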
Definition in file arc.c.