FreeBSD ZFS
The Zettabyte File System
00001 /* 00002 * CDDL HEADER START 00003 * 00004 * The contents of this file are subject to the terms of the 00005 * Common Development and Distribution License (the "License"). 00006 * You may not use this file except in compliance with the License. 00007 * 00008 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 00009 * or http://www.opensolaris.org/os/licensing. 00010 * See the License for the specific language governing permissions 00011 * and limitations under the License. 00012 * 00013 * When distributing Covered Code, include this CDDL HEADER in each 00014 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 00015 * If applicable, add the following below this CDDL HEADER, with the 00016 * fields enclosed by brackets "[]" replaced with your own identifying 00017 * information: Portions Copyright [yyyy] [name of copyright owner] 00018 * 00019 * CDDL HEADER END 00020 */ 00021 /* 00022 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 00023 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 00024 * Copyright (c) 2011 by Delphix. All rights reserved. 00025 */ 00026 00125 #include <sys/spa.h> 00126 #include <sys/zio.h> 00127 #include <sys/zfs_context.h> 00128 #include <sys/arc.h> 00129 #include <sys/refcount.h> 00130 #include <sys/vdev.h> 00131 #include <sys/vdev_impl.h> 00132 #ifdef _KERNEL 00133 #include <sys/dnlc.h> 00134 #endif 00135 #include <sys/callb.h> 00136 #include <sys/kstat.h> 00137 #include <zfs_fletcher.h> 00138 #include <sys/sdt.h> 00139 00140 #include <vm/vm_pageout.h> 00141 00142 #ifdef illumos 00143 #ifndef _KERNEL 00144 00145 boolean_t arc_watch = B_FALSE; 00146 int arc_procfd; 00147 #endif 00148 #endif /* illumos */ 00149 00150 static kmutex_t arc_reclaim_thr_lock; 00151 static kcondvar_t arc_reclaim_thr_cv; 00152 static uint8_t arc_thread_exit; 00153 00154 extern int zfs_write_limit_shift; 00155 extern uint64_t zfs_write_limit_max; 00156 extern kmutex_t zfs_write_limit_lock; 00157 00158 #define ARC_REDUCE_DNLC_PERCENT 3 00159 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 00160 00161 typedef enum arc_reclaim_strategy { 00162 ARC_RECLAIM_AGGR, 00163 ARC_RECLAIM_CONS 00164 } arc_reclaim_strategy_t; 00165 00167 static int arc_grow_retry = 60; 00168 00170 static int arc_p_min_shift = 4; 00171 00173 static int arc_shrink_shift = 5; 00174 00179 static int arc_min_prefetch_lifespan; 00180 00181 static int arc_dead; 00182 extern int zfs_prefetch_disable; 00183 00187 static boolean_t arc_warm; 00188 00189 /* 00190 * These tunables are for performance analysis. 
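 * On FreeBSD, the externally settable ones (zfs_arc_max, zfs_arc_min,
 * zfs_arc_meta_limit) are hooked up as loader tunables, and arc_max/arc_min
 * additionally as read-only sysctls, immediately below.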
00191 */ 00196 uint64_t zfs_arc_max; 00197 uint64_t zfs_arc_min; 00198 uint64_t zfs_arc_meta_limit = 0; 00200 int zfs_arc_grow_retry = 0; 00201 int zfs_arc_shrink_shift = 0; 00202 int zfs_arc_p_min_shift = 0; 00203 int zfs_disable_dup_eviction = 0; 00204 00205 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); 00206 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); 00207 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 00208 SYSCTL_DECL(_vfs_zfs); 00209 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, 00210 "Maximum ARC size"); 00211 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, 00212 "Minimum ARC size"); 00213 00250 #define ARCS_LOCK_PAD CACHE_LINE_SIZE 00251 struct arcs_lock { 00252 kmutex_t arcs_lock; 00253 #ifdef _KERNEL 00254 unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; 00255 #endif 00256 }; 00257 00261 #define ARC_BUFC_NUMDATALISTS 16 00262 #define ARC_BUFC_NUMMETADATALISTS 16 00263 #define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) 00264 00265 typedef struct arc_state { 00266 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; 00267 uint64_t arcs_size; 00268 list_t arcs_lists[ARC_BUFC_NUMLISTS]; 00269 struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); 00270 } arc_state_t; 00271 00272 #define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) 00273 00274 /* The 6 states: */ 00275 static arc_state_t ARC_anon; 00276 static arc_state_t ARC_mru; 00277 static arc_state_t ARC_mru_ghost; 00278 static arc_state_t ARC_mfu; 00279 static arc_state_t ARC_mfu_ghost; 00280 static arc_state_t ARC_l2c_only; 00281 00282 typedef struct arc_stats { 00283 kstat_named_t arcstat_hits; 00284 kstat_named_t arcstat_misses; 00285 kstat_named_t arcstat_demand_data_hits; 00286 kstat_named_t arcstat_demand_data_misses; 00287 kstat_named_t arcstat_demand_metadata_hits; 00288 kstat_named_t arcstat_demand_metadata_misses; 00289 kstat_named_t arcstat_prefetch_data_hits; 00290 kstat_named_t arcstat_prefetch_data_misses; 00291 kstat_named_t arcstat_prefetch_metadata_hits; 00292 kstat_named_t arcstat_prefetch_metadata_misses; 00293 kstat_named_t arcstat_mru_hits; 00294 kstat_named_t arcstat_mru_ghost_hits; 00295 kstat_named_t arcstat_mfu_hits; 00296 kstat_named_t arcstat_mfu_ghost_hits; 00297 kstat_named_t arcstat_allocated; 00298 kstat_named_t arcstat_deleted; 00299 kstat_named_t arcstat_stolen; 00300 kstat_named_t arcstat_recycle_miss; 00301 kstat_named_t arcstat_mutex_miss; 00302 kstat_named_t arcstat_evict_skip; 00303 kstat_named_t arcstat_evict_l2_cached; 00304 kstat_named_t arcstat_evict_l2_eligible; 00305 kstat_named_t arcstat_evict_l2_ineligible; 00306 kstat_named_t arcstat_hash_elements; 00307 kstat_named_t arcstat_hash_elements_max; 00308 kstat_named_t arcstat_hash_collisions; 00309 kstat_named_t arcstat_hash_chains; 00310 kstat_named_t arcstat_hash_chain_max; 00311 kstat_named_t arcstat_p; 00312 kstat_named_t arcstat_c; 00313 kstat_named_t arcstat_c_min; 00314 kstat_named_t arcstat_c_max; 00315 kstat_named_t arcstat_size; 00316 kstat_named_t arcstat_hdr_size; 00317 kstat_named_t arcstat_data_size; 00318 kstat_named_t arcstat_other_size; 00319 kstat_named_t arcstat_l2_hits; 00320 kstat_named_t arcstat_l2_misses; 00321 kstat_named_t arcstat_l2_feeds; 00322 kstat_named_t arcstat_l2_rw_clash; 00323 kstat_named_t arcstat_l2_read_bytes; 00324 kstat_named_t arcstat_l2_write_bytes; 00325 kstat_named_t arcstat_l2_writes_sent; 00326 kstat_named_t arcstat_l2_writes_done; 00327 kstat_named_t 
arcstat_l2_writes_error; 00328 kstat_named_t arcstat_l2_writes_hdr_miss; 00329 kstat_named_t arcstat_l2_evict_lock_retry; 00330 kstat_named_t arcstat_l2_evict_reading; 00331 kstat_named_t arcstat_l2_free_on_write; 00332 kstat_named_t arcstat_l2_abort_lowmem; 00333 kstat_named_t arcstat_l2_cksum_bad; 00334 kstat_named_t arcstat_l2_io_error; 00335 kstat_named_t arcstat_l2_size; 00336 kstat_named_t arcstat_l2_hdr_size; 00337 kstat_named_t arcstat_l2_write_trylock_fail; 00338 kstat_named_t arcstat_l2_write_passed_headroom; 00339 kstat_named_t arcstat_l2_write_spa_mismatch; 00340 kstat_named_t arcstat_l2_write_in_l2; 00341 kstat_named_t arcstat_l2_write_hdr_io_in_progress; 00342 kstat_named_t arcstat_l2_write_not_cacheable; 00343 kstat_named_t arcstat_l2_write_full; 00344 kstat_named_t arcstat_l2_write_buffer_iter; 00345 kstat_named_t arcstat_l2_write_pios; 00346 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 00347 kstat_named_t arcstat_l2_write_buffer_list_iter; 00348 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 00349 kstat_named_t arcstat_memory_throttle_count; 00350 kstat_named_t arcstat_duplicate_buffers; 00351 kstat_named_t arcstat_duplicate_buffers_size; 00352 kstat_named_t arcstat_duplicate_reads; 00353 } arc_stats_t; 00354 00355 static arc_stats_t arc_stats = { 00356 { "hits", KSTAT_DATA_UINT64 }, 00357 { "misses", KSTAT_DATA_UINT64 }, 00358 { "demand_data_hits", KSTAT_DATA_UINT64 }, 00359 { "demand_data_misses", KSTAT_DATA_UINT64 }, 00360 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 00361 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 00362 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 00363 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 00364 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 00365 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 00366 { "mru_hits", KSTAT_DATA_UINT64 }, 00367 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 00368 { "mfu_hits", KSTAT_DATA_UINT64 }, 00369 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 00370 { "allocated", KSTAT_DATA_UINT64 }, 00371 { "deleted", KSTAT_DATA_UINT64 }, 00372 { "stolen", KSTAT_DATA_UINT64 }, 00373 { "recycle_miss", KSTAT_DATA_UINT64 }, 00374 { "mutex_miss", KSTAT_DATA_UINT64 }, 00375 { "evict_skip", KSTAT_DATA_UINT64 }, 00376 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 00377 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 00378 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 00379 { "hash_elements", KSTAT_DATA_UINT64 }, 00380 { "hash_elements_max", KSTAT_DATA_UINT64 }, 00381 { "hash_collisions", KSTAT_DATA_UINT64 }, 00382 { "hash_chains", KSTAT_DATA_UINT64 }, 00383 { "hash_chain_max", KSTAT_DATA_UINT64 }, 00384 { "p", KSTAT_DATA_UINT64 }, 00385 { "c", KSTAT_DATA_UINT64 }, 00386 { "c_min", KSTAT_DATA_UINT64 }, 00387 { "c_max", KSTAT_DATA_UINT64 }, 00388 { "size", KSTAT_DATA_UINT64 }, 00389 { "hdr_size", KSTAT_DATA_UINT64 }, 00390 { "data_size", KSTAT_DATA_UINT64 }, 00391 { "other_size", KSTAT_DATA_UINT64 }, 00392 { "l2_hits", KSTAT_DATA_UINT64 }, 00393 { "l2_misses", KSTAT_DATA_UINT64 }, 00394 { "l2_feeds", KSTAT_DATA_UINT64 }, 00395 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 00396 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 00397 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 00398 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 00399 { "l2_writes_done", KSTAT_DATA_UINT64 }, 00400 { "l2_writes_error", KSTAT_DATA_UINT64 }, 00401 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 00402 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 00403 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 00404 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 00405 { "l2_abort_lowmem", 
KSTAT_DATA_UINT64 }, 00406 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 00407 { "l2_io_error", KSTAT_DATA_UINT64 }, 00408 { "l2_size", KSTAT_DATA_UINT64 }, 00409 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 00410 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 00411 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 00412 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 00413 { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 00414 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 00415 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 00416 { "l2_write_full", KSTAT_DATA_UINT64 }, 00417 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 00418 { "l2_write_pios", KSTAT_DATA_UINT64 }, 00419 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 00420 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 00421 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 00422 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 00423 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 00424 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 00425 { "duplicate_reads", KSTAT_DATA_UINT64 } 00426 }; 00427 00428 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 00429 00430 #define ARCSTAT_INCR(stat, val) \ 00431 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 00432 00433 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 00434 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 00435 00436 #define ARCSTAT_MAX(stat, val) { \ 00437 uint64_t m; \ 00438 while ((val) > (m = arc_stats.stat.value.ui64) && \ 00439 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 00440 continue; \ 00441 } 00442 00443 #define ARCSTAT_MAXSTAT(stat) \ 00444 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 00445 00451 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 00452 if (cond1) { \ 00453 if (cond2) { \ 00454 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 00455 } else { \ 00456 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 00457 } \ 00458 } else { \ 00459 if (cond2) { \ 00460 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 00461 } else { \ 00462 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 00463 } \ 00464 } 00465 00466 kstat_t *arc_ksp; 00467 static arc_state_t *arc_anon; 00468 static arc_state_t *arc_mru; 00469 static arc_state_t *arc_mru_ghost; 00470 static arc_state_t *arc_mfu; 00471 static arc_state_t *arc_mfu_ghost; 00472 static arc_state_t *arc_l2c_only; 00473 00474 /* 00475 * There are several ARC variables that are critical to export as kstats -- 00476 * but we don't want to have to grovel around in the kstat whenever we wish to 00477 * manipulate them. For these variables, we therefore define them to be in 00478 * terms of the statistic variable. This assures that we are not introducing 00479 * the possibility of inconsistency by having shadow copies of the variables, 00480 * while still allowing the code to be readable. 
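 * For example, arc_size below expands to ARCSTAT(arcstat_size), i.e. to
 * arc_stats.arcstat_size.value.ui64, so every update of arc_size is
 * immediately reflected in the exported kstat with no shadow copy to keep
 * in sync.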
00481 */ 00482 #define arc_size ARCSTAT(arcstat_size) 00483 #define arc_p ARCSTAT(arcstat_p) 00484 #define arc_c ARCSTAT(arcstat_c) 00485 #define arc_c_min ARCSTAT(arcstat_c_min) 00486 #define arc_c_max ARCSTAT(arcstat_c_max) 00488 static int arc_no_grow; 00489 static uint64_t arc_tempreserve; 00490 static uint64_t arc_loaned_bytes; 00491 static uint64_t arc_meta_used; 00492 static uint64_t arc_meta_limit; 00493 static uint64_t arc_meta_max = 0; 00494 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0, 00495 "ARC metadata used"); 00496 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0, 00497 "ARC metadata limit"); 00498 00499 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 00500 00501 typedef struct arc_callback arc_callback_t; 00502 00503 struct arc_callback { 00504 void *acb_private; 00505 arc_done_func_t *acb_done; 00506 arc_buf_t *acb_buf; 00507 zio_t *acb_zio_dummy; 00508 arc_callback_t *acb_next; 00509 }; 00510 00511 typedef struct arc_write_callback arc_write_callback_t; 00512 00513 struct arc_write_callback { 00514 void *awcb_private; 00515 arc_done_func_t *awcb_ready; 00516 arc_done_func_t *awcb_done; 00517 arc_buf_t *awcb_buf; 00518 }; 00519 00520 struct arc_buf_hdr { 00521 /* protected by hash lock */ 00522 dva_t b_dva; 00523 uint64_t b_birth; 00524 uint64_t b_cksum0; 00525 00526 kmutex_t b_freeze_lock; 00527 zio_cksum_t *b_freeze_cksum; 00528 void *b_thawed; 00529 00530 arc_buf_hdr_t *b_hash_next; 00531 arc_buf_t *b_buf; 00532 uint32_t b_flags; 00533 uint32_t b_datacnt; 00534 00535 arc_callback_t *b_acb; 00536 kcondvar_t b_cv; 00537 00538 /* immutable */ 00539 arc_buf_contents_t b_type; 00540 uint64_t b_size; 00541 uint64_t b_spa; 00542 00543 /* protected by arc state mutex */ 00544 arc_state_t *b_state; 00545 list_node_t b_arc_node; 00546 00547 /* updated atomically */ 00548 clock_t b_arc_access; 00549 00550 /* self protecting */ 00551 refcount_t b_refcnt; 00552 00553 l2arc_buf_hdr_t *b_l2hdr; 00554 list_node_t b_l2node; 00555 }; 00556 00557 static arc_buf_t *arc_eviction_list; 00558 static kmutex_t arc_eviction_mtx; 00559 static arc_buf_hdr_t arc_eviction_hdr; 00560 static void arc_get_data_buf(arc_buf_t *buf); 00561 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 00562 static int arc_evict_needed(arc_buf_contents_t type); 00563 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); 00564 #ifdef illumos 00565 static void arc_buf_watch(arc_buf_t *buf); 00566 #endif /* illumos */ 00567 00568 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); 00569 00570 #define GHOST_STATE(state) \ 00571 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 00572 (state) == arc_l2c_only) 00573 00574 /* 00575 * Private ARC flags. These flags are private ARC only flags that will show up 00576 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 00577 * be passed in as arc_flags in things like arc_read. However, these flags 00578 * should never be passed and should only be set by ARC code. When adding new 00579 * public flags, make sure not to smash the private ones. 
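 * (The private flags below begin at bit 9; the public arc_flags such as
 * ARC_PREFETCH and ARC_L2CACHE that the HDR_* macros also test are declared
 * in the public ARC header and occupy the lower bits.)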
00580 */ 00581 00582 #define ARC_IN_HASH_TABLE (1 << 9) 00583 #define ARC_IO_IN_PROGRESS (1 << 10) 00584 #define ARC_IO_ERROR (1 << 11) 00585 #define ARC_FREED_IN_READ (1 << 12) 00586 #define ARC_BUF_AVAILABLE (1 << 13) 00587 #define ARC_INDIRECT (1 << 14) 00588 #define ARC_FREE_IN_PROGRESS (1 << 15) 00589 #define ARC_L2_WRITING (1 << 16) 00590 #define ARC_L2_EVICTED (1 << 17) 00591 #define ARC_L2_WRITE_HEAD (1 << 18) 00593 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 00594 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 00595 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 00596 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 00597 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 00598 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 00599 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 00600 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 00601 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 00602 (hdr)->b_l2hdr != NULL) 00603 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 00604 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 00605 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 00606 00607 /* 00608 * Other sizes 00609 */ 00610 00611 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 00612 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 00613 00614 /* 00615 * Hash table routines 00616 */ 00617 00618 #define HT_LOCK_PAD CACHE_LINE_SIZE 00619 00620 struct ht_lock { 00621 kmutex_t ht_lock; 00622 #ifdef _KERNEL 00623 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 00624 #endif 00625 }; 00626 00627 #define BUF_LOCKS 256 00628 typedef struct buf_hash_table { 00629 uint64_t ht_mask; 00630 arc_buf_hdr_t **ht_table; 00631 struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 00632 } buf_hash_table_t; 00633 00634 static buf_hash_table_t buf_hash_table; 00635 00636 #define BUF_HASH_INDEX(spa, dva, birth) \ 00637 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 00638 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 00639 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 00640 #define HDR_LOCK(hdr) \ 00641 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 00642 00643 uint64_t zfs_crc64_table[256]; 00644 00645 /* 00646 * Level 2 ARC 00647 */ 00648 00649 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) 00650 #define L2ARC_HEADROOM 2 00651 #define L2ARC_FEED_SECS 1 00652 #define L2ARC_FEED_MIN_MS 200 00654 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 00655 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 00656 00657 /* L2ARC Performance Tunables */ 00662 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; 00663 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; 00664 uint64_t l2arc_headroom = L2ARC_HEADROOM; 00665 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; 00666 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; 00667 boolean_t l2arc_noprefetch = B_TRUE; 00668 boolean_t l2arc_feed_again = B_TRUE; 00669 boolean_t l2arc_norw = B_TRUE; 00672 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 00673 &l2arc_write_max, 0, "max write size"); 00674 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 00675 &l2arc_write_boost, 0, "extra write during warmup"); 00676 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 00677 &l2arc_headroom, 0, "number of dev writes"); 00678 
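/*
 * Illustration only (shell, not part of this file): the tunables above and
 * below show up under the vfs.zfs sysctl tree, e.g.
 *
 *	sysctl vfs.zfs.l2arc_write_max=16777216
 *	sysctl vfs.zfs.l2arc_noprefetch=0
 *
 * The CTLFLAG_RW entries may be changed at runtime; CTLFLAG_RDTUN entries
 * such as vfs.zfs.arc_max are read-only and must be set as loader tunables.
 */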
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 00679 &l2arc_feed_secs, 0, "interval seconds"); 00680 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 00681 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 00682 00683 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 00684 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 00685 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 00686 &l2arc_feed_again, 0, "turbo warmup"); 00687 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 00688 &l2arc_norw, 0, "no reads during writes"); 00689 00690 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 00691 &ARC_anon.arcs_size, 0, "size of anonymous state"); 00692 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 00693 &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); 00694 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 00695 &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); 00696 00697 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 00698 &ARC_mru.arcs_size, 0, "size of mru state"); 00699 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 00700 &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 00701 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 00702 &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 00703 00704 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 00705 &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 00706 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 00707 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 00708 "size of metadata in mru ghost state"); 00709 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 00710 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 00711 "size of data in mru ghost state"); 00712 00713 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 00714 &ARC_mfu.arcs_size, 0, "size of mfu state"); 00715 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 00716 &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 00717 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 00718 &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 00719 00720 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 00721 &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 00722 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 00723 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 00724 "size of metadata in mfu ghost state"); 00725 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 00726 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 00727 "size of data in mfu ghost state"); 00728 00729 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 00730 &ARC_l2c_only.arcs_size, 0, "size of mru state"); 00731 00732 /* 00733 * L2ARC Internals 00734 */ 00735 typedef struct l2arc_dev { 00736 vdev_t *l2ad_vdev; 00737 spa_t *l2ad_spa; 00738 uint64_t l2ad_hand; 00739 uint64_t l2ad_write; 00740 uint64_t l2ad_boost; 00741 uint64_t l2ad_start; 00742 uint64_t l2ad_end; 00743 uint64_t l2ad_evict; 00744 boolean_t l2ad_first; 00745 boolean_t l2ad_writing; 00746 list_t *l2ad_buflist; 00747 list_node_t l2ad_node; 00748 } l2arc_dev_t; 00749 00750 static list_t L2ARC_dev_list; 00751 static list_t *l2arc_dev_list; 00752 static kmutex_t l2arc_dev_mtx; 00753 static l2arc_dev_t *l2arc_dev_last; 00754 static kmutex_t l2arc_buflist_mtx; 00755 
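/*
 * Data buffers still referenced by an in-flight L2ARC write cannot be freed
 * right away; arc_buf_data_free() queues them on the l2arc_free_on_write
 * list declared below, and they are released once that write completes.
 */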
static list_t L2ARC_free_on_write; 00756 static list_t *l2arc_free_on_write; 00757 static kmutex_t l2arc_free_on_write_mtx; 00758 static uint64_t l2arc_ndev; 00760 typedef struct l2arc_read_callback { 00761 arc_buf_t *l2rcb_buf; 00762 spa_t *l2rcb_spa; 00763 blkptr_t l2rcb_bp; 00764 zbookmark_t l2rcb_zb; 00765 int l2rcb_flags; 00766 } l2arc_read_callback_t; 00767 00768 typedef struct l2arc_write_callback { 00769 l2arc_dev_t *l2wcb_dev; 00770 arc_buf_hdr_t *l2wcb_head; 00771 } l2arc_write_callback_t; 00772 00773 struct l2arc_buf_hdr { 00774 /* protected by arc_buf_hdr mutex */ 00775 l2arc_dev_t *b_dev; 00776 uint64_t b_daddr; 00777 }; 00778 00779 typedef struct l2arc_data_free { 00780 /* protected by l2arc_free_on_write_mtx */ 00781 void *l2df_data; 00782 size_t l2df_size; 00783 void (*l2df_func)(void *, size_t); 00784 list_node_t l2df_list_node; 00785 } l2arc_data_free_t; 00786 00787 static kmutex_t l2arc_feed_thr_lock; 00788 static kcondvar_t l2arc_feed_thr_cv; 00789 static uint8_t l2arc_thread_exit; 00790 00791 static void l2arc_read_done(zio_t *zio); 00792 static void l2arc_hdr_stat_add(void); 00793 static void l2arc_hdr_stat_remove(void); 00794 00795 static uint64_t 00796 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 00797 { 00798 uint8_t *vdva = (uint8_t *)dva; 00799 uint64_t crc = -1ULL; 00800 int i; 00801 00802 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 00803 00804 for (i = 0; i < sizeof (dva_t); i++) 00805 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 00806 00807 crc ^= (spa>>8) ^ birth; 00808 00809 return (crc); 00810 } 00811 00812 #define BUF_EMPTY(buf) \ 00813 ((buf)->b_dva.dva_word[0] == 0 && \ 00814 (buf)->b_dva.dva_word[1] == 0 && \ 00815 (buf)->b_birth == 0) 00816 00817 #define BUF_EQUAL(spa, dva, birth, buf) \ 00818 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 00819 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 00820 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 00821 00822 static void 00823 buf_discard_identity(arc_buf_hdr_t *hdr) 00824 { 00825 hdr->b_dva.dva_word[0] = 0; 00826 hdr->b_dva.dva_word[1] = 0; 00827 hdr->b_birth = 0; 00828 hdr->b_cksum0 = 0; 00829 } 00830 00831 static arc_buf_hdr_t * 00832 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) 00833 { 00834 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 00835 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 00836 arc_buf_hdr_t *buf; 00837 00838 mutex_enter(hash_lock); 00839 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 00840 buf = buf->b_hash_next) { 00841 if (BUF_EQUAL(spa, dva, birth, buf)) { 00842 *lockp = hash_lock; 00843 return (buf); 00844 } 00845 } 00846 mutex_exit(hash_lock); 00847 *lockp = NULL; 00848 return (NULL); 00849 } 00850 00857 static arc_buf_hdr_t * 00858 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 00859 { 00860 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 00861 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 00862 arc_buf_hdr_t *fbuf; 00863 uint32_t i; 00864 00865 ASSERT(!HDR_IN_HASH_TABLE(buf)); 00866 *lockp = hash_lock; 00867 mutex_enter(hash_lock); 00868 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 00869 fbuf = fbuf->b_hash_next, i++) { 00870 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 00871 return (fbuf); 00872 } 00873 00874 buf->b_hash_next = buf_hash_table.ht_table[idx]; 00875 buf_hash_table.ht_table[idx] = buf; 00876 buf->b_flags |= ARC_IN_HASH_TABLE; 00877 00878 /* collect some hash table performance data */ 00879 if (i > 0) { 00880 
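		/* i counted the headers already on this chain, so i > 0 means a collision */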
ARCSTAT_BUMP(arcstat_hash_collisions); 00881 if (i == 1) 00882 ARCSTAT_BUMP(arcstat_hash_chains); 00883 00884 ARCSTAT_MAX(arcstat_hash_chain_max, i); 00885 } 00886 00887 ARCSTAT_BUMP(arcstat_hash_elements); 00888 ARCSTAT_MAXSTAT(arcstat_hash_elements); 00889 00890 return (NULL); 00891 } 00892 00893 static void 00894 buf_hash_remove(arc_buf_hdr_t *buf) 00895 { 00896 arc_buf_hdr_t *fbuf, **bufp; 00897 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 00898 00899 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 00900 ASSERT(HDR_IN_HASH_TABLE(buf)); 00901 00902 bufp = &buf_hash_table.ht_table[idx]; 00903 while ((fbuf = *bufp) != buf) { 00904 ASSERT(fbuf != NULL); 00905 bufp = &fbuf->b_hash_next; 00906 } 00907 *bufp = buf->b_hash_next; 00908 buf->b_hash_next = NULL; 00909 buf->b_flags &= ~ARC_IN_HASH_TABLE; 00910 00911 /* collect some hash table performance data */ 00912 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 00913 00914 if (buf_hash_table.ht_table[idx] && 00915 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 00916 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 00917 } 00918 00919 /* 00920 * Global data structures and functions for the buf kmem cache. 00921 */ 00922 static kmem_cache_t *hdr_cache; 00923 static kmem_cache_t *buf_cache; 00924 00925 static void 00926 buf_fini(void) 00927 { 00928 int i; 00929 00930 kmem_free(buf_hash_table.ht_table, 00931 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 00932 for (i = 0; i < BUF_LOCKS; i++) 00933 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 00934 kmem_cache_destroy(hdr_cache); 00935 kmem_cache_destroy(buf_cache); 00936 } 00937 00942 /* ARGSUSED */ 00943 static int 00944 hdr_cons(void *vbuf, void *unused, int kmflag) 00945 { 00946 arc_buf_hdr_t *buf = vbuf; 00947 00948 bzero(buf, sizeof (arc_buf_hdr_t)); 00949 refcount_create(&buf->b_refcnt); 00950 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 00951 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 00952 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 00953 00954 return (0); 00955 } 00956 00957 /* ARGSUSED */ 00958 static int 00959 buf_cons(void *vbuf, void *unused, int kmflag) 00960 { 00961 arc_buf_t *buf = vbuf; 00962 00963 bzero(buf, sizeof (arc_buf_t)); 00964 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 00965 rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); 00966 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 00967 00968 return (0); 00969 } 00970 00975 /* ARGSUSED */ 00976 static void 00977 hdr_dest(void *vbuf, void *unused) 00978 { 00979 arc_buf_hdr_t *buf = vbuf; 00980 00981 ASSERT(BUF_EMPTY(buf)); 00982 refcount_destroy(&buf->b_refcnt); 00983 cv_destroy(&buf->b_cv); 00984 mutex_destroy(&buf->b_freeze_lock); 00985 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 00986 } 00987 00988 /* ARGSUSED */ 00989 static void 00990 buf_dest(void *vbuf, void *unused) 00991 { 00992 arc_buf_t *buf = vbuf; 00993 00994 mutex_destroy(&buf->b_evict_lock); 00995 rw_destroy(&buf->b_data_lock); 00996 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 00997 } 00998 01002 /* ARGSUSED */ 01003 static void 01004 hdr_recl(void *unused) 01005 { 01006 dprintf("hdr_recl called\n"); 01007 /* 01008 * umem calls the reclaim func when we destroy the buf cache, 01009 * which is after we do arc_fini(). 
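 * Hence the arc_dead check below: by then there is no reclaim thread left
 * to signal.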
01010 */ 01011 if (!arc_dead) 01012 cv_signal(&arc_reclaim_thr_cv); 01013 } 01014 01015 static void 01016 buf_init(void) 01017 { 01018 uint64_t *ct; 01019 uint64_t hsize = 1ULL << 12; 01020 int i, j; 01021 01022 /* 01023 * The hash table is big enough to fill all of physical memory 01024 * with an average 64K block size. The table will take up 01025 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 01026 */ 01027 while (hsize * 65536 < (uint64_t)physmem * PAGESIZE) 01028 hsize <<= 1; 01029 retry: 01030 buf_hash_table.ht_mask = hsize - 1; 01031 buf_hash_table.ht_table = 01032 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 01033 if (buf_hash_table.ht_table == NULL) { 01034 ASSERT(hsize > (1ULL << 8)); 01035 hsize >>= 1; 01036 goto retry; 01037 } 01038 01039 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 01040 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 01041 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 01042 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 01043 01044 for (i = 0; i < 256; i++) 01045 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 01046 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 01047 01048 for (i = 0; i < BUF_LOCKS; i++) { 01049 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 01050 NULL, MUTEX_DEFAULT, NULL); 01051 } 01052 } 01053 01054 #define ARC_MINTIME (hz>>4) /* 62 ms */ 01055 01056 static void 01057 arc_cksum_verify(arc_buf_t *buf) 01058 { 01059 zio_cksum_t zc; 01060 01061 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 01062 return; 01063 01064 mutex_enter(&buf->b_hdr->b_freeze_lock); 01065 if (buf->b_hdr->b_freeze_cksum == NULL || 01066 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 01067 mutex_exit(&buf->b_hdr->b_freeze_lock); 01068 return; 01069 } 01070 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 01071 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 01072 panic("buffer modified while frozen!"); 01073 mutex_exit(&buf->b_hdr->b_freeze_lock); 01074 } 01075 01076 static int 01077 arc_cksum_equal(arc_buf_t *buf) 01078 { 01079 zio_cksum_t zc; 01080 int equal; 01081 01082 mutex_enter(&buf->b_hdr->b_freeze_lock); 01083 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 01084 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 01085 mutex_exit(&buf->b_hdr->b_freeze_lock); 01086 01087 return (equal); 01088 } 01089 01090 static void 01091 arc_cksum_compute(arc_buf_t *buf, boolean_t force) 01092 { 01093 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 01094 return; 01095 01096 mutex_enter(&buf->b_hdr->b_freeze_lock); 01097 if (buf->b_hdr->b_freeze_cksum != NULL) { 01098 mutex_exit(&buf->b_hdr->b_freeze_lock); 01099 return; 01100 } 01101 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 01102 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 01103 buf->b_hdr->b_freeze_cksum); 01104 mutex_exit(&buf->b_hdr->b_freeze_lock); 01105 #ifdef illumos 01106 arc_buf_watch(buf); 01107 #endif /* illumos */ 01108 } 01109 01110 #ifdef illumos 01111 #ifndef _KERNEL 01112 typedef struct procctl { 01113 long cmd; 01114 prwatch_t prwatch; 01115 } procctl_t; 01116 #endif 01117 01118 /* ARGSUSED */ 01119 static void 01120 arc_buf_unwatch(arc_buf_t *buf) 01121 { 01122 #ifndef _KERNEL 01123 if (arc_watch) { 01124 int result; 01125 procctl_t ctl; 01126 ctl.cmd = PCWATCH; 01127 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 01128 ctl.prwatch.pr_size = 0; 01129 ctl.prwatch.pr_wflags = 0; 01130 result = write(arc_procfd, &ctl, sizeof (ctl)); 01131 ASSERT3U(result, ==, sizeof (ctl)); 
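		/* a zero-length, zero-flag PCWATCH request clears the watchpoint set by arc_buf_watch() */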
01132 } 01133 #endif 01134 } 01135 01136 /* ARGSUSED */ 01137 static void 01138 arc_buf_watch(arc_buf_t *buf) 01139 { 01140 #ifndef _KERNEL 01141 if (arc_watch) { 01142 int result; 01143 procctl_t ctl; 01144 ctl.cmd = PCWATCH; 01145 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 01146 ctl.prwatch.pr_size = buf->b_hdr->b_size; 01147 ctl.prwatch.pr_wflags = WA_WRITE; 01148 result = write(arc_procfd, &ctl, sizeof (ctl)); 01149 ASSERT3U(result, ==, sizeof (ctl)); 01150 } 01151 #endif 01152 } 01153 #endif /* illumos */ 01154 01155 void 01156 arc_buf_thaw(arc_buf_t *buf) 01157 { 01158 if (zfs_flags & ZFS_DEBUG_MODIFY) { 01159 if (buf->b_hdr->b_state != arc_anon) 01160 panic("modifying non-anon buffer!"); 01161 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 01162 panic("modifying buffer while i/o in progress!"); 01163 arc_cksum_verify(buf); 01164 } 01165 01166 mutex_enter(&buf->b_hdr->b_freeze_lock); 01167 if (buf->b_hdr->b_freeze_cksum != NULL) { 01168 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 01169 buf->b_hdr->b_freeze_cksum = NULL; 01170 } 01171 01172 if (zfs_flags & ZFS_DEBUG_MODIFY) { 01173 if (buf->b_hdr->b_thawed) 01174 kmem_free(buf->b_hdr->b_thawed, 1); 01175 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); 01176 } 01177 01178 mutex_exit(&buf->b_hdr->b_freeze_lock); 01179 01180 #ifdef illumos 01181 arc_buf_unwatch(buf); 01182 #endif /* illumos */ 01183 } 01184 01185 void 01186 arc_buf_freeze(arc_buf_t *buf) 01187 { 01188 kmutex_t *hash_lock; 01189 01190 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 01191 return; 01192 01193 hash_lock = HDR_LOCK(buf->b_hdr); 01194 mutex_enter(hash_lock); 01195 01196 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 01197 buf->b_hdr->b_state == arc_anon); 01198 arc_cksum_compute(buf, B_FALSE); 01199 mutex_exit(hash_lock); 01200 01201 } 01202 01203 static void 01204 get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock) 01205 { 01206 uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth); 01207 01208 if (ab->b_type == ARC_BUFC_METADATA) 01209 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 01210 else { 01211 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 01212 buf_hashid += ARC_BUFC_NUMMETADATALISTS; 01213 } 01214 01215 *list = &state->arcs_lists[buf_hashid]; 01216 *lock = ARCS_LOCK(state, buf_hashid); 01217 } 01218 01219 01220 static void 01221 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 01222 { 01223 ASSERT(MUTEX_HELD(hash_lock)); 01224 01225 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 01226 (ab->b_state != arc_anon)) { 01227 uint64_t delta = ab->b_size * ab->b_datacnt; 01228 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 01229 list_t *list; 01230 kmutex_t *lock; 01231 01232 get_buf_info(ab, ab->b_state, &list, &lock); 01233 ASSERT(!MUTEX_HELD(lock)); 01234 mutex_enter(lock); 01235 ASSERT(list_link_active(&ab->b_arc_node)); 01236 list_remove(list, ab); 01237 if (GHOST_STATE(ab->b_state)) { 01238 ASSERT0(ab->b_datacnt); 01239 ASSERT3P(ab->b_buf, ==, NULL); 01240 delta = ab->b_size; 01241 } 01242 ASSERT(delta > 0); 01243 ASSERT3U(*size, >=, delta); 01244 atomic_add_64(size, -delta); 01245 mutex_exit(lock); 01246 /* remove the prefetch flag if we get a reference */ 01247 if (ab->b_flags & ARC_PREFETCH) 01248 ab->b_flags &= ~ARC_PREFETCH; 01249 } 01250 } 01251 01252 static int 01253 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 01254 { 01255 int cnt; 01256 arc_state_t *state = ab->b_state; 01257 01258 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 01259 
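	/* anonymous buffers are not hashed and sit on no list, so they need no hash lock */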
ASSERT(!GHOST_STATE(state)); 01260 01261 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 01262 (state != arc_anon)) { 01263 uint64_t *size = &state->arcs_lsize[ab->b_type]; 01264 list_t *list; 01265 kmutex_t *lock; 01266 01267 get_buf_info(ab, state, &list, &lock); 01268 ASSERT(!MUTEX_HELD(lock)); 01269 mutex_enter(lock); 01270 ASSERT(!list_link_active(&ab->b_arc_node)); 01271 list_insert_head(list, ab); 01272 ASSERT(ab->b_datacnt > 0); 01273 atomic_add_64(size, ab->b_size * ab->b_datacnt); 01274 mutex_exit(lock); 01275 } 01276 return (cnt); 01277 } 01278 01283 static void 01284 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 01285 { 01286 arc_state_t *old_state = ab->b_state; 01287 int64_t refcnt = refcount_count(&ab->b_refcnt); 01288 uint64_t from_delta, to_delta; 01289 list_t *list; 01290 kmutex_t *lock; 01291 01292 ASSERT(MUTEX_HELD(hash_lock)); 01293 ASSERT(new_state != old_state); 01294 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 01295 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 01296 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); 01297 01298 from_delta = to_delta = ab->b_datacnt * ab->b_size; 01299 01300 /* 01301 * If this buffer is evictable, transfer it from the 01302 * old state list to the new state list. 01303 */ 01304 if (refcnt == 0) { 01305 if (old_state != arc_anon) { 01306 int use_mutex; 01307 uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 01308 01309 get_buf_info(ab, old_state, &list, &lock); 01310 use_mutex = !MUTEX_HELD(lock); 01311 if (use_mutex) 01312 mutex_enter(lock); 01313 01314 ASSERT(list_link_active(&ab->b_arc_node)); 01315 list_remove(list, ab); 01316 01317 /* 01318 * If prefetching out of the ghost cache, 01319 * we will have a non-zero datacnt. 01320 */ 01321 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 01322 /* ghost elements have a ghost size */ 01323 ASSERT(ab->b_buf == NULL); 01324 from_delta = ab->b_size; 01325 } 01326 ASSERT3U(*size, >=, from_delta); 01327 atomic_add_64(size, -from_delta); 01328 01329 if (use_mutex) 01330 mutex_exit(lock); 01331 } 01332 if (new_state != arc_anon) { 01333 int use_mutex; 01334 uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 01335 01336 get_buf_info(ab, new_state, &list, &lock); 01337 use_mutex = !MUTEX_HELD(lock); 01338 if (use_mutex) 01339 mutex_enter(lock); 01340 01341 list_insert_head(list, ab); 01342 01343 /* ghost elements have a ghost size */ 01344 if (GHOST_STATE(new_state)) { 01345 ASSERT(ab->b_datacnt == 0); 01346 ASSERT(ab->b_buf == NULL); 01347 to_delta = ab->b_size; 01348 } 01349 atomic_add_64(size, to_delta); 01350 01351 if (use_mutex) 01352 mutex_exit(lock); 01353 } 01354 } 01355 01356 ASSERT(!BUF_EMPTY(ab)); 01357 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) 01358 buf_hash_remove(ab); 01359 01360 /* adjust state sizes */ 01361 if (to_delta) 01362 atomic_add_64(&new_state->arcs_size, to_delta); 01363 if (from_delta) { 01364 ASSERT3U(old_state->arcs_size, >=, from_delta); 01365 atomic_add_64(&old_state->arcs_size, -from_delta); 01366 } 01367 ab->b_state = new_state; 01368 01369 /* adjust l2arc hdr stats */ 01370 if (new_state == arc_l2c_only) 01371 l2arc_hdr_stat_add(); 01372 else if (old_state == arc_l2c_only) 01373 l2arc_hdr_stat_remove(); 01374 } 01375 01376 void 01377 arc_space_consume(uint64_t space, arc_space_type_t type) 01378 { 01379 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 01380 01381 switch (type) { 01382 case ARC_SPACE_DATA: 01383 ARCSTAT_INCR(arcstat_data_size, space); 01384 break; 01385 case ARC_SPACE_OTHER: 
01386 ARCSTAT_INCR(arcstat_other_size, space); 01387 break; 01388 case ARC_SPACE_HDRS: 01389 ARCSTAT_INCR(arcstat_hdr_size, space); 01390 break; 01391 case ARC_SPACE_L2HDRS: 01392 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 01393 break; 01394 } 01395 01396 atomic_add_64(&arc_meta_used, space); 01397 atomic_add_64(&arc_size, space); 01398 } 01399 01400 void 01401 arc_space_return(uint64_t space, arc_space_type_t type) 01402 { 01403 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 01404 01405 switch (type) { 01406 case ARC_SPACE_DATA: 01407 ARCSTAT_INCR(arcstat_data_size, -space); 01408 break; 01409 case ARC_SPACE_OTHER: 01410 ARCSTAT_INCR(arcstat_other_size, -space); 01411 break; 01412 case ARC_SPACE_HDRS: 01413 ARCSTAT_INCR(arcstat_hdr_size, -space); 01414 break; 01415 case ARC_SPACE_L2HDRS: 01416 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 01417 break; 01418 } 01419 01420 ASSERT(arc_meta_used >= space); 01421 if (arc_meta_max < arc_meta_used) 01422 arc_meta_max = arc_meta_used; 01423 atomic_add_64(&arc_meta_used, -space); 01424 ASSERT(arc_size >= space); 01425 atomic_add_64(&arc_size, -space); 01426 } 01427 01428 void * 01429 arc_data_buf_alloc(uint64_t size) 01430 { 01431 if (arc_evict_needed(ARC_BUFC_DATA)) 01432 cv_signal(&arc_reclaim_thr_cv); 01433 atomic_add_64(&arc_size, size); 01434 return (zio_data_buf_alloc(size)); 01435 } 01436 01437 void 01438 arc_data_buf_free(void *buf, uint64_t size) 01439 { 01440 zio_data_buf_free(buf, size); 01441 ASSERT(arc_size >= size); 01442 atomic_add_64(&arc_size, -size); 01443 } 01444 01445 arc_buf_t * 01446 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 01447 { 01448 arc_buf_hdr_t *hdr; 01449 arc_buf_t *buf; 01450 01451 ASSERT3U(size, >, 0); 01452 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 01453 ASSERT(BUF_EMPTY(hdr)); 01454 hdr->b_size = size; 01455 hdr->b_type = type; 01456 hdr->b_spa = spa_load_guid(spa); 01457 hdr->b_state = arc_anon; 01458 hdr->b_arc_access = 0; 01459 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 01460 buf->b_hdr = hdr; 01461 buf->b_data = NULL; 01462 buf->b_efunc = NULL; 01463 buf->b_private = NULL; 01464 buf->b_next = NULL; 01465 hdr->b_buf = buf; 01466 arc_get_data_buf(buf); 01467 hdr->b_datacnt = 1; 01468 hdr->b_flags = 0; 01469 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 01470 (void) refcount_add(&hdr->b_refcnt, tag); 01471 01472 return (buf); 01473 } 01474 01475 static char *arc_onloan_tag = "onloan"; 01476 01483 arc_buf_t * 01484 arc_loan_buf(spa_t *spa, int size) 01485 { 01486 arc_buf_t *buf; 01487 01488 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 01489 01490 atomic_add_64(&arc_loaned_bytes, size); 01491 return (buf); 01492 } 01493 01497 void 01498 arc_return_buf(arc_buf_t *buf, void *tag) 01499 { 01500 arc_buf_hdr_t *hdr = buf->b_hdr; 01501 01502 ASSERT(buf->b_data != NULL); 01503 (void) refcount_add(&hdr->b_refcnt, tag); 01504 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); 01505 01506 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 01507 } 01508 01512 void 01513 arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 01514 { 01515 arc_buf_hdr_t *hdr; 01516 01517 ASSERT(buf->b_data != NULL); 01518 hdr = buf->b_hdr; 01519 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); 01520 (void) refcount_remove(&hdr->b_refcnt, tag); 01521 buf->b_efunc = NULL; 01522 buf->b_private = NULL; 01523 01524 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 01525 } 01526 01527 static arc_buf_t * 01528 arc_buf_clone(arc_buf_t *from) 01529 { 01530 arc_buf_t *buf; 01531 arc_buf_hdr_t *hdr = 
from->b_hdr; 01532 uint64_t size = hdr->b_size; 01533 01534 ASSERT(hdr->b_state != arc_anon); 01535 01536 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 01537 buf->b_hdr = hdr; 01538 buf->b_data = NULL; 01539 buf->b_efunc = NULL; 01540 buf->b_private = NULL; 01541 buf->b_next = hdr->b_buf; 01542 hdr->b_buf = buf; 01543 arc_get_data_buf(buf); 01544 bcopy(from->b_data, buf->b_data, size); 01545 01546 /* 01547 * This buffer already exists in the arc so create a duplicate 01548 * copy for the caller. If the buffer is associated with user data 01549 * then track the size and number of duplicates. These stats will be 01550 * updated as duplicate buffers are created and destroyed. 01551 */ 01552 if (hdr->b_type == ARC_BUFC_DATA) { 01553 ARCSTAT_BUMP(arcstat_duplicate_buffers); 01554 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 01555 } 01556 hdr->b_datacnt += 1; 01557 return (buf); 01558 } 01559 01560 void 01561 arc_buf_add_ref(arc_buf_t *buf, void* tag) 01562 { 01563 arc_buf_hdr_t *hdr; 01564 kmutex_t *hash_lock; 01565 01566 /* 01567 * Check to see if this buffer is evicted. Callers 01568 * must verify b_data != NULL to know if the add_ref 01569 * was successful. 01570 */ 01571 mutex_enter(&buf->b_evict_lock); 01572 if (buf->b_data == NULL) { 01573 mutex_exit(&buf->b_evict_lock); 01574 return; 01575 } 01576 hash_lock = HDR_LOCK(buf->b_hdr); 01577 mutex_enter(hash_lock); 01578 hdr = buf->b_hdr; 01579 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 01580 mutex_exit(&buf->b_evict_lock); 01581 01582 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 01583 add_reference(hdr, hash_lock, tag); 01584 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 01585 arc_access(hdr, hash_lock); 01586 mutex_exit(hash_lock); 01587 ARCSTAT_BUMP(arcstat_hits); 01588 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 01589 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 01590 data, metadata, hits); 01591 } 01592 01597 static void 01598 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 01599 { 01600 arc_buf_hdr_t *hdr = buf->b_hdr; 01601 01602 if (HDR_L2_WRITING(hdr)) { 01603 l2arc_data_free_t *df; 01604 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 01605 df->l2df_data = buf->b_data; 01606 df->l2df_size = hdr->b_size; 01607 df->l2df_func = free_func; 01608 mutex_enter(&l2arc_free_on_write_mtx); 01609 list_insert_head(l2arc_free_on_write, df); 01610 mutex_exit(&l2arc_free_on_write_mtx); 01611 ARCSTAT_BUMP(arcstat_l2_free_on_write); 01612 } else { 01613 free_func(buf->b_data, hdr->b_size); 01614 } 01615 } 01616 01617 static void 01618 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 01619 { 01620 arc_buf_t **bufp; 01621 01622 /* free up data associated with the buf */ 01623 if (buf->b_data) { 01624 arc_state_t *state = buf->b_hdr->b_state; 01625 uint64_t size = buf->b_hdr->b_size; 01626 arc_buf_contents_t type = buf->b_hdr->b_type; 01627 01628 arc_cksum_verify(buf); 01629 #ifdef illumos 01630 arc_buf_unwatch(buf); 01631 #endif /* illumos */ 01632 01633 if (!recycle) { 01634 if (type == ARC_BUFC_METADATA) { 01635 arc_buf_data_free(buf, zio_buf_free); 01636 arc_space_return(size, ARC_SPACE_DATA); 01637 } else { 01638 ASSERT(type == ARC_BUFC_DATA); 01639 arc_buf_data_free(buf, zio_data_buf_free); 01640 ARCSTAT_INCR(arcstat_data_size, -size); 01641 atomic_add_64(&arc_size, -size); 01642 } 01643 } 01644 if (list_link_active(&buf->b_hdr->b_arc_node)) { 01645 uint64_t *cnt = &state->arcs_lsize[type]; 01646 01647 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 01648 ASSERT(state 
!= arc_anon); 01649 01650 ASSERT3U(*cnt, >=, size); 01651 atomic_add_64(cnt, -size); 01652 } 01653 ASSERT3U(state->arcs_size, >=, size); 01654 atomic_add_64(&state->arcs_size, -size); 01655 buf->b_data = NULL; 01656 01657 /* 01658 * If we're destroying a duplicate buffer make sure 01659 * that the appropriate statistics are updated. 01660 */ 01661 if (buf->b_hdr->b_datacnt > 1 && 01662 buf->b_hdr->b_type == ARC_BUFC_DATA) { 01663 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 01664 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 01665 } 01666 ASSERT(buf->b_hdr->b_datacnt > 0); 01667 buf->b_hdr->b_datacnt -= 1; 01668 } 01669 01670 /* only remove the buf if requested */ 01671 if (!all) 01672 return; 01673 01674 /* remove the buf from the hdr list */ 01675 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 01676 continue; 01677 *bufp = buf->b_next; 01678 buf->b_next = NULL; 01679 01680 ASSERT(buf->b_efunc == NULL); 01681 01682 /* clean up the buf */ 01683 buf->b_hdr = NULL; 01684 kmem_cache_free(buf_cache, buf); 01685 } 01686 01687 static void 01688 arc_hdr_destroy(arc_buf_hdr_t *hdr) 01689 { 01690 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 01691 ASSERT3P(hdr->b_state, ==, arc_anon); 01692 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 01693 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; 01694 01695 if (l2hdr != NULL) { 01696 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 01697 /* 01698 * To prevent arc_free() and l2arc_evict() from 01699 * attempting to free the same buffer at the same time, 01700 * a FREE_IN_PROGRESS flag is given to arc_free() to 01701 * give it priority. l2arc_evict() can't destroy this 01702 * header while we are waiting on l2arc_buflist_mtx. 01703 * 01704 * The hdr may be removed from l2ad_buflist before we 01705 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
01706 */ 01707 if (!buflist_held) { 01708 mutex_enter(&l2arc_buflist_mtx); 01709 l2hdr = hdr->b_l2hdr; 01710 } 01711 01712 if (l2hdr != NULL) { 01713 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 01714 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 01715 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 01716 if (hdr->b_state == arc_l2c_only) 01717 l2arc_hdr_stat_remove(); 01718 hdr->b_l2hdr = NULL; 01719 } 01720 01721 if (!buflist_held) 01722 mutex_exit(&l2arc_buflist_mtx); 01723 } 01724 01725 if (!BUF_EMPTY(hdr)) { 01726 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 01727 buf_discard_identity(hdr); 01728 } 01729 while (hdr->b_buf) { 01730 arc_buf_t *buf = hdr->b_buf; 01731 01732 if (buf->b_efunc) { 01733 mutex_enter(&arc_eviction_mtx); 01734 mutex_enter(&buf->b_evict_lock); 01735 ASSERT(buf->b_hdr != NULL); 01736 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 01737 hdr->b_buf = buf->b_next; 01738 buf->b_hdr = &arc_eviction_hdr; 01739 buf->b_next = arc_eviction_list; 01740 arc_eviction_list = buf; 01741 mutex_exit(&buf->b_evict_lock); 01742 mutex_exit(&arc_eviction_mtx); 01743 } else { 01744 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 01745 } 01746 } 01747 if (hdr->b_freeze_cksum != NULL) { 01748 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 01749 hdr->b_freeze_cksum = NULL; 01750 } 01751 if (hdr->b_thawed) { 01752 kmem_free(hdr->b_thawed, 1); 01753 hdr->b_thawed = NULL; 01754 } 01755 01756 ASSERT(!list_link_active(&hdr->b_arc_node)); 01757 ASSERT3P(hdr->b_hash_next, ==, NULL); 01758 ASSERT3P(hdr->b_acb, ==, NULL); 01759 kmem_cache_free(hdr_cache, hdr); 01760 } 01761 01762 void 01763 arc_buf_free(arc_buf_t *buf, void *tag) 01764 { 01765 arc_buf_hdr_t *hdr = buf->b_hdr; 01766 int hashed = hdr->b_state != arc_anon; 01767 01768 ASSERT(buf->b_efunc == NULL); 01769 ASSERT(buf->b_data != NULL); 01770 01771 if (hashed) { 01772 kmutex_t *hash_lock = HDR_LOCK(hdr); 01773 01774 mutex_enter(hash_lock); 01775 hdr = buf->b_hdr; 01776 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 01777 01778 (void) remove_reference(hdr, hash_lock, tag); 01779 if (hdr->b_datacnt > 1) { 01780 arc_buf_destroy(buf, FALSE, TRUE); 01781 } else { 01782 ASSERT(buf == hdr->b_buf); 01783 ASSERT(buf->b_efunc == NULL); 01784 hdr->b_flags |= ARC_BUF_AVAILABLE; 01785 } 01786 mutex_exit(hash_lock); 01787 } else if (HDR_IO_IN_PROGRESS(hdr)) { 01788 int destroy_hdr; 01789 /* 01790 * We are in the middle of an async write. Don't destroy 01791 * this buffer unless the write completes before we finish 01792 * decrementing the reference count. 
01793 */ 01794 mutex_enter(&arc_eviction_mtx); 01795 (void) remove_reference(hdr, NULL, tag); 01796 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 01797 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 01798 mutex_exit(&arc_eviction_mtx); 01799 if (destroy_hdr) 01800 arc_hdr_destroy(hdr); 01801 } else { 01802 if (remove_reference(hdr, NULL, tag) > 0) 01803 arc_buf_destroy(buf, FALSE, TRUE); 01804 else 01805 arc_hdr_destroy(hdr); 01806 } 01807 } 01808 01809 int 01810 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 01811 { 01812 arc_buf_hdr_t *hdr = buf->b_hdr; 01813 kmutex_t *hash_lock = HDR_LOCK(hdr); 01814 int no_callback = (buf->b_efunc == NULL); 01815 01816 if (hdr->b_state == arc_anon) { 01817 ASSERT(hdr->b_datacnt == 1); 01818 arc_buf_free(buf, tag); 01819 return (no_callback); 01820 } 01821 01822 mutex_enter(hash_lock); 01823 hdr = buf->b_hdr; 01824 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 01825 ASSERT(hdr->b_state != arc_anon); 01826 ASSERT(buf->b_data != NULL); 01827 01828 (void) remove_reference(hdr, hash_lock, tag); 01829 if (hdr->b_datacnt > 1) { 01830 if (no_callback) 01831 arc_buf_destroy(buf, FALSE, TRUE); 01832 } else if (no_callback) { 01833 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 01834 ASSERT(buf->b_efunc == NULL); 01835 hdr->b_flags |= ARC_BUF_AVAILABLE; 01836 } 01837 ASSERT(no_callback || hdr->b_datacnt > 1 || 01838 refcount_is_zero(&hdr->b_refcnt)); 01839 mutex_exit(hash_lock); 01840 return (no_callback); 01841 } 01842 01843 int 01844 arc_buf_size(arc_buf_t *buf) 01845 { 01846 return (buf->b_hdr->b_size); 01847 } 01848 01855 boolean_t 01856 arc_buf_eviction_needed(arc_buf_t *buf) 01857 { 01858 arc_buf_hdr_t *hdr; 01859 boolean_t evict_needed = B_FALSE; 01860 01861 if (zfs_disable_dup_eviction) 01862 return (B_FALSE); 01863 01864 mutex_enter(&buf->b_evict_lock); 01865 hdr = buf->b_hdr; 01866 if (hdr == NULL) { 01867 /* 01868 * We are in arc_do_user_evicts(); let that function 01869 * perform the eviction. 01870 */ 01871 ASSERT(buf->b_data == NULL); 01872 mutex_exit(&buf->b_evict_lock); 01873 return (B_FALSE); 01874 } else if (buf->b_data == NULL) { 01875 /* 01876 * We have already been added to the arc eviction list; 01877 * recommend eviction. 01878 */ 01879 ASSERT3P(hdr, ==, &arc_eviction_hdr); 01880 mutex_exit(&buf->b_evict_lock); 01881 return (B_TRUE); 01882 } 01883 01884 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) 01885 evict_needed = B_TRUE; 01886 01887 mutex_exit(&buf->b_evict_lock); 01888 return (evict_needed); 01889 } 01890 01904 static void * 01905 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 01906 arc_buf_contents_t type) 01907 { 01908 arc_state_t *evicted_state; 01909 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 01910 int64_t bytes_remaining; 01911 arc_buf_hdr_t *ab, *ab_prev = NULL; 01912 list_t *evicted_list, *list, *evicted_list_start, *list_start; 01913 kmutex_t *lock, *evicted_lock; 01914 kmutex_t *hash_lock; 01915 boolean_t have_lock; 01916 void *stolen = NULL; 01917 static int evict_metadata_offset, evict_data_offset; 01918 int i, idx, offset, list_count, count; 01919 01920 ASSERT(state == arc_mru || state == arc_mfu); 01921 01922 evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 01923 01924 if (type == ARC_BUFC_METADATA) { 01925 offset = 0; 01926 list_count = ARC_BUFC_NUMMETADATALISTS; 01927 list_start = &state->arcs_lists[0]; 01928 evicted_list_start = &evicted_state->arcs_lists[0]; 01929 idx = evict_metadata_offset; 01930 } else { 01931 offset = ARC_BUFC_NUMMETADATALISTS; 01932 list_start = &state->arcs_lists[offset]; 01933 evicted_list_start = &evicted_state->arcs_lists[offset]; 01934 list_count = ARC_BUFC_NUMDATALISTS; 01935 idx = evict_data_offset; 01936 } 01937 bytes_remaining = evicted_state->arcs_lsize[type]; 01938 count = 0; 01939 01940 evict_start: 01941 list = &list_start[idx]; 01942 evicted_list = &evicted_list_start[idx]; 01943 lock = ARCS_LOCK(state, (offset + idx)); 01944 evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 01945 01946 mutex_enter(lock); 01947 mutex_enter(evicted_lock); 01948 01949 for (ab = list_tail(list); ab; ab = ab_prev) { 01950 ab_prev = list_prev(list, ab); 01951 bytes_remaining -= (ab->b_size * ab->b_datacnt); 01952 /* prefetch buffers have a minimum lifespan */ 01953 if (HDR_IO_IN_PROGRESS(ab) || 01954 (spa && ab->b_spa != spa) || 01955 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 01956 ddi_get_lbolt() - ab->b_arc_access < 01957 arc_min_prefetch_lifespan)) { 01958 skipped++; 01959 continue; 01960 } 01961 /* "lookahead" for better eviction candidate */ 01962 if (recycle && ab->b_size != bytes && 01963 ab_prev && ab_prev->b_size == bytes) 01964 continue; 01965 hash_lock = HDR_LOCK(ab); 01966 have_lock = MUTEX_HELD(hash_lock); 01967 if (have_lock || mutex_tryenter(hash_lock)) { 01968 ASSERT0(refcount_count(&ab->b_refcnt)); 01969 ASSERT(ab->b_datacnt > 0); 01970 while (ab->b_buf) { 01971 arc_buf_t *buf = ab->b_buf; 01972 if (!mutex_tryenter(&buf->b_evict_lock)) { 01973 missed += 1; 01974 break; 01975 } 01976 if (buf->b_data) { 01977 bytes_evicted += ab->b_size; 01978 if (recycle && ab->b_type == type && 01979 ab->b_size == bytes && 01980 !HDR_L2_WRITING(ab)) { 01981 stolen = buf->b_data; 01982 recycle = FALSE; 01983 } 01984 } 01985 if (buf->b_efunc) { 01986 mutex_enter(&arc_eviction_mtx); 01987 arc_buf_destroy(buf, 01988 buf->b_data == stolen, FALSE); 01989 ab->b_buf = buf->b_next; 01990 buf->b_hdr = &arc_eviction_hdr; 01991 buf->b_next = arc_eviction_list; 01992 arc_eviction_list = buf; 01993 mutex_exit(&arc_eviction_mtx); 01994 mutex_exit(&buf->b_evict_lock); 01995 } else { 01996 mutex_exit(&buf->b_evict_lock); 01997 arc_buf_destroy(buf, 01998 buf->b_data == stolen, TRUE); 01999 } 02000 } 02001 02002 if (ab->b_l2hdr) { 02003 ARCSTAT_INCR(arcstat_evict_l2_cached, 02004 ab->b_size); 02005 } else { 02006 if (l2arc_write_eligible(ab->b_spa, ab)) { 02007 ARCSTAT_INCR(arcstat_evict_l2_eligible, 02008 ab->b_size); 02009 } else { 02010 ARCSTAT_INCR( 02011 arcstat_evict_l2_ineligible, 02012 ab->b_size); 02013 } 02014 } 02015 02016 if (ab->b_datacnt == 0) { 02017 arc_change_state(evicted_state, ab, hash_lock); 02018 ASSERT(HDR_IN_HASH_TABLE(ab)); 02019 ab->b_flags |= ARC_IN_HASH_TABLE; 02020 ab->b_flags &= ~ARC_BUF_AVAILABLE; 02021 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 02022 } 02023 if (!have_lock) 02024 mutex_exit(hash_lock); 02025 if (bytes >= 0 && bytes_evicted >= bytes) 02026 break; 02027 if (bytes_remaining > 0) { 02028 mutex_exit(evicted_lock); 02029 mutex_exit(lock); 02030 idx = ((idx + 1) & (list_count - 1)); 02031 count++; 02032 goto evict_start; 02033 } 02034 } else { 02035 missed += 1; 02036 } 02037 } 02038 02039 mutex_exit(evicted_lock); 02040 mutex_exit(lock); 02041 02042 idx = 
((idx + 1) & (list_count - 1)); 02043 count++; 02044 02045 if (bytes_evicted < bytes) { 02046 if (count < list_count) 02047 goto evict_start; 02048 else 02049 dprintf("only evicted %lld bytes from %x", 02050 (longlong_t)bytes_evicted, state); 02051 } 02052 if (type == ARC_BUFC_METADATA) 02053 evict_metadata_offset = idx; 02054 else 02055 evict_data_offset = idx; 02056 02057 /* 02058 * Number of buffers skipped because they have I/O in progress or 02059 * are indirect prefetch buffers that have not lived long enough. 02060 */ 02061 if (skipped) 02062 ARCSTAT_INCR(arcstat_evict_skip, skipped); 02063 02064 /* 02065 * Number of buffers that could not be evicted because something 02066 * else is using them. 02067 */ 02068 if (missed) 02069 ARCSTAT_INCR(arcstat_mutex_miss, missed); 02070 02071 /* 02072 * We have just evicted some data into the ghost state, make 02073 * sure we also adjust the ghost state size if necessary. 02074 */ 02075 if (arc_no_grow && 02076 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 02077 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 02078 arc_mru_ghost->arcs_size - arc_c; 02079 02080 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 02081 int64_t todelete = 02082 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 02083 arc_evict_ghost(arc_mru_ghost, 0, todelete); 02084 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 02085 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 02086 arc_mru_ghost->arcs_size + 02087 arc_mfu_ghost->arcs_size - arc_c); 02088 arc_evict_ghost(arc_mfu_ghost, 0, todelete); 02089 } 02090 } 02091 if (stolen) 02092 ARCSTAT_BUMP(arcstat_stolen); 02093 02094 return (stolen); 02095 } 02096 02101 static void 02102 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 02103 { 02104 arc_buf_hdr_t *ab, *ab_prev; 02105 arc_buf_hdr_t marker = { 0 }; 02106 list_t *list, *list_start; 02107 kmutex_t *hash_lock, *lock; 02108 uint64_t bytes_deleted = 0; 02109 uint64_t bufs_skipped = 0; 02110 static int evict_offset; 02111 int list_count, idx = evict_offset; 02112 int offset, count = 0; 02113 02114 ASSERT(GHOST_STATE(state)); 02115 02116 /* 02117 * data lists come after metadata lists 02118 */ 02119 list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 02120 list_count = ARC_BUFC_NUMDATALISTS; 02121 offset = ARC_BUFC_NUMMETADATALISTS; 02122 02123 evict_start: 02124 list = &list_start[idx]; 02125 lock = ARCS_LOCK(state, idx + offset); 02126 02127 mutex_enter(lock); 02128 for (ab = list_tail(list); ab; ab = ab_prev) { 02129 ab_prev = list_prev(list, ab); 02130 if (spa && ab->b_spa != spa) 02131 continue; 02132 02133 /* ignore markers */ 02134 if (ab->b_spa == 0) 02135 continue; 02136 02137 hash_lock = HDR_LOCK(ab); 02138 /* caller may be trying to modify this buffer, skip it */ 02139 if (MUTEX_HELD(hash_lock)) 02140 continue; 02141 if (mutex_tryenter(hash_lock)) { 02142 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 02143 ASSERT(ab->b_buf == NULL); 02144 ARCSTAT_BUMP(arcstat_deleted); 02145 bytes_deleted += ab->b_size; 02146 02147 if (ab->b_l2hdr != NULL) { 02148 /* 02149 * This buffer is cached on the 2nd Level ARC; 02150 * don't destroy the header.
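 * The header (and its L2ARC address) is kept so that a later read can
 * still be satisfied from the cache device.  The branch below boils
 * down to the following sketch (illustrative only, locking omitted):
 *
 *      new_state = (ab->b_l2hdr != NULL) ? arc_l2c_only : arc_anon;
 *      arc_change_state(new_state, ab, hash_lock);
 *      if (new_state == arc_anon)
 *              arc_hdr_destroy(ab);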
02151 */ 02152 arc_change_state(arc_l2c_only, ab, hash_lock); 02153 mutex_exit(hash_lock); 02154 } else { 02155 arc_change_state(arc_anon, ab, hash_lock); 02156 mutex_exit(hash_lock); 02157 arc_hdr_destroy(ab); 02158 } 02159 02160 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 02161 if (bytes >= 0 && bytes_deleted >= bytes) 02162 break; 02163 } else if (bytes < 0) { 02164 /* 02165 * Insert a list marker and then wait for the 02166 * hash lock to become available. Once its 02167 * available, restart from where we left off. 02168 */ 02169 list_insert_after(list, ab, &marker); 02170 mutex_exit(lock); 02171 mutex_enter(hash_lock); 02172 mutex_exit(hash_lock); 02173 mutex_enter(lock); 02174 ab_prev = list_prev(list, &marker); 02175 list_remove(list, &marker); 02176 } else 02177 bufs_skipped += 1; 02178 } 02179 mutex_exit(lock); 02180 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 02181 count++; 02182 02183 if (count < list_count) 02184 goto evict_start; 02185 02186 evict_offset = idx; 02187 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 02188 (bytes < 0 || bytes_deleted < bytes)) { 02189 list_start = &state->arcs_lists[0]; 02190 list_count = ARC_BUFC_NUMMETADATALISTS; 02191 offset = count = 0; 02192 goto evict_start; 02193 } 02194 02195 /* Number of buffers we could not obtain the hash lock for */ 02196 if (bufs_skipped) { 02197 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 02198 ASSERT(bytes >= 0); 02199 } 02200 02201 if (bytes_deleted < bytes) 02202 dprintf("only deleted %lld bytes from %p", 02203 (longlong_t)bytes_deleted, state); 02204 } 02205 02206 static void 02207 arc_adjust(void) 02208 { 02209 int64_t adjustment, delta; 02210 02211 /* 02212 * Adjust MRU size 02213 */ 02214 02215 adjustment = MIN((int64_t)(arc_size - arc_c), 02216 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 02217 arc_p)); 02218 02219 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 02220 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 02221 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 02222 adjustment -= delta; 02223 } 02224 02225 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 02226 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 02227 (void) arc_evict(arc_mru, 0, delta, FALSE, 02228 ARC_BUFC_METADATA); 02229 } 02230 02231 /* 02232 * Adjust MFU size 02233 */ 02234 02235 adjustment = arc_size - arc_c; 02236 02237 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 02238 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 02239 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 02240 adjustment -= delta; 02241 } 02242 02243 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 02244 int64_t delta = MIN(adjustment, 02245 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 02246 (void) arc_evict(arc_mfu, 0, delta, FALSE, 02247 ARC_BUFC_METADATA); 02248 } 02249 02250 /* 02251 * Adjust ghost lists 02252 */ 02253 02254 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 02255 02256 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 02257 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 02258 arc_evict_ghost(arc_mru_ghost, 0, delta); 02259 } 02260 02261 adjustment = 02262 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 02263 02264 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 02265 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 02266 arc_evict_ghost(arc_mfu_ghost, 0, delta); 02267 } 02268 } 02269 02270 static 
void 02271 arc_do_user_evicts(void) 02272 { 02273 static arc_buf_t *tmp_arc_eviction_list; 02274 02275 /* 02276 * Move list over to avoid LOR 02277 */ 02278 restart: 02279 mutex_enter(&arc_eviction_mtx); 02280 tmp_arc_eviction_list = arc_eviction_list; 02281 arc_eviction_list = NULL; 02282 mutex_exit(&arc_eviction_mtx); 02283 02284 while (tmp_arc_eviction_list != NULL) { 02285 arc_buf_t *buf = tmp_arc_eviction_list; 02286 tmp_arc_eviction_list = buf->b_next; 02287 mutex_enter(&buf->b_evict_lock); 02288 buf->b_hdr = NULL; 02289 mutex_exit(&buf->b_evict_lock); 02290 02291 if (buf->b_efunc != NULL) 02292 VERIFY(buf->b_efunc(buf) == 0); 02293 02294 buf->b_efunc = NULL; 02295 buf->b_private = NULL; 02296 kmem_cache_free(buf_cache, buf); 02297 } 02298 02299 if (arc_eviction_list != NULL) 02300 goto restart; 02301 } 02302 02308 void 02309 arc_flush(spa_t *spa) 02310 { 02311 uint64_t guid = 0; 02312 02313 if (spa) 02314 guid = spa_load_guid(spa); 02315 02316 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 02317 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 02318 if (spa) 02319 break; 02320 } 02321 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 02322 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 02323 if (spa) 02324 break; 02325 } 02326 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 02327 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 02328 if (spa) 02329 break; 02330 } 02331 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 02332 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 02333 if (spa) 02334 break; 02335 } 02336 02337 arc_evict_ghost(arc_mru_ghost, guid, -1); 02338 arc_evict_ghost(arc_mfu_ghost, guid, -1); 02339 02340 mutex_enter(&arc_reclaim_thr_lock); 02341 arc_do_user_evicts(); 02342 mutex_exit(&arc_reclaim_thr_lock); 02343 ASSERT(spa || arc_eviction_list == NULL); 02344 } 02345 02346 void 02347 arc_shrink(void) 02348 { 02349 if (arc_c > arc_c_min) { 02350 uint64_t to_free; 02351 02352 #ifdef _KERNEL 02353 to_free = arc_c >> arc_shrink_shift; 02354 #else 02355 to_free = arc_c >> arc_shrink_shift; 02356 #endif 02357 if (arc_c > arc_c_min + to_free) 02358 atomic_add_64(&arc_c, -to_free); 02359 else 02360 arc_c = arc_c_min; 02361 02362 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 02363 if (arc_c > arc_size) 02364 arc_c = MAX(arc_size, arc_c_min); 02365 if (arc_p > arc_c) 02366 arc_p = (arc_c >> 1); 02367 ASSERT(arc_c >= arc_c_min); 02368 ASSERT((int64_t)arc_p >= 0); 02369 } 02370 02371 if (arc_size > arc_c) 02372 arc_adjust(); 02373 } 02374 02375 static int needfree = 0; 02376 02377 static int 02378 arc_reclaim_needed(void) 02379 { 02380 02381 #ifdef _KERNEL 02382 02383 if (needfree) 02384 return (1); 02385 02386 /* 02387 * Cooperate with pagedaemon when it's time for it to scan 02388 * and reclaim some pages. 02389 */ 02390 if (vm_paging_needed()) 02391 return (1); 02392 02393 #ifdef sun 02394 /* 02395 * take 'desfree' extra pages, so we reclaim sooner, rather than later 02396 */ 02397 extra = desfree; 02398 02399 /* 02400 * check that we're out of range of the pageout scanner. It starts to 02401 * schedule paging if freemem is less than lotsfree and needfree. 02402 * lotsfree is the high-water mark for pageout, and needfree is the 02403 * number of needed free pages. We add extra pages here to make sure 02404 * the scanner doesn't start up while we're freeing memory. 
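 * Note that this block is compiled only under #ifdef sun; the FreeBSD
 * build instead relies on the simpler heuristic in the !sun branch
 * below, which requests reclaim once more than three quarters of the
 * kmem arena is in use:
 *
 *      if (kmem_used() > (kmem_size() * 3) / 4)
 *              return (1);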
02405 */ 02406 if (freemem < lotsfree + needfree + extra) 02407 return (1); 02408 02409 /* 02410 * check to make sure that swapfs has enough space so that anon 02411 * reservations can still succeed. anon_resvmem() checks that the 02412 * availrmem is greater than swapfs_minfree, and the number of reserved 02413 * swap pages. We also add a bit of extra here just to prevent 02414 * circumstances from getting really dire. 02415 */ 02416 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 02417 return (1); 02418 02419 #if defined(__i386) 02420 /* 02421 * If we're on an i386 platform, it's possible that we'll exhaust the 02422 * kernel heap space before we ever run out of available physical 02423 * memory. Most checks of the size of the heap_area compare against 02424 * tune.t_minarmem, which is the minimum available real memory that we 02425 * can have in the system. However, this is generally fixed at 25 pages 02426 * which is so low that it's useless. In this comparison, we seek to 02427 * calculate the total heap-size, and reclaim if more than 3/4ths of the 02428 * heap is allocated. (Or, in the calculation, if less than 1/4th is 02429 * free) 02430 */ 02431 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 02432 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 02433 return (1); 02434 #endif 02435 #else /* !sun */ 02436 if (kmem_used() > (kmem_size() * 3) / 4) 02437 return (1); 02438 #endif /* sun */ 02439 02440 #else /* !_KERNEL */ 02441 if (spa_get_random(100) == 0) 02442 return (1); 02443 #endif 02444 return (0); 02445 } 02446 02447 extern kmem_cache_t *zio_buf_cache[]; 02448 extern kmem_cache_t *zio_data_buf_cache[]; 02449 02450 static void 02451 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 02452 { 02453 size_t i; 02454 kmem_cache_t *prev_cache = NULL; 02455 kmem_cache_t *prev_data_cache = NULL; 02456 02457 #ifdef _KERNEL 02458 if (arc_meta_used >= arc_meta_limit) { 02459 /* 02460 * We are exceeding our meta-data cache limit. 02461 * Purge some DNLC entries to release holds on meta-data. 02462 */ 02463 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 02464 } 02465 #if defined(__i386) 02466 /* 02467 * Reclaim unused memory from all kmem caches. 02468 */ 02469 kmem_reap(); 02470 #endif 02471 #endif 02472 02473 /* 02474 * An aggressive reclamation will shrink the cache size as well as 02475 * reap free buffers from the arc kmem caches. 
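 * Only an aggressive pass (ARC_RECLAIM_AGGR) calls arc_shrink() below;
 * a conservative pass (ARC_RECLAIM_CONS) reaps the kmem caches without
 * lowering the target size.  arc_shrink() trims arc_c by
 * arc_c >> arc_shrink_shift, i.e. 1/32 (roughly 3%) per pass with the
 * default shift of 5; as an illustration, an arc_c of 4 GB gives a
 * to_free of 128 MB.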
02476 */ 02477 if (strat == ARC_RECLAIM_AGGR) 02478 arc_shrink(); 02479 02480 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 02481 if (zio_buf_cache[i] != prev_cache) { 02482 prev_cache = zio_buf_cache[i]; 02483 kmem_cache_reap_now(zio_buf_cache[i]); 02484 } 02485 if (zio_data_buf_cache[i] != prev_data_cache) { 02486 prev_data_cache = zio_data_buf_cache[i]; 02487 kmem_cache_reap_now(zio_data_buf_cache[i]); 02488 } 02489 } 02490 kmem_cache_reap_now(buf_cache); 02491 kmem_cache_reap_now(hdr_cache); 02492 } 02493 02494 static void 02495 arc_reclaim_thread(void *dummy __unused) 02496 { 02497 clock_t growtime = 0; 02498 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 02499 callb_cpr_t cpr; 02500 02501 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 02502 02503 mutex_enter(&arc_reclaim_thr_lock); 02504 while (arc_thread_exit == 0) { 02505 if (arc_reclaim_needed()) { 02506 02507 if (arc_no_grow) { 02508 if (last_reclaim == ARC_RECLAIM_CONS) { 02509 last_reclaim = ARC_RECLAIM_AGGR; 02510 } else { 02511 last_reclaim = ARC_RECLAIM_CONS; 02512 } 02513 } else { 02514 arc_no_grow = TRUE; 02515 last_reclaim = ARC_RECLAIM_AGGR; 02516 membar_producer(); 02517 } 02518 02519 /* reset the growth delay for every reclaim */ 02520 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 02521 02522 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 02523 /* 02524 * If needfree is TRUE our vm_lowmem hook 02525 * was called and in that case we must free some 02526 * memory, so switch to aggressive mode. 02527 */ 02528 arc_no_grow = TRUE; 02529 last_reclaim = ARC_RECLAIM_AGGR; 02530 } 02531 arc_kmem_reap_now(last_reclaim); 02532 arc_warm = B_TRUE; 02533 02534 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 02535 arc_no_grow = FALSE; 02536 } 02537 02538 arc_adjust(); 02539 02540 if (arc_eviction_list != NULL) 02541 arc_do_user_evicts(); 02542 02543 #ifdef _KERNEL 02544 if (needfree) { 02545 needfree = 0; 02546 wakeup(&needfree); 02547 } 02548 #endif 02549 02550 /* block until needed, or one second, whichever is shorter */ 02551 CALLB_CPR_SAFE_BEGIN(&cpr); 02552 (void) cv_timedwait(&arc_reclaim_thr_cv, 02553 &arc_reclaim_thr_lock, hz); 02554 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 02555 } 02556 02557 arc_thread_exit = 0; 02558 cv_broadcast(&arc_reclaim_thr_cv); 02559 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 02560 thread_exit(); 02561 } 02562 02568 static void 02569 arc_adapt(int bytes, arc_state_t *state) 02570 { 02571 int mult; 02572 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 02573 02574 if (state == arc_l2c_only) 02575 return; 02576 02577 ASSERT(bytes > 0); 02578 /* 02579 * Adapt the target size of the MRU list: 02580 * - if we just hit in the MRU ghost list, then increase 02581 * the target size of the MRU list. 02582 * - if we just hit in the MFU ghost list, then increase 02583 * the target size of the MFU list by decreasing the 02584 * target size of the MRU list. 02585 */ 02586 if (state == arc_mru_ghost) { 02587 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 02588 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 02589 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 02590 02591 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 02592 } else if (state == arc_mfu_ghost) { 02593 uint64_t delta; 02594 02595 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 
02596 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 02597 mult = MIN(mult, 10); 02598 02599 delta = MIN(bytes * mult, arc_p); 02600 arc_p = MAX(arc_p_min, arc_p - delta); 02601 } 02602 ASSERT((int64_t)arc_p >= 0); 02603 02604 if (arc_reclaim_needed()) { 02605 cv_signal(&arc_reclaim_thr_cv); 02606 return; 02607 } 02608 02609 if (arc_no_grow) 02610 return; 02611 02612 if (arc_c >= arc_c_max) 02613 return; 02614 02615 /* 02616 * If we're within (2 * maxblocksize) bytes of the target 02617 * cache size, increment the target cache size 02618 */ 02619 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 02620 atomic_add_64(&arc_c, (int64_t)bytes); 02621 if (arc_c > arc_c_max) 02622 arc_c = arc_c_max; 02623 else if (state == arc_anon) 02624 atomic_add_64(&arc_p, (int64_t)bytes); 02625 if (arc_p > arc_c) 02626 arc_p = arc_c; 02627 } 02628 ASSERT((int64_t)arc_p >= 0); 02629 } 02630 02635 static int 02636 arc_evict_needed(arc_buf_contents_t type) 02637 { 02638 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 02639 return (1); 02640 02641 #ifdef sun 02642 #ifdef _KERNEL 02643 /* 02644 * If zio data pages are being allocated out of a separate heap segment, 02645 * then enforce that the size of available vmem for this area remains 02646 * above about 1/32nd free. 02647 */ 02648 if (type == ARC_BUFC_DATA && zio_arena != NULL && 02649 vmem_size(zio_arena, VMEM_FREE) < 02650 (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 02651 return (1); 02652 #endif 02653 #endif /* sun */ 02654 02655 if (arc_reclaim_needed()) 02656 return (1); 02657 02658 return (arc_size > arc_c); 02659 } 02660 02679 static void 02680 arc_get_data_buf(arc_buf_t *buf) 02681 { 02682 arc_state_t *state = buf->b_hdr->b_state; 02683 uint64_t size = buf->b_hdr->b_size; 02684 arc_buf_contents_t type = buf->b_hdr->b_type; 02685 02686 arc_adapt(size, state); 02687 02688 /* 02689 * We have not yet reached cache maximum size, 02690 * just allocate a new buffer. 02691 */ 02692 if (!arc_evict_needed(type)) { 02693 if (type == ARC_BUFC_METADATA) { 02694 buf->b_data = zio_buf_alloc(size); 02695 arc_space_consume(size, ARC_SPACE_DATA); 02696 } else { 02697 ASSERT(type == ARC_BUFC_DATA); 02698 buf->b_data = zio_data_buf_alloc(size); 02699 ARCSTAT_INCR(arcstat_data_size, size); 02700 atomic_add_64(&arc_size, size); 02701 } 02702 goto out; 02703 } 02704 02705 /* 02706 * If we are prefetching from the mfu ghost list, this buffer 02707 * will end up on the mru list; so steal space from there. 02708 */ 02709 if (state == arc_mfu_ghost) 02710 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 02711 else if (state == arc_mru_ghost) 02712 state = arc_mru; 02713 02714 if (state == arc_mru || state == arc_anon) { 02715 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 02716 state = (arc_mfu->arcs_lsize[type] >= size && 02717 arc_p > mru_used) ? arc_mfu : arc_mru; 02718 } else { 02719 /* MFU cases */ 02720 uint64_t mfu_space = arc_c - arc_p; 02721 state = (arc_mru->arcs_lsize[type] >= size && 02722 mfu_space > arc_mfu->arcs_size) ? 
arc_mru : arc_mfu; 02723 } 02724 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 02725 if (type == ARC_BUFC_METADATA) { 02726 buf->b_data = zio_buf_alloc(size); 02727 arc_space_consume(size, ARC_SPACE_DATA); 02728 } else { 02729 ASSERT(type == ARC_BUFC_DATA); 02730 buf->b_data = zio_data_buf_alloc(size); 02731 ARCSTAT_INCR(arcstat_data_size, size); 02732 atomic_add_64(&arc_size, size); 02733 } 02734 ARCSTAT_BUMP(arcstat_recycle_miss); 02735 } 02736 ASSERT(buf->b_data != NULL); 02737 out: 02738 /* 02739 * Update the state size. Note that ghost states have a 02740 * "ghost size" and so don't need to be updated. 02741 */ 02742 if (!GHOST_STATE(buf->b_hdr->b_state)) { 02743 arc_buf_hdr_t *hdr = buf->b_hdr; 02744 02745 atomic_add_64(&hdr->b_state->arcs_size, size); 02746 if (list_link_active(&hdr->b_arc_node)) { 02747 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 02748 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 02749 } 02750 /* 02751 * If we are growing the cache, and we are adding anonymous 02752 * data, and we have outgrown arc_p, update arc_p 02753 */ 02754 if (arc_size < arc_c && hdr->b_state == arc_anon && 02755 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 02756 arc_p = MIN(arc_c, arc_p + size); 02757 } 02758 ARCSTAT_BUMP(arcstat_allocated); 02759 } 02760 02766 static void 02767 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 02768 { 02769 clock_t now; 02770 02771 ASSERT(MUTEX_HELD(hash_lock)); 02772 02773 if (buf->b_state == arc_anon) { 02774 /* 02775 * This buffer is not in the cache, and does not 02776 * appear in our "ghost" list. Add the new buffer 02777 * to the MRU state. 02778 */ 02779 02780 ASSERT(buf->b_arc_access == 0); 02781 buf->b_arc_access = ddi_get_lbolt(); 02782 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 02783 arc_change_state(arc_mru, buf, hash_lock); 02784 02785 } else if (buf->b_state == arc_mru) { 02786 now = ddi_get_lbolt(); 02787 02788 /* 02789 * If this buffer is here because of a prefetch, then either: 02790 * - clear the flag if this is a "referencing" read 02791 * (any subsequent access will bump this into the MFU state). 02792 * or 02793 * - move the buffer to the head of the list if this is 02794 * another prefetch (to make it less likely to be evicted). 02795 */ 02796 if ((buf->b_flags & ARC_PREFETCH) != 0) { 02797 if (refcount_count(&buf->b_refcnt) == 0) { 02798 ASSERT(list_link_active(&buf->b_arc_node)); 02799 } else { 02800 buf->b_flags &= ~ARC_PREFETCH; 02801 ARCSTAT_BUMP(arcstat_mru_hits); 02802 } 02803 buf->b_arc_access = now; 02804 return; 02805 } 02806 02807 /* 02808 * This buffer has been "accessed" only once so far, 02809 * but it is still in the cache. Move it to the MFU 02810 * state. 02811 */ 02812 if (now > buf->b_arc_access + ARC_MINTIME) { 02813 /* 02814 * More than 125ms have passed since we 02815 * instantiated this buffer. Move it to the 02816 * most frequently used state. 02817 */ 02818 buf->b_arc_access = now; 02819 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 02820 arc_change_state(arc_mfu, buf, hash_lock); 02821 } 02822 ARCSTAT_BUMP(arcstat_mru_hits); 02823 } else if (buf->b_state == arc_mru_ghost) { 02824 arc_state_t *new_state; 02825 /* 02826 * This buffer has been "accessed" recently, but 02827 * was evicted from the cache. Move it to the 02828 * MFU state. 
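 * (unless this access is a prefetch, in which case the buffer goes
 * back to the MRU list instead).  Taken together, arc_access()
 * implements the following transitions:
 *
 *      anon      -> mru        first insertion into the cache
 *      mru       -> mfu        second access, more than ARC_MINTIME later
 *      mru_ghost -> mfu        (or mru if the access is a prefetch)
 *      mfu_ghost -> mfu        (or mru if the access is a prefetch)
 *      l2c_only  -> mfu        hit on a header kept only for the L2ARC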
02829 */ 02830 02831 if (buf->b_flags & ARC_PREFETCH) { 02832 new_state = arc_mru; 02833 if (refcount_count(&buf->b_refcnt) > 0) 02834 buf->b_flags &= ~ARC_PREFETCH; 02835 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 02836 } else { 02837 new_state = arc_mfu; 02838 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 02839 } 02840 02841 buf->b_arc_access = ddi_get_lbolt(); 02842 arc_change_state(new_state, buf, hash_lock); 02843 02844 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 02845 } else if (buf->b_state == arc_mfu) { 02846 /* 02847 * This buffer has been accessed more than once and is 02848 * still in the cache. Keep it in the MFU state. 02849 * 02850 * NOTE: an add_reference() that occurred when we did 02851 * the arc_read() will have kicked this off the list. 02852 * If it was a prefetch, we will explicitly move it to 02853 * the head of the list now. 02854 */ 02855 if ((buf->b_flags & ARC_PREFETCH) != 0) { 02856 ASSERT(refcount_count(&buf->b_refcnt) == 0); 02857 ASSERT(list_link_active(&buf->b_arc_node)); 02858 } 02859 ARCSTAT_BUMP(arcstat_mfu_hits); 02860 buf->b_arc_access = ddi_get_lbolt(); 02861 } else if (buf->b_state == arc_mfu_ghost) { 02862 arc_state_t *new_state = arc_mfu; 02863 /* 02864 * This buffer has been accessed more than once but has 02865 * been evicted from the cache. Move it back to the 02866 * MFU state. 02867 */ 02868 02869 if (buf->b_flags & ARC_PREFETCH) { 02870 /* 02871 * This is a prefetch access... 02872 * move this block back to the MRU state. 02873 */ 02874 ASSERT0(refcount_count(&buf->b_refcnt)); 02875 new_state = arc_mru; 02876 } 02877 02878 buf->b_arc_access = ddi_get_lbolt(); 02879 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 02880 arc_change_state(new_state, buf, hash_lock); 02881 02882 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 02883 } else if (buf->b_state == arc_l2c_only) { 02884 /* 02885 * This buffer is on the 2nd Level ARC. 02886 */ 02887 02888 buf->b_arc_access = ddi_get_lbolt(); 02889 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 02890 arc_change_state(arc_mfu, buf, hash_lock); 02891 } else { 02892 ASSERT(!"invalid arc state"); 02893 } 02894 } 02895 02899 /* ARGSUSED */ 02900 void 02901 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 02902 { 02903 if (zio == NULL || zio->io_error == 0) 02904 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 02905 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 02906 } 02907 02911 void 02912 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 02913 { 02914 arc_buf_t **bufp = arg; 02915 if (zio && zio->io_error) { 02916 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 02917 *bufp = NULL; 02918 } else { 02919 *bufp = buf; 02920 ASSERT(buf->b_data); 02921 } 02922 } 02923 02924 static void 02925 arc_read_done(zio_t *zio) 02926 { 02927 arc_buf_hdr_t *hdr, *found; 02928 arc_buf_t *buf; 02929 arc_buf_t *abuf; /* buffer we're assigning to callback */ 02930 kmutex_t *hash_lock; 02931 arc_callback_t *callback_list, *acb; 02932 int freeable = FALSE; 02933 02934 buf = zio->io_private; 02935 hdr = buf->b_hdr; 02936 02937 /* 02938 * The hdr was inserted into hash-table and removed from lists 02939 * prior to starting I/O. We should find this header, since 02940 * it's in the hash table, and it should be legit since it's 02941 * not possible to evict it during the I/O. The only possible 02942 * reason for it not to be found is if we were freed during the 02943 * read. 
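 * The ASSERT below spells out the three acceptable outcomes:
 *
 *      found == NULL                   freed in flight (HDR_FREED_IN_READ,
 *                                      no hash lock is returned)
 *      found == hdr, DVA matches bp    the normal case
 *      found == hdr, HDR_L2_READING    the read was serviced by the L2ARC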
02944 */ 02945 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 02946 &hash_lock); 02947 02948 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 02949 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 02950 (found == hdr && HDR_L2_READING(hdr))); 02951 02952 hdr->b_flags &= ~ARC_L2_EVICTED; 02953 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 02954 hdr->b_flags &= ~ARC_L2CACHE; 02955 02956 /* byteswap if necessary */ 02957 callback_list = hdr->b_acb; 02958 ASSERT(callback_list != NULL); 02959 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 02960 dmu_object_byteswap_t bswap = 02961 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 02962 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 02963 byteswap_uint64_array : 02964 dmu_ot_byteswap[bswap].ob_func; 02965 func(buf->b_data, hdr->b_size); 02966 } 02967 02968 arc_cksum_compute(buf, B_FALSE); 02969 #ifdef illumos 02970 arc_buf_watch(buf); 02971 #endif /* illumos */ 02972 02973 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 02974 /* 02975 * Only call arc_access on anonymous buffers. This is because 02976 * if we've issued an I/O for an evicted buffer, we've already 02977 * called arc_access (to prevent any simultaneous readers from 02978 * getting confused). 02979 */ 02980 arc_access(hdr, hash_lock); 02981 } 02982 02983 /* create copies of the data buffer for the callers */ 02984 abuf = buf; 02985 for (acb = callback_list; acb; acb = acb->acb_next) { 02986 if (acb->acb_done) { 02987 if (abuf == NULL) { 02988 ARCSTAT_BUMP(arcstat_duplicate_reads); 02989 abuf = arc_buf_clone(buf); 02990 } 02991 acb->acb_buf = abuf; 02992 abuf = NULL; 02993 } 02994 } 02995 hdr->b_acb = NULL; 02996 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 02997 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 02998 if (abuf == buf) { 02999 ASSERT(buf->b_efunc == NULL); 03000 ASSERT(hdr->b_datacnt == 1); 03001 hdr->b_flags |= ARC_BUF_AVAILABLE; 03002 } 03003 03004 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 03005 03006 if (zio->io_error != 0) { 03007 hdr->b_flags |= ARC_IO_ERROR; 03008 if (hdr->b_state != arc_anon) 03009 arc_change_state(arc_anon, hdr, hash_lock); 03010 if (HDR_IN_HASH_TABLE(hdr)) 03011 buf_hash_remove(hdr); 03012 freeable = refcount_is_zero(&hdr->b_refcnt); 03013 } 03014 03015 /* 03016 * Broadcast before we drop the hash_lock to avoid the possibility 03017 * that the hdr (and hence the cv) might be freed before we get to 03018 * the cv_broadcast(). 03019 */ 03020 cv_broadcast(&hdr->b_cv); 03021 03022 if (hash_lock) { 03023 mutex_exit(hash_lock); 03024 } else { 03025 /* 03026 * This block was freed while we waited for the read to 03027 * complete. It has been removed from the hash table and 03028 * moved to the anonymous state (so that it won't show up 03029 * in the cache). 
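 * The callbacks below still run, so callers get the data they asked
 * for, but the header is destroyed at the end of this function if its
 * refcount is already zero (the "freeable" case).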
03030 */ 03031 ASSERT3P(hdr->b_state, ==, arc_anon); 03032 freeable = refcount_is_zero(&hdr->b_refcnt); 03033 } 03034 03035 /* execute each callback and free its structure */ 03036 while ((acb = callback_list) != NULL) { 03037 if (acb->acb_done) 03038 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 03039 03040 if (acb->acb_zio_dummy != NULL) { 03041 acb->acb_zio_dummy->io_error = zio->io_error; 03042 zio_nowait(acb->acb_zio_dummy); 03043 } 03044 03045 callback_list = acb->acb_next; 03046 kmem_free(acb, sizeof (arc_callback_t)); 03047 } 03048 03049 if (freeable) 03050 arc_hdr_destroy(hdr); 03051 } 03052 03075 int 03076 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, 03077 arc_done_func_t *done, void *private, int priority, int zio_flags, 03078 uint32_t *arc_flags, const zbookmark_t *zb) 03079 { 03080 int err; 03081 03082 if (pbuf == NULL) { 03083 /* 03084 * XXX This happens from traverse callback funcs, for 03085 * the objset_phys_t block. 03086 */ 03087 return (arc_read_nolock(pio, spa, bp, done, private, priority, 03088 zio_flags, arc_flags, zb)); 03089 } 03090 03091 ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); 03092 ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); 03093 rw_enter(&pbuf->b_data_lock, RW_READER); 03094 03095 err = arc_read_nolock(pio, spa, bp, done, private, priority, 03096 zio_flags, arc_flags, zb); 03097 rw_exit(&pbuf->b_data_lock); 03098 03099 return (err); 03100 } 03101 03102 int 03103 arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, 03104 arc_done_func_t *done, void *private, int priority, int zio_flags, 03105 uint32_t *arc_flags, const zbookmark_t *zb) 03106 { 03107 arc_buf_hdr_t *hdr; 03108 arc_buf_t *buf; 03109 kmutex_t *hash_lock; 03110 zio_t *rzio; 03111 uint64_t guid = spa_load_guid(spa); 03112 03113 top: 03114 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 03115 &hash_lock); 03116 if (hdr && hdr->b_datacnt > 0) { 03117 03118 *arc_flags |= ARC_CACHED; 03119 03120 if (HDR_IO_IN_PROGRESS(hdr)) { 03121 03122 if (*arc_flags & ARC_WAIT) { 03123 cv_wait(&hdr->b_cv, hash_lock); 03124 mutex_exit(hash_lock); 03125 goto top; 03126 } 03127 ASSERT(*arc_flags & ARC_NOWAIT); 03128 03129 if (done) { 03130 arc_callback_t *acb = NULL; 03131 03132 acb = kmem_zalloc(sizeof (arc_callback_t), 03133 KM_SLEEP); 03134 acb->acb_done = done; 03135 acb->acb_private = private; 03136 if (pio != NULL) 03137 acb->acb_zio_dummy = zio_null(pio, 03138 spa, NULL, NULL, NULL, zio_flags); 03139 03140 ASSERT(acb->acb_done != NULL); 03141 acb->acb_next = hdr->b_acb; 03142 hdr->b_acb = acb; 03143 add_reference(hdr, hash_lock, private); 03144 mutex_exit(hash_lock); 03145 return (0); 03146 } 03147 mutex_exit(hash_lock); 03148 return (0); 03149 } 03150 03151 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 03152 03153 if (done) { 03154 add_reference(hdr, hash_lock, private); 03155 /* 03156 * If this block is already in use, create a new 03157 * copy of the data so that we will be guaranteed 03158 * that arc_release() will always succeed. 
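 * In other words, the first caller may take over the header's own
 * buffer (clearing ARC_BUF_AVAILABLE below), while each additional
 * caller receives a private copy from arc_buf_clone(); that private
 * copy is what lets a later arc_release() detach the buffer without
 * disturbing other readers of the same block.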
03159 */ 03160 buf = hdr->b_buf; 03161 ASSERT(buf); 03162 ASSERT(buf->b_data); 03163 if (HDR_BUF_AVAILABLE(hdr)) { 03164 ASSERT(buf->b_efunc == NULL); 03165 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 03166 } else { 03167 buf = arc_buf_clone(buf); 03168 } 03169 03170 } else if (*arc_flags & ARC_PREFETCH && 03171 refcount_count(&hdr->b_refcnt) == 0) { 03172 hdr->b_flags |= ARC_PREFETCH; 03173 } 03174 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 03175 arc_access(hdr, hash_lock); 03176 if (*arc_flags & ARC_L2CACHE) 03177 hdr->b_flags |= ARC_L2CACHE; 03178 mutex_exit(hash_lock); 03179 ARCSTAT_BUMP(arcstat_hits); 03180 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 03181 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 03182 data, metadata, hits); 03183 03184 if (done) 03185 done(NULL, buf, private); 03186 } else { 03187 uint64_t size = BP_GET_LSIZE(bp); 03188 arc_callback_t *acb; 03189 vdev_t *vd = NULL; 03190 uint64_t addr; 03191 boolean_t devw = B_FALSE; 03192 03193 if (hdr == NULL) { 03194 /* this block is not in the cache */ 03195 arc_buf_hdr_t *exists; 03196 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 03197 buf = arc_buf_alloc(spa, size, private, type); 03198 hdr = buf->b_hdr; 03199 hdr->b_dva = *BP_IDENTITY(bp); 03200 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 03201 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 03202 exists = buf_hash_insert(hdr, &hash_lock); 03203 if (exists) { 03204 /* somebody beat us to the hash insert */ 03205 mutex_exit(hash_lock); 03206 buf_discard_identity(hdr); 03207 (void) arc_buf_remove_ref(buf, private); 03208 goto top; /* restart the IO request */ 03209 } 03210 /* if this is a prefetch, we don't have a reference */ 03211 if (*arc_flags & ARC_PREFETCH) { 03212 (void) remove_reference(hdr, hash_lock, 03213 private); 03214 hdr->b_flags |= ARC_PREFETCH; 03215 } 03216 if (*arc_flags & ARC_L2CACHE) 03217 hdr->b_flags |= ARC_L2CACHE; 03218 if (BP_GET_LEVEL(bp) > 0) 03219 hdr->b_flags |= ARC_INDIRECT; 03220 } else { 03221 /* this block is in the ghost cache */ 03222 ASSERT(GHOST_STATE(hdr->b_state)); 03223 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 03224 ASSERT0(refcount_count(&hdr->b_refcnt)); 03225 ASSERT(hdr->b_buf == NULL); 03226 03227 /* if this is a prefetch, we don't have a reference */ 03228 if (*arc_flags & ARC_PREFETCH) 03229 hdr->b_flags |= ARC_PREFETCH; 03230 else 03231 add_reference(hdr, hash_lock, private); 03232 if (*arc_flags & ARC_L2CACHE) 03233 hdr->b_flags |= ARC_L2CACHE; 03234 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 03235 buf->b_hdr = hdr; 03236 buf->b_data = NULL; 03237 buf->b_efunc = NULL; 03238 buf->b_private = NULL; 03239 buf->b_next = NULL; 03240 hdr->b_buf = buf; 03241 ASSERT(hdr->b_datacnt == 0); 03242 hdr->b_datacnt = 1; 03243 arc_get_data_buf(buf); 03244 arc_access(hdr, hash_lock); 03245 } 03246 03247 ASSERT(!GHOST_STATE(hdr->b_state)); 03248 03249 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 03250 acb->acb_done = done; 03251 acb->acb_private = private; 03252 03253 ASSERT(hdr->b_acb == NULL); 03254 hdr->b_acb = acb; 03255 hdr->b_flags |= ARC_IO_IN_PROGRESS; 03256 03257 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && 03258 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 03259 devw = hdr->b_l2hdr->b_dev->l2ad_writing; 03260 addr = hdr->b_l2hdr->b_daddr; 03261 /* 03262 * Lock out device removal. 03263 */ 03264 if (vdev_is_dead(vd) || 03265 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 03266 vd = NULL; 03267 } 03268 03269 mutex_exit(hash_lock); 03270 03271 /* 03272 * At this point, we have a level 1 cache miss. 
Try again in 03273 * L2ARC if possible. 03274 */ 03275 ASSERT3U(hdr->b_size, ==, size); 03276 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 03277 uint64_t, size, zbookmark_t *, zb); 03278 ARCSTAT_BUMP(arcstat_misses); 03279 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 03280 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 03281 data, metadata, misses); 03282 #ifdef _KERNEL 03283 curthread->td_ru.ru_inblock++; 03284 #endif 03285 03286 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 03287 /* 03288 * Read from the L2ARC if the following are true: 03289 * 1. The L2ARC vdev was previously cached. 03290 * 2. This buffer still has L2ARC metadata. 03291 * 3. This buffer isn't currently writing to the L2ARC. 03292 * 4. The L2ARC entry wasn't evicted, which may 03293 * also have invalidated the vdev. 03294 * 5. This isn't prefetch and l2arc_noprefetch is set. 03295 */ 03296 if (hdr->b_l2hdr != NULL && 03297 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 03298 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 03299 l2arc_read_callback_t *cb; 03300 03301 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 03302 ARCSTAT_BUMP(arcstat_l2_hits); 03303 03304 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 03305 KM_SLEEP); 03306 cb->l2rcb_buf = buf; 03307 cb->l2rcb_spa = spa; 03308 cb->l2rcb_bp = *bp; 03309 cb->l2rcb_zb = *zb; 03310 cb->l2rcb_flags = zio_flags; 03311 03312 /* 03313 * l2arc read. The SCL_L2ARC lock will be 03314 * released by l2arc_read_done(). 03315 */ 03316 rzio = zio_read_phys(pio, vd, addr, size, 03317 buf->b_data, ZIO_CHECKSUM_OFF, 03318 l2arc_read_done, cb, priority, zio_flags | 03319 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | 03320 ZIO_FLAG_DONT_PROPAGATE | 03321 ZIO_FLAG_DONT_RETRY, B_FALSE); 03322 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 03323 zio_t *, rzio); 03324 ARCSTAT_INCR(arcstat_l2_read_bytes, size); 03325 03326 if (*arc_flags & ARC_NOWAIT) { 03327 zio_nowait(rzio); 03328 return (0); 03329 } 03330 03331 ASSERT(*arc_flags & ARC_WAIT); 03332 if (zio_wait(rzio) == 0) 03333 return (0); 03334 03335 /* l2arc read error; goto zio_read() */ 03336 } else { 03337 DTRACE_PROBE1(l2arc__miss, 03338 arc_buf_hdr_t *, hdr); 03339 ARCSTAT_BUMP(arcstat_l2_misses); 03340 if (HDR_L2_WRITING(hdr)) 03341 ARCSTAT_BUMP(arcstat_l2_rw_clash); 03342 spa_config_exit(spa, SCL_L2ARC, vd); 03343 } 03344 } else { 03345 if (vd != NULL) 03346 spa_config_exit(spa, SCL_L2ARC, vd); 03347 if (l2arc_ndev != 0) { 03348 DTRACE_PROBE1(l2arc__miss, 03349 arc_buf_hdr_t *, hdr); 03350 ARCSTAT_BUMP(arcstat_l2_misses); 03351 } 03352 } 03353 03354 rzio = zio_read(pio, spa, bp, buf->b_data, size, 03355 arc_read_done, buf, priority, zio_flags, zb); 03356 03357 if (*arc_flags & ARC_WAIT) 03358 return (zio_wait(rzio)); 03359 03360 ASSERT(*arc_flags & ARC_NOWAIT); 03361 zio_nowait(rzio); 03362 } 03363 return (0); 03364 } 03365 03366 void 03367 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 03368 { 03369 ASSERT(buf->b_hdr != NULL); 03370 ASSERT(buf->b_hdr->b_state != arc_anon); 03371 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 03372 ASSERT(buf->b_efunc == NULL); 03373 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 03374 03375 buf->b_efunc = func; 03376 buf->b_private = private; 03377 } 03378 03384 int 03385 arc_buf_evict(arc_buf_t *buf) 03386 { 03387 arc_buf_hdr_t *hdr; 03388 kmutex_t *hash_lock; 03389 arc_buf_t **bufp; 03390 list_t *list, *evicted_list; 03391 kmutex_t *lock, *evicted_lock; 03392 03393 mutex_enter(&buf->b_evict_lock); 03394 hdr = buf->b_hdr; 
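	/*
	 * Three outcomes are possible from here:
	 *  1. hdr == NULL: arc_do_user_evicts() has already claimed this
	 *     buffer, so there is nothing left to do.
	 *  2. buf->b_data == NULL: the buffer sits on the eviction list;
	 *     run its eviction callback now and let arc_do_user_evicts()
	 *     free the structure.
	 *  3. otherwise: detach the buffer from its header and, if it was
	 *     the last one, move the header to the matching ghost state.
	 */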
03395 if (hdr == NULL) { 03396 /* 03397 * We are in arc_do_user_evicts(). 03398 */ 03399 ASSERT(buf->b_data == NULL); 03400 mutex_exit(&buf->b_evict_lock); 03401 return (0); 03402 } else if (buf->b_data == NULL) { 03403 arc_buf_t copy = *buf; /* structure assignment */ 03404 /* 03405 * We are on the eviction list; process this buffer now 03406 * but let arc_do_user_evicts() do the reaping. 03407 */ 03408 buf->b_efunc = NULL; 03409 mutex_exit(&buf->b_evict_lock); 03410 VERIFY(copy.b_efunc(&copy) == 0); 03411 return (1); 03412 } 03413 hash_lock = HDR_LOCK(hdr); 03414 mutex_enter(hash_lock); 03415 hdr = buf->b_hdr; 03416 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 03417 03418 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 03419 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 03420 03421 /* 03422 * Pull this buffer off of the hdr 03423 */ 03424 bufp = &hdr->b_buf; 03425 while (*bufp != buf) 03426 bufp = &(*bufp)->b_next; 03427 *bufp = buf->b_next; 03428 03429 ASSERT(buf->b_data != NULL); 03430 arc_buf_destroy(buf, FALSE, FALSE); 03431 03432 if (hdr->b_datacnt == 0) { 03433 arc_state_t *old_state = hdr->b_state; 03434 arc_state_t *evicted_state; 03435 03436 ASSERT(hdr->b_buf == NULL); 03437 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 03438 03439 evicted_state = 03440 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 03441 03442 get_buf_info(hdr, old_state, &list, &lock); 03443 get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock); 03444 mutex_enter(lock); 03445 mutex_enter(evicted_lock); 03446 03447 arc_change_state(evicted_state, hdr, hash_lock); 03448 ASSERT(HDR_IN_HASH_TABLE(hdr)); 03449 hdr->b_flags |= ARC_IN_HASH_TABLE; 03450 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 03451 03452 mutex_exit(evicted_lock); 03453 mutex_exit(lock); 03454 } 03455 mutex_exit(hash_lock); 03456 mutex_exit(&buf->b_evict_lock); 03457 03458 VERIFY(buf->b_efunc(buf) == 0); 03459 buf->b_efunc = NULL; 03460 buf->b_private = NULL; 03461 buf->b_hdr = NULL; 03462 buf->b_next = NULL; 03463 kmem_cache_free(buf_cache, buf); 03464 return (1); 03465 } 03466 03473 void 03474 arc_release(arc_buf_t *buf, void *tag) 03475 { 03476 arc_buf_hdr_t *hdr; 03477 kmutex_t *hash_lock = NULL; 03478 l2arc_buf_hdr_t *l2hdr; 03479 uint64_t buf_size; 03480 03481 /* 03482 * It would be nice to assert that if it's DMU metadata (level > 03483 * 0 || it's the dnode file), then it must be syncing context. 03484 * But we don't know that information at this level. 03485 */ 03486 03487 mutex_enter(&buf->b_evict_lock); 03488 hdr = buf->b_hdr; 03489 03490 /* this buffer is not on any list */ 03491 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 03492 03493 if (hdr->b_state == arc_anon) { 03494 /* this buffer is already released */ 03495 ASSERT(buf->b_efunc == NULL); 03496 } else { 03497 hash_lock = HDR_LOCK(hdr); 03498 mutex_enter(hash_lock); 03499 hdr = buf->b_hdr; 03500 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 03501 } 03502 03503 l2hdr = hdr->b_l2hdr; 03504 if (l2hdr) { 03505 mutex_enter(&l2arc_buflist_mtx); 03506 hdr->b_l2hdr = NULL; 03507 buf_size = hdr->b_size; 03508 } 03509 03510 /* 03511 * Do we have more than one buf? 03512 */ 03513 if (hdr->b_datacnt > 1) { 03514 arc_buf_hdr_t *nhdr; 03515 arc_buf_t **bufp; 03516 uint64_t blksz = hdr->b_size; 03517 uint64_t spa = hdr->b_spa; 03518 arc_buf_contents_t type = hdr->b_type; 03519 uint32_t flags = hdr->b_flags; 03520 03521 ASSERT(hdr->b_buf != buf || buf->b_next != NULL); 03522 /* 03523 * Pull the data off of this hdr and attach it to 03524 * a new anonymous hdr.
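 * The remaining buffers keep sharing the old header (and its on-disk
 * identity), while the released buffer is attached to a fresh
 * anonymous header below so the caller can modify and rewrite it
 * independently.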
03525 */ 03526 (void) remove_reference(hdr, hash_lock, tag); 03527 bufp = &hdr->b_buf; 03528 while (*bufp != buf) 03529 bufp = &(*bufp)->b_next; 03530 *bufp = buf->b_next; 03531 buf->b_next = NULL; 03532 03533 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 03534 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 03535 if (refcount_is_zero(&hdr->b_refcnt)) { 03536 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 03537 ASSERT3U(*size, >=, hdr->b_size); 03538 atomic_add_64(size, -hdr->b_size); 03539 } 03540 03541 /* 03542 * We're releasing a duplicate user data buffer, update 03543 * our statistics accordingly. 03544 */ 03545 if (hdr->b_type == ARC_BUFC_DATA) { 03546 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 03547 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 03548 -hdr->b_size); 03549 } 03550 hdr->b_datacnt -= 1; 03551 arc_cksum_verify(buf); 03552 #ifdef illumos 03553 arc_buf_unwatch(buf); 03554 #endif /* illumos */ 03555 03556 mutex_exit(hash_lock); 03557 03558 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 03559 nhdr->b_size = blksz; 03560 nhdr->b_spa = spa; 03561 nhdr->b_type = type; 03562 nhdr->b_buf = buf; 03563 nhdr->b_state = arc_anon; 03564 nhdr->b_arc_access = 0; 03565 nhdr->b_flags = flags & ARC_L2_WRITING; 03566 nhdr->b_l2hdr = NULL; 03567 nhdr->b_datacnt = 1; 03568 nhdr->b_freeze_cksum = NULL; 03569 (void) refcount_add(&nhdr->b_refcnt, tag); 03570 buf->b_hdr = nhdr; 03571 mutex_exit(&buf->b_evict_lock); 03572 atomic_add_64(&arc_anon->arcs_size, blksz); 03573 } else { 03574 mutex_exit(&buf->b_evict_lock); 03575 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 03576 ASSERT(!list_link_active(&hdr->b_arc_node)); 03577 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 03578 if (hdr->b_state != arc_anon) 03579 arc_change_state(arc_anon, hdr, hash_lock); 03580 hdr->b_arc_access = 0; 03581 if (hash_lock) 03582 mutex_exit(hash_lock); 03583 03584 buf_discard_identity(hdr); 03585 arc_buf_thaw(buf); 03586 } 03587 buf->b_efunc = NULL; 03588 buf->b_private = NULL; 03589 03590 if (l2hdr) { 03591 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 03592 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 03593 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 03594 mutex_exit(&l2arc_buflist_mtx); 03595 } 03596 } 03597 03602 /* ARGSUSED */ 03603 int 03604 arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, 03605 zbookmark_t *zb) 03606 { 03607 arc_release(buf, tag); 03608 return (0); 03609 } 03610 03611 int 03612 arc_released(arc_buf_t *buf) 03613 { 03614 int released; 03615 03616 mutex_enter(&buf->b_evict_lock); 03617 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 03618 mutex_exit(&buf->b_evict_lock); 03619 return (released); 03620 } 03621 03622 int 03623 arc_has_callback(arc_buf_t *buf) 03624 { 03625 int callback; 03626 03627 mutex_enter(&buf->b_evict_lock); 03628 callback = (buf->b_efunc != NULL); 03629 mutex_exit(&buf->b_evict_lock); 03630 return (callback); 03631 } 03632 03633 #ifdef ZFS_DEBUG 03634 int 03635 arc_referenced(arc_buf_t *buf) 03636 { 03637 int referenced; 03638 03639 mutex_enter(&buf->b_evict_lock); 03640 referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 03641 mutex_exit(&buf->b_evict_lock); 03642 return (referenced); 03643 } 03644 #endif 03645 03646 static void 03647 arc_write_ready(zio_t *zio) 03648 { 03649 arc_write_callback_t *callback = zio->io_private; 03650 arc_buf_t *buf = callback->awcb_buf; 03651 arc_buf_hdr_t *hdr = buf->b_hdr; 03652 03653 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 03654 callback->awcb_ready(zio, buf, 
callback->awcb_private); 03655 03656 /* 03657 * If the IO is already in progress, then this is a re-write 03658 * attempt, so we need to thaw and re-compute the cksum. 03659 * It is the responsibility of the callback to handle the 03660 * accounting for any re-write attempt. 03661 */ 03662 if (HDR_IO_IN_PROGRESS(hdr)) { 03663 mutex_enter(&hdr->b_freeze_lock); 03664 if (hdr->b_freeze_cksum != NULL) { 03665 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 03666 hdr->b_freeze_cksum = NULL; 03667 } 03668 mutex_exit(&hdr->b_freeze_lock); 03669 } 03670 arc_cksum_compute(buf, B_FALSE); 03671 hdr->b_flags |= ARC_IO_IN_PROGRESS; 03672 } 03673 03674 static void 03675 arc_write_done(zio_t *zio) 03676 { 03677 arc_write_callback_t *callback = zio->io_private; 03678 arc_buf_t *buf = callback->awcb_buf; 03679 arc_buf_hdr_t *hdr = buf->b_hdr; 03680 03681 ASSERT(hdr->b_acb == NULL); 03682 03683 if (zio->io_error == 0) { 03684 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 03685 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 03686 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 03687 } else { 03688 ASSERT(BUF_EMPTY(hdr)); 03689 } 03690 03691 /* 03692 * If the block to be written was all-zero, we may have 03693 * compressed it away. In this case no write was performed 03694 * so there will be no dva/birth/checksum. The buffer must 03695 * therefore remain anonymous (and uncached). 03696 */ 03697 if (!BUF_EMPTY(hdr)) { 03698 arc_buf_hdr_t *exists; 03699 kmutex_t *hash_lock; 03700 03701 ASSERT(zio->io_error == 0); 03702 03703 arc_cksum_verify(buf); 03704 03705 exists = buf_hash_insert(hdr, &hash_lock); 03706 if (exists) { 03707 /* 03708 * This can only happen if we overwrite for 03709 * sync-to-convergence, because we remove 03710 * buffers from the hash table when we arc_free(). 
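 * Two legitimate collisions are handled below: a rewrite of the same
 * block pointer (sync-to-convergence), where the stale header is
 * evicted and the insert retried, and a dedup write, where the block
 * already exists and the new header is not inserted at all and simply
 * remains anonymous.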
03711 */ 03712 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 03713 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 03714 panic("bad overwrite, hdr=%p exists=%p", 03715 (void *)hdr, (void *)exists); 03716 ASSERT(refcount_is_zero(&exists->b_refcnt)); 03717 arc_change_state(arc_anon, exists, hash_lock); 03718 mutex_exit(hash_lock); 03719 arc_hdr_destroy(exists); 03720 exists = buf_hash_insert(hdr, &hash_lock); 03721 ASSERT3P(exists, ==, NULL); 03722 } else { 03723 /* Dedup */ 03724 ASSERT(hdr->b_datacnt == 1); 03725 ASSERT(hdr->b_state == arc_anon); 03726 ASSERT(BP_GET_DEDUP(zio->io_bp)); 03727 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 03728 } 03729 } 03730 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 03731 /* if it's not anon, we are doing a scrub */ 03732 if (!exists && hdr->b_state == arc_anon) 03733 arc_access(hdr, hash_lock); 03734 mutex_exit(hash_lock); 03735 } else { 03736 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 03737 } 03738 03739 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 03740 callback->awcb_done(zio, buf, callback->awcb_private); 03741 03742 kmem_free(callback, sizeof (arc_write_callback_t)); 03743 } 03744 03745 zio_t * 03746 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 03747 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, 03748 arc_done_func_t *ready, arc_done_func_t *done, void *private, 03749 int priority, int zio_flags, const zbookmark_t *zb) 03750 { 03751 arc_buf_hdr_t *hdr = buf->b_hdr; 03752 arc_write_callback_t *callback; 03753 zio_t *zio; 03754 03755 ASSERT(ready != NULL); 03756 ASSERT(done != NULL); 03757 ASSERT(!HDR_IO_ERROR(hdr)); 03758 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 03759 ASSERT(hdr->b_acb == NULL); 03760 if (l2arc) 03761 hdr->b_flags |= ARC_L2CACHE; 03762 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 03763 callback->awcb_ready = ready; 03764 callback->awcb_done = done; 03765 callback->awcb_private = private; 03766 callback->awcb_buf = buf; 03767 03768 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 03769 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); 03770 03771 return (zio); 03772 } 03773 03774 static int 03775 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) 03776 { 03777 #ifdef _KERNEL 03778 uint64_t available_memory = 03779 ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count); 03780 static uint64_t page_load = 0; 03781 static uint64_t last_txg = 0; 03782 03783 #ifdef sun 03784 #if defined(__i386) 03785 available_memory = 03786 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 03787 #endif 03788 #endif /* sun */ 03789 if (available_memory >= zfs_write_limit_max) 03790 return (0); 03791 03792 if (txg > last_txg) { 03793 last_txg = txg; 03794 page_load = 0; 03795 } 03796 /* 03797 * If we are in pageout, we know that memory is already tight, 03798 * the arc is already going to be evicting, so we just want to 03799 * continue to let page writes occur as quickly as possible. 
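 * The static page_load counter below tracks how much the pagedaemon
 * has pushed through since the last txg; the reserve is deflated by 8
 * because callers pass it in inflated.  As an illustration, a 1 MB
 * reserve adds 128 KB to page_load, and once page_load exceeds a
 * quarter of available_memory the caller is asked to back off with
 * ERESTART.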
03800 */ 03801 if (curproc == pageproc) { 03802 if (page_load > available_memory / 4) 03803 return (ERESTART); 03804 /* Note: reserve is inflated, so we deflate */ 03805 page_load += reserve / 8; 03806 return (0); 03807 } else if (page_load > 0 && arc_reclaim_needed()) { 03808 /* memory is low, delay before restarting */ 03809 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 03810 return (EAGAIN); 03811 } 03812 page_load = 0; 03813 03814 if (arc_size > arc_c_min) { 03815 uint64_t evictable_memory = 03816 arc_mru->arcs_lsize[ARC_BUFC_DATA] + 03817 arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 03818 arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 03819 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 03820 available_memory += MIN(evictable_memory, arc_size - arc_c_min); 03821 } 03822 03823 if (inflight_data > available_memory / 4) { 03824 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 03825 return (ERESTART); 03826 } 03827 #endif 03828 return (0); 03829 } 03830 03831 void 03832 arc_tempreserve_clear(uint64_t reserve) 03833 { 03834 atomic_add_64(&arc_tempreserve, -reserve); 03835 ASSERT((int64_t)arc_tempreserve >= 0); 03836 } 03837 03838 int 03839 arc_tempreserve_space(uint64_t reserve, uint64_t txg) 03840 { 03841 int error; 03842 uint64_t anon_size; 03843 03844 #ifdef ZFS_DEBUG 03845 /* 03846 * Once in a while, fail for no reason. Everything should cope. 03847 */ 03848 if (spa_get_random(10000) == 0) { 03849 dprintf("forcing random failure\n"); 03850 return (ERESTART); 03851 } 03852 #endif 03853 if (reserve > arc_c/4 && !arc_no_grow) 03854 arc_c = MIN(arc_c_max, reserve * 4); 03855 if (reserve > arc_c) 03856 return (ENOMEM); 03857 03858 /* 03859 * Don't count loaned bufs as in flight dirty data to prevent long 03860 * network delays from blocking transactions that are ready to be 03861 * assigned to a txg. 03862 */ 03863 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 03864 03865 /* 03866 * Writes will, almost always, require additional memory allocations 03867 * in order to compress/encrypt/etc the data. We therefore need to 03868 * make sure that there is sufficient available memory for this. 03869 */ 03870 if (error = arc_memory_throttle(reserve, anon_size, txg)) 03871 return (error); 03872 03873 /* 03874 * Throttle writes when the amount of dirty data in the cache 03875 * gets too large. We try to keep the cache less than half full 03876 * of dirty blocks so that our sync times don't grow too large. 03877 * Note: if two requests come in concurrently, we might let them 03878 * both succeed, when one of them should fail. Not a huge deal. 03879 */ 03880 03881 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 03882 anon_size > arc_c / 4) { 03883 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 03884 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 03885 arc_tempreserve>>10, 03886 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 03887 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 03888 reserve>>10, arc_c>>10); 03889 return (ERESTART); 03890 } 03891 atomic_add_64(&arc_tempreserve, reserve); 03892 return (0); 03893 } 03894 03895 static kmutex_t arc_lowmem_lock; 03896 #ifdef _KERNEL 03897 static eventhandler_tag arc_event_lowmem = NULL; 03898 03899 static void 03900 arc_lowmem(void *arg __unused, int howto __unused) 03901 { 03902 03903 /* Serialize access via arc_lowmem_lock. 
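 * Signal the reclaim thread and, when called from the pagedaemon,
 * wait below until it has cleared needfree, i.e. until a reclaim pass
 * has completed.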
*/ 03904 mutex_enter(&arc_lowmem_lock); 03905 mutex_enter(&arc_reclaim_thr_lock); 03906 needfree = 1; 03907 cv_signal(&arc_reclaim_thr_cv); 03908 03909 /* 03910 * It is unsafe to block here in arbitrary threads, because we can come 03911 * here from ARC itself and may hold ARC locks and thus risk a deadlock 03912 * with ARC reclaim thread. 03913 */ 03914 if (curproc == pageproc) { 03915 while (needfree) 03916 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 03917 } 03918 mutex_exit(&arc_reclaim_thr_lock); 03919 mutex_exit(&arc_lowmem_lock); 03920 } 03921 #endif 03922 03923 void 03924 arc_init(void) 03925 { 03926 int i, prefetch_tunable_set = 0; 03927 03928 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 03929 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 03930 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 03931 03932 /* Convert seconds to clock ticks */ 03933 arc_min_prefetch_lifespan = 1 * hz; 03934 03935 /* Start out with 1/8 of all memory */ 03936 arc_c = kmem_size() / 8; 03937 03938 #ifdef sun 03939 #ifdef _KERNEL 03940 /* 03941 * On architectures where the physical memory can be larger 03942 * than the addressable space (intel in 32-bit mode), we may 03943 * need to limit the cache to 1/8 of VM size. 03944 */ 03945 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 03946 #endif 03947 #endif /* sun */ 03948 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 03949 arc_c_min = MAX(arc_c / 4, 64<<18); 03950 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 03951 if (arc_c * 8 >= 1<<30) 03952 arc_c_max = (arc_c * 8) - (1<<30); 03953 else 03954 arc_c_max = arc_c_min; 03955 arc_c_max = MAX(arc_c * 5, arc_c_max); 03956 03957 #ifdef _KERNEL 03958 /* 03959 * Allow the tunables to override our calculations if they are 03960 * reasonable (ie. 
over 16MB) 03961 */ 03962 if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size()) 03963 arc_c_max = zfs_arc_max; 03964 if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max) 03965 arc_c_min = zfs_arc_min; 03966 #endif 03967 03968 arc_c = arc_c_max; 03969 arc_p = (arc_c >> 1); 03970 03971 /* limit meta-data to 1/4 of the arc capacity */ 03972 arc_meta_limit = arc_c_max / 4; 03973 03974 /* Allow the tunable to override if it is reasonable */ 03975 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 03976 arc_meta_limit = zfs_arc_meta_limit; 03977 03978 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 03979 arc_c_min = arc_meta_limit / 2; 03980 03981 if (zfs_arc_grow_retry > 0) 03982 arc_grow_retry = zfs_arc_grow_retry; 03983 03984 if (zfs_arc_shrink_shift > 0) 03985 arc_shrink_shift = zfs_arc_shrink_shift; 03986 03987 if (zfs_arc_p_min_shift > 0) 03988 arc_p_min_shift = zfs_arc_p_min_shift; 03989 03990 /* if kmem_flags are set, lets try to use less memory */ 03991 if (kmem_debugging()) 03992 arc_c = arc_c / 2; 03993 if (arc_c < arc_c_min) 03994 arc_c = arc_c_min; 03995 03996 zfs_arc_min = arc_c_min; 03997 zfs_arc_max = arc_c_max; 03998 03999 arc_anon = &ARC_anon; 04000 arc_mru = &ARC_mru; 04001 arc_mru_ghost = &ARC_mru_ghost; 04002 arc_mfu = &ARC_mfu; 04003 arc_mfu_ghost = &ARC_mfu_ghost; 04004 arc_l2c_only = &ARC_l2c_only; 04005 arc_size = 0; 04006 04007 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 04008 mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 04009 NULL, MUTEX_DEFAULT, NULL); 04010 mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 04011 NULL, MUTEX_DEFAULT, NULL); 04012 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 04013 NULL, MUTEX_DEFAULT, NULL); 04014 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 04015 NULL, MUTEX_DEFAULT, NULL); 04016 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 04017 NULL, MUTEX_DEFAULT, NULL); 04018 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 04019 NULL, MUTEX_DEFAULT, NULL); 04020 04021 list_create(&arc_mru->arcs_lists[i], 04022 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 04023 list_create(&arc_mru_ghost->arcs_lists[i], 04024 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 04025 list_create(&arc_mfu->arcs_lists[i], 04026 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 04027 list_create(&arc_mfu_ghost->arcs_lists[i], 04028 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 04029 list_create(&arc_mfu_ghost->arcs_lists[i], 04030 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 04031 list_create(&arc_l2c_only->arcs_lists[i], 04032 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 04033 } 04034 04035 buf_init(); 04036 04037 arc_thread_exit = 0; 04038 arc_eviction_list = NULL; 04039 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 04040 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 04041 04042 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 04043 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 04044 04045 if (arc_ksp != NULL) { 04046 arc_ksp->ks_data = &arc_stats; 04047 kstat_install(arc_ksp); 04048 } 04049 04050 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 04051 TS_RUN, minclsyspri); 04052 04053 #ifdef _KERNEL 04054 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 04055 EVENTHANDLER_PRI_FIRST); 04056 #endif 04057 04058 arc_dead = FALSE; 04059 arc_warm = B_FALSE; 04060 04061 if (zfs_write_limit_max == 0) 04062 zfs_write_limit_max = ptob(physmem) >> 
zfs_write_limit_shift; 04063 else 04064 zfs_write_limit_shift = 0; 04065 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); 04066 04067 #ifdef _KERNEL 04068 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 04069 prefetch_tunable_set = 1; 04070 04071 #ifdef __i386__ 04072 if (prefetch_tunable_set == 0) { 04073 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 04074 "-- to enable,\n"); 04075 printf(" add \"vfs.zfs.prefetch_disable=0\" " 04076 "to /boot/loader.conf.\n"); 04077 zfs_prefetch_disable = 1; 04078 } 04079 #else 04080 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 04081 prefetch_tunable_set == 0) { 04082 printf("ZFS NOTICE: Prefetch is disabled by default if less " 04083 "than 4GB of RAM is present;\n" 04084 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 04085 "to /boot/loader.conf.\n"); 04086 zfs_prefetch_disable = 1; 04087 } 04088 #endif 04089 /* Warn about ZFS memory and address space requirements. */ 04090 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 04091 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 04092 "expect unstable behavior.\n"); 04093 } 04094 if (kmem_size() < 512 * (1 << 20)) { 04095 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 04096 "expect unstable behavior.\n"); 04097 printf(" Consider tuning vm.kmem_size and " 04098 "vm.kmem_size_max\n"); 04099 printf(" in /boot/loader.conf.\n"); 04100 } 04101 #endif 04102 } 04103 04104 void 04105 arc_fini(void) 04106 { 04107 int i; 04108 04109 mutex_enter(&arc_reclaim_thr_lock); 04110 arc_thread_exit = 1; 04111 cv_signal(&arc_reclaim_thr_cv); 04112 while (arc_thread_exit != 0) 04113 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 04114 mutex_exit(&arc_reclaim_thr_lock); 04115 04116 arc_flush(NULL); 04117 04118 arc_dead = TRUE; 04119 04120 if (arc_ksp != NULL) { 04121 kstat_delete(arc_ksp); 04122 arc_ksp = NULL; 04123 } 04124 04125 mutex_destroy(&arc_eviction_mtx); 04126 mutex_destroy(&arc_reclaim_thr_lock); 04127 cv_destroy(&arc_reclaim_thr_cv); 04128 04129 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 04130 list_destroy(&arc_mru->arcs_lists[i]); 04131 list_destroy(&arc_mru_ghost->arcs_lists[i]); 04132 list_destroy(&arc_mfu->arcs_lists[i]); 04133 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 04134 list_destroy(&arc_l2c_only->arcs_lists[i]); 04135 04136 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 04137 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 04138 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 04139 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 04140 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 04141 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 04142 } 04143 04144 mutex_destroy(&zfs_write_limit_lock); 04145 04146 buf_fini(); 04147 04148 ASSERT(arc_loaned_bytes == 0); 04149 04150 mutex_destroy(&arc_lowmem_lock); 04151 #ifdef _KERNEL 04152 if (arc_event_lowmem != NULL) 04153 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 04154 #endif 04155 } 04156 04288 static boolean_t 04289 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) 04290 { 04291 /* 04292 * A buffer is *not* eligible for the L2ARC if it: 04293 * 1. belongs to a different spa. 04294 * 2. is already cached on the L2ARC. 04295 * 3. has an I/O in progress (it may be an incomplete read). 04296 * 4. is flagged not eligible (zfs property). 
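 *
 * Condition 4 is driven by the per-dataset "secondarycache" property,
 * which the upper layers record on the header as the L2CACHE flag.
 * As an illustrative example (the dataset names are made up; the
 * property and its values come from zfs(8), not from this file):
 *
 *	zfs set secondarycache=none     pool/scratch   # never L2-cache
 *	zfs set secondarycache=metadata pool/db        # metadata only
 *	zfs set secondarycache=all      pool/home      # the default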
04297 */ 04298 if (ab->b_spa != spa_guid) { 04299 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 04300 return (B_FALSE); 04301 } 04302 if (ab->b_l2hdr != NULL) { 04303 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 04304 return (B_FALSE); 04305 } 04306 if (HDR_IO_IN_PROGRESS(ab)) { 04307 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 04308 return (B_FALSE); 04309 } 04310 if (!HDR_L2CACHE(ab)) { 04311 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 04312 return (B_FALSE); 04313 } 04314 04315 return (B_TRUE); 04316 } 04317 04318 static uint64_t 04319 l2arc_write_size(l2arc_dev_t *dev) 04320 { 04321 uint64_t size; 04322 04323 size = dev->l2ad_write; 04324 04325 if (arc_warm == B_FALSE) 04326 size += dev->l2ad_boost; 04327 04328 return (size); 04329 04330 } 04331 04332 static clock_t 04333 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 04334 { 04335 clock_t interval, next, now; 04336 04337 /* 04338 * If the ARC lists are busy, increase our write rate; if the 04339 * lists are stale, idle back. This is achieved by checking 04340 * how much we previously wrote - if it was more than half of 04341 * what we wanted, schedule the next write much sooner. 04342 */ 04343 if (l2arc_feed_again && wrote > (wanted / 2)) 04344 interval = (hz * l2arc_feed_min_ms) / 1000; 04345 else 04346 interval = hz * l2arc_feed_secs; 04347 04348 now = ddi_get_lbolt(); 04349 next = MAX(now, MIN(now + interval, began + interval)); 04350 04351 return (next); 04352 } 04353 04354 static void 04355 l2arc_hdr_stat_add(void) 04356 { 04357 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 04358 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 04359 } 04360 04361 static void 04362 l2arc_hdr_stat_remove(void) 04363 { 04364 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 04365 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 04366 } 04367 04372 static l2arc_dev_t * 04373 l2arc_dev_get_next(void) 04374 { 04375 l2arc_dev_t *first, *next = NULL; 04376 04377 /* 04378 * Lock out the removal of spas (spa_namespace_lock), then removal 04379 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 04380 * both locks will be dropped and a spa config lock held instead. 04381 */ 04382 mutex_enter(&spa_namespace_lock); 04383 mutex_enter(&l2arc_dev_mtx); 04384 04385 /* if there are no vdevs, there is nothing to do */ 04386 if (l2arc_ndev == 0) 04387 goto out; 04388 04389 first = NULL; 04390 next = l2arc_dev_last; 04391 do { 04392 /* loop around the list looking for a non-faulted vdev */ 04393 if (next == NULL) { 04394 next = list_head(l2arc_dev_list); 04395 } else { 04396 next = list_next(l2arc_dev_list, next); 04397 if (next == NULL) 04398 next = list_head(l2arc_dev_list); 04399 } 04400 04401 /* if we have come back to the start, bail out */ 04402 if (first == NULL) 04403 first = next; 04404 else if (next == first) 04405 break; 04406 04407 } while (vdev_is_dead(next->l2ad_vdev)); 04408 04409 /* if we were unable to find any usable vdevs, return NULL */ 04410 if (vdev_is_dead(next->l2ad_vdev)) 04411 next = NULL; 04412 04413 l2arc_dev_last = next; 04414 04415 out: 04416 mutex_exit(&l2arc_dev_mtx); 04417 04418 /* 04419 * Grab the config lock to prevent the 'next' device from being 04420 * removed while we are writing to it. 
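 *
 * Note that the SCL_L2ARC lock is taken as a reader and is *not*
 * dropped by this function: the caller, l2arc_feed_thread(), releases
 * it with spa_config_exit() once it has finished evicting and writing
 * on this device, so removal of the cache vdev cannot race with an
 * in-flight l2arc_write_buffers().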
04421 */ 04422 if (next != NULL) 04423 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 04424 mutex_exit(&spa_namespace_lock); 04425 04426 return (next); 04427 } 04428 04432 static void 04433 l2arc_do_free_on_write() 04434 { 04435 list_t *buflist; 04436 l2arc_data_free_t *df, *df_prev; 04437 04438 mutex_enter(&l2arc_free_on_write_mtx); 04439 buflist = l2arc_free_on_write; 04440 04441 for (df = list_tail(buflist); df; df = df_prev) { 04442 df_prev = list_prev(buflist, df); 04443 ASSERT(df->l2df_data != NULL); 04444 ASSERT(df->l2df_func != NULL); 04445 df->l2df_func(df->l2df_data, df->l2df_size); 04446 list_remove(buflist, df); 04447 kmem_free(df, sizeof (l2arc_data_free_t)); 04448 } 04449 04450 mutex_exit(&l2arc_free_on_write_mtx); 04451 } 04452 04457 static void 04458 l2arc_write_done(zio_t *zio) 04459 { 04460 l2arc_write_callback_t *cb; 04461 l2arc_dev_t *dev; 04462 list_t *buflist; 04463 arc_buf_hdr_t *head, *ab, *ab_prev; 04464 l2arc_buf_hdr_t *abl2; 04465 kmutex_t *hash_lock; 04466 04467 cb = zio->io_private; 04468 ASSERT(cb != NULL); 04469 dev = cb->l2wcb_dev; 04470 ASSERT(dev != NULL); 04471 head = cb->l2wcb_head; 04472 ASSERT(head != NULL); 04473 buflist = dev->l2ad_buflist; 04474 ASSERT(buflist != NULL); 04475 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 04476 l2arc_write_callback_t *, cb); 04477 04478 if (zio->io_error != 0) 04479 ARCSTAT_BUMP(arcstat_l2_writes_error); 04480 04481 mutex_enter(&l2arc_buflist_mtx); 04482 04483 /* 04484 * All writes completed, or an error was hit. 04485 */ 04486 for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 04487 ab_prev = list_prev(buflist, ab); 04488 04489 hash_lock = HDR_LOCK(ab); 04490 if (!mutex_tryenter(hash_lock)) { 04491 /* 04492 * This buffer misses out. It may be in a stage 04493 * of eviction. Its ARC_L2_WRITING flag will be 04494 * left set, denying reads to this buffer. 04495 */ 04496 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 04497 continue; 04498 } 04499 04500 if (zio->io_error != 0) { 04501 /* 04502 * Error - drop L2ARC entry. 04503 */ 04504 list_remove(buflist, ab); 04505 abl2 = ab->b_l2hdr; 04506 ab->b_l2hdr = NULL; 04507 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 04508 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 04509 } 04510 04511 /* 04512 * Allow ARC to begin reads to this L2ARC entry. 04513 */ 04514 ab->b_flags &= ~ARC_L2_WRITING; 04515 04516 mutex_exit(hash_lock); 04517 } 04518 04519 atomic_inc_64(&l2arc_writes_done); 04520 list_remove(buflist, head); 04521 kmem_cache_free(hdr_cache, head); 04522 mutex_exit(&l2arc_buflist_mtx); 04523 04524 l2arc_do_free_on_write(); 04525 04526 kmem_free(cb, sizeof (l2arc_write_callback_t)); 04527 } 04528 04533 static void 04534 l2arc_read_done(zio_t *zio) 04535 { 04536 l2arc_read_callback_t *cb; 04537 arc_buf_hdr_t *hdr; 04538 arc_buf_t *buf; 04539 kmutex_t *hash_lock; 04540 int equal; 04541 04542 ASSERT(zio->io_vd != NULL); 04543 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 04544 04545 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 04546 04547 cb = zio->io_private; 04548 ASSERT(cb != NULL); 04549 buf = cb->l2rcb_buf; 04550 ASSERT(buf != NULL); 04551 04552 hash_lock = HDR_LOCK(buf->b_hdr); 04553 mutex_enter(hash_lock); 04554 hdr = buf->b_hdr; 04555 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 04556 04557 /* 04558 * Check this survived the L2ARC journey. 
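 *
 * "Survived" means three things must still hold: the checksum stashed
 * in the header matches the data just read back from the cache device
 * (arc_cksum_equal()), the zio completed without error, and the header
 * was not flagged L2_EVICTED while the read was in flight.  If any of
 * these fail, the read is transparently reissued to the primary pool
 * devices below.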
04559 */ 04560 equal = arc_cksum_equal(buf); 04561 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 04562 mutex_exit(hash_lock); 04563 zio->io_private = buf; 04564 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 04565 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 04566 arc_read_done(zio); 04567 } else { 04568 mutex_exit(hash_lock); 04569 /* 04570 * Buffer didn't survive caching. Increment stats and 04571 * reissue to the original storage device. 04572 */ 04573 if (zio->io_error != 0) { 04574 ARCSTAT_BUMP(arcstat_l2_io_error); 04575 } else { 04576 zio->io_error = EIO; 04577 } 04578 if (!equal) 04579 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 04580 04581 /* 04582 * If there's no waiter, issue an async i/o to the primary 04583 * storage now. If there *is* a waiter, the caller must 04584 * issue the i/o in a context where it's OK to block. 04585 */ 04586 if (zio->io_waiter == NULL) { 04587 zio_t *pio = zio_unique_parent(zio); 04588 04589 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 04590 04591 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 04592 buf->b_data, zio->io_size, arc_read_done, buf, 04593 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 04594 } 04595 } 04596 04597 kmem_free(cb, sizeof (l2arc_read_callback_t)); 04598 } 04599 04610 static list_t * 04611 l2arc_list_locked(int list_num, kmutex_t **lock) 04612 { 04613 list_t *list; 04614 int idx; 04615 04616 ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); 04617 04618 if (list_num < ARC_BUFC_NUMMETADATALISTS) { 04619 idx = list_num; 04620 list = &arc_mfu->arcs_lists[idx]; 04621 *lock = ARCS_LOCK(arc_mfu, idx); 04622 } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { 04623 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 04624 list = &arc_mru->arcs_lists[idx]; 04625 *lock = ARCS_LOCK(arc_mru, idx); 04626 } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + 04627 ARC_BUFC_NUMDATALISTS)) { 04628 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 04629 list = &arc_mfu->arcs_lists[idx]; 04630 *lock = ARCS_LOCK(arc_mfu, idx); 04631 } else { 04632 idx = list_num - ARC_BUFC_NUMLISTS; 04633 list = &arc_mru->arcs_lists[idx]; 04634 *lock = ARCS_LOCK(arc_mru, idx); 04635 } 04636 04637 ASSERT(!(MUTEX_HELD(*lock))); 04638 mutex_enter(*lock); 04639 return (list); 04640 } 04641 04648 static void 04649 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 04650 { 04651 list_t *buflist; 04652 l2arc_buf_hdr_t *abl2; 04653 arc_buf_hdr_t *ab, *ab_prev; 04654 kmutex_t *hash_lock; 04655 uint64_t taddr; 04656 04657 buflist = dev->l2ad_buflist; 04658 04659 if (buflist == NULL) 04660 return; 04661 04662 if (!all && dev->l2ad_first) { 04663 /* 04664 * This is the first sweep through the device. There is 04665 * nothing to evict. 04666 */ 04667 return; 04668 } 04669 04670 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 04671 /* 04672 * When nearing the end of the device, evict to the end 04673 * before the device write hand jumps to the start. 04674 */ 04675 taddr = dev->l2ad_end; 04676 } else { 04677 taddr = dev->l2ad_hand + distance; 04678 } 04679 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 04680 uint64_t, taddr, boolean_t, all); 04681 04682 top: 04683 mutex_enter(&l2arc_buflist_mtx); 04684 for (ab = list_tail(buflist); ab; ab = ab_prev) { 04685 ab_prev = list_prev(buflist, ab); 04686 04687 hash_lock = HDR_LOCK(ab); 04688 if (!mutex_tryenter(hash_lock)) { 04689 /* 04690 * Missed the hash lock. Retry. 
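 *
 * The retry is deliberately heavy-handed: drop the global buflist
 * lock, block briefly on the missed hash lock so its holder can
 * finish, release it again, and restart the scan from the tail.
 * Waiting on the hash lock while still holding l2arc_buflist_mtx
 * could deadlock against a thread taking the locks in the opposite
 * order.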
04691 */ 04692 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 04693 mutex_exit(&l2arc_buflist_mtx); 04694 mutex_enter(hash_lock); 04695 mutex_exit(hash_lock); 04696 goto top; 04697 } 04698 04699 if (HDR_L2_WRITE_HEAD(ab)) { 04700 /* 04701 * We hit a write head node. Leave it for 04702 * l2arc_write_done(). 04703 */ 04704 list_remove(buflist, ab); 04705 mutex_exit(hash_lock); 04706 continue; 04707 } 04708 04709 if (!all && ab->b_l2hdr != NULL && 04710 (ab->b_l2hdr->b_daddr > taddr || 04711 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 04712 /* 04713 * We've evicted to the target address, 04714 * or the end of the device. 04715 */ 04716 mutex_exit(hash_lock); 04717 break; 04718 } 04719 04720 if (HDR_FREE_IN_PROGRESS(ab)) { 04721 /* 04722 * Already on the path to destruction. 04723 */ 04724 mutex_exit(hash_lock); 04725 continue; 04726 } 04727 04728 if (ab->b_state == arc_l2c_only) { 04729 ASSERT(!HDR_L2_READING(ab)); 04730 /* 04731 * This doesn't exist in the ARC. Destroy. 04732 * arc_hdr_destroy() will call list_remove() 04733 * and decrement arcstat_l2_size. 04734 */ 04735 arc_change_state(arc_anon, ab, hash_lock); 04736 arc_hdr_destroy(ab); 04737 } else { 04738 /* 04739 * Invalidate issued or about to be issued 04740 * reads, since we may be about to write 04741 * over this location. 04742 */ 04743 if (HDR_L2_READING(ab)) { 04744 ARCSTAT_BUMP(arcstat_l2_evict_reading); 04745 ab->b_flags |= ARC_L2_EVICTED; 04746 } 04747 04748 /* 04749 * Tell ARC this no longer exists in L2ARC. 04750 */ 04751 if (ab->b_l2hdr != NULL) { 04752 abl2 = ab->b_l2hdr; 04753 ab->b_l2hdr = NULL; 04754 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 04755 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 04756 } 04757 list_remove(buflist, ab); 04758 04759 /* 04760 * This may have been leftover after a 04761 * failed write. 04762 */ 04763 ab->b_flags &= ~ARC_L2_WRITING; 04764 } 04765 mutex_exit(hash_lock); 04766 } 04767 mutex_exit(&l2arc_buflist_mtx); 04768 04769 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); 04770 dev->l2ad_evict = taddr; 04771 } 04772 04779 static uint64_t 04780 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) 04781 { 04782 arc_buf_hdr_t *ab, *ab_prev, *head; 04783 l2arc_buf_hdr_t *hdrl2; 04784 list_t *list; 04785 uint64_t passed_sz, write_sz, buf_sz, headroom; 04786 void *buf_data; 04787 kmutex_t *hash_lock, *list_lock; 04788 boolean_t have_lock, full; 04789 l2arc_write_callback_t *cb; 04790 zio_t *pio, *wzio; 04791 uint64_t guid = spa_load_guid(spa); 04792 int try; 04793 04794 ASSERT(dev->l2ad_vdev != NULL); 04795 04796 pio = NULL; 04797 write_sz = 0; 04798 full = B_FALSE; 04799 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 04800 head->b_flags |= ARC_L2_WRITE_HEAD; 04801 04802 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 04803 /* 04804 * Copy buffers for L2ARC writing. 04805 */ 04806 mutex_enter(&l2arc_buflist_mtx); 04807 for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { 04808 list = l2arc_list_locked(try, &list_lock); 04809 passed_sz = 0; 04810 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 04811 04812 /* 04813 * L2ARC fast warmup. 04814 * 04815 * Until the ARC is warm and starts to evict, read from the 04816 * head of the ARC lists rather than the tail. 
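 *
 * The headroom computed below bounds how far down each list the scan
 * will walk before giving up.  As a rough illustration (assuming a
 * write target of 8 MB and the historical default l2arc_headroom of
 * 2), roughly 16 MB worth of buffers are passed over per list before
 * the loop moves on, unless the write target is reached first.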
04817 */ 04818 headroom = target_sz * l2arc_headroom; 04819 if (arc_warm == B_FALSE) 04820 ab = list_head(list); 04821 else 04822 ab = list_tail(list); 04823 if (ab == NULL) 04824 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 04825 04826 for (; ab; ab = ab_prev) { 04827 if (arc_warm == B_FALSE) 04828 ab_prev = list_next(list, ab); 04829 else 04830 ab_prev = list_prev(list, ab); 04831 ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size); 04832 04833 hash_lock = HDR_LOCK(ab); 04834 have_lock = MUTEX_HELD(hash_lock); 04835 if (!have_lock && !mutex_tryenter(hash_lock)) { 04836 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 04837 /* 04838 * Skip this buffer rather than waiting. 04839 */ 04840 continue; 04841 } 04842 04843 passed_sz += ab->b_size; 04844 if (passed_sz > headroom) { 04845 /* 04846 * Searched too far. 04847 */ 04848 mutex_exit(hash_lock); 04849 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 04850 break; 04851 } 04852 04853 if (!l2arc_write_eligible(guid, ab)) { 04854 mutex_exit(hash_lock); 04855 continue; 04856 } 04857 04858 if ((write_sz + ab->b_size) > target_sz) { 04859 full = B_TRUE; 04860 mutex_exit(hash_lock); 04861 ARCSTAT_BUMP(arcstat_l2_write_full); 04862 break; 04863 } 04864 04865 if (pio == NULL) { 04866 /* 04867 * Insert a dummy header on the buflist so 04868 * l2arc_write_done() can find where the 04869 * write buffers begin without searching. 04870 */ 04871 list_insert_head(dev->l2ad_buflist, head); 04872 04873 cb = kmem_alloc( 04874 sizeof (l2arc_write_callback_t), KM_SLEEP); 04875 cb->l2wcb_dev = dev; 04876 cb->l2wcb_head = head; 04877 pio = zio_root(spa, l2arc_write_done, cb, 04878 ZIO_FLAG_CANFAIL); 04879 ARCSTAT_BUMP(arcstat_l2_write_pios); 04880 } 04881 04882 /* 04883 * Create and add a new L2ARC header. 04884 */ 04885 hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 04886 hdrl2->b_dev = dev; 04887 hdrl2->b_daddr = dev->l2ad_hand; 04888 04889 ab->b_flags |= ARC_L2_WRITING; 04890 ab->b_l2hdr = hdrl2; 04891 list_insert_head(dev->l2ad_buflist, ab); 04892 buf_data = ab->b_buf->b_data; 04893 buf_sz = ab->b_size; 04894 04895 /* 04896 * Compute and store the buffer cksum before 04897 * writing. On debug the cksum is verified first. 04898 */ 04899 arc_cksum_verify(ab->b_buf); 04900 arc_cksum_compute(ab->b_buf, B_TRUE); 04901 04902 mutex_exit(hash_lock); 04903 04904 wzio = zio_write_phys(pio, dev->l2ad_vdev, 04905 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 04906 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 04907 ZIO_FLAG_CANFAIL, B_FALSE); 04908 04909 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 04910 zio_t *, wzio); 04911 (void) zio_nowait(wzio); 04912 04913 /* 04914 * Keep the clock hand suitably device-aligned. 04915 */ 04916 buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 04917 04918 write_sz += buf_sz; 04919 dev->l2ad_hand += buf_sz; 04920 } 04921 04922 mutex_exit(list_lock); 04923 04924 if (full == B_TRUE) 04925 break; 04926 } 04927 mutex_exit(&l2arc_buflist_mtx); 04928 04929 if (pio == NULL) { 04930 ASSERT0(write_sz); 04931 kmem_cache_free(hdr_cache, head); 04932 return (0); 04933 } 04934 04935 ASSERT3U(write_sz, <=, target_sz); 04936 ARCSTAT_BUMP(arcstat_l2_writes_sent); 04937 ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); 04938 ARCSTAT_INCR(arcstat_l2_size, write_sz); 04939 vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); 04940 04941 /* 04942 * Bump device hand to the device start if it is approaching the end. 04943 * l2arc_evict() will already have evicted ahead for this case. 
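 *
 * In other words the cache device is used as a circular log: once
 * fewer than target_sz bytes remain before l2ad_end, the space left
 * before the end is accounted for via vdev_space_update(), both the
 * write hand and the evict hand snap back to l2ad_start, and
 * l2ad_first is cleared so later passes know they must evict ahead of
 * the hand before overwriting older buffers.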
04944 */ 04945 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 04946 vdev_space_update(dev->l2ad_vdev, 04947 dev->l2ad_end - dev->l2ad_hand, 0, 0); 04948 dev->l2ad_hand = dev->l2ad_start; 04949 dev->l2ad_evict = dev->l2ad_start; 04950 dev->l2ad_first = B_FALSE; 04951 } 04952 04953 dev->l2ad_writing = B_TRUE; 04954 (void) zio_wait(pio); 04955 dev->l2ad_writing = B_FALSE; 04956 04957 return (write_sz); 04958 } 04959 04964 static void 04965 l2arc_feed_thread(void *dummy __unused) 04966 { 04967 callb_cpr_t cpr; 04968 l2arc_dev_t *dev; 04969 spa_t *spa; 04970 uint64_t size, wrote; 04971 clock_t begin, next = ddi_get_lbolt(); 04972 04973 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 04974 04975 mutex_enter(&l2arc_feed_thr_lock); 04976 04977 while (l2arc_thread_exit == 0) { 04978 CALLB_CPR_SAFE_BEGIN(&cpr); 04979 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 04980 next - ddi_get_lbolt()); 04981 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 04982 next = ddi_get_lbolt() + hz; 04983 04984 /* 04985 * Quick check for L2ARC devices. 04986 */ 04987 mutex_enter(&l2arc_dev_mtx); 04988 if (l2arc_ndev == 0) { 04989 mutex_exit(&l2arc_dev_mtx); 04990 continue; 04991 } 04992 mutex_exit(&l2arc_dev_mtx); 04993 begin = ddi_get_lbolt(); 04994 04995 /* 04996 * This selects the next l2arc device to write to, and in 04997 * doing so the next spa to feed from: dev->l2ad_spa. This 04998 * will return NULL if there are now no l2arc devices or if 04999 * they are all faulted. 05000 * 05001 * If a device is returned, its spa's config lock is also 05002 * held to prevent device removal. l2arc_dev_get_next() 05003 * will grab and release l2arc_dev_mtx. 05004 */ 05005 if ((dev = l2arc_dev_get_next()) == NULL) 05006 continue; 05007 05008 spa = dev->l2ad_spa; 05009 ASSERT(spa != NULL); 05010 05011 /* 05012 * If the pool is read-only then force the feed thread to 05013 * sleep a little longer. 05014 */ 05015 if (!spa_writeable(spa)) { 05016 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 05017 spa_config_exit(spa, SCL_L2ARC, dev); 05018 continue; 05019 } 05020 05021 /* 05022 * Avoid contributing to memory pressure. 05023 */ 05024 if (arc_reclaim_needed()) { 05025 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 05026 spa_config_exit(spa, SCL_L2ARC, dev); 05027 continue; 05028 } 05029 05030 ARCSTAT_BUMP(arcstat_l2_feeds); 05031 05032 size = l2arc_write_size(dev); 05033 05034 /* 05035 * Evict L2ARC buffers that will be overwritten. 05036 */ 05037 l2arc_evict(dev, size, B_FALSE); 05038 05039 /* 05040 * Write ARC buffers. 05041 */ 05042 wrote = l2arc_write_buffers(spa, dev, size); 05043 05044 /* 05045 * Calculate interval between writes. 05046 */ 05047 next = l2arc_write_interval(begin, size, wrote); 05048 spa_config_exit(spa, SCL_L2ARC, dev); 05049 } 05050 05051 l2arc_thread_exit = 0; 05052 cv_broadcast(&l2arc_feed_thr_cv); 05053 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 05054 thread_exit(); 05055 } 05056 05057 boolean_t 05058 l2arc_vdev_present(vdev_t *vd) 05059 { 05060 l2arc_dev_t *dev; 05061 05062 mutex_enter(&l2arc_dev_mtx); 05063 for (dev = list_head(l2arc_dev_list); dev != NULL; 05064 dev = list_next(l2arc_dev_list, dev)) { 05065 if (dev->l2ad_vdev == vd) 05066 break; 05067 } 05068 mutex_exit(&l2arc_dev_mtx); 05069 05070 return (dev != NULL); 05071 } 05072 05077 void 05078 l2arc_add_vdev(spa_t *spa, vdev_t *vd) 05079 { 05080 l2arc_dev_t *adddev; 05081 05082 ASSERT(!l2arc_vdev_present(vd)); 05083 05084 /* 05085 * Create a new l2arc device entry. 
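 *
 * This path is reached when a cache vdev is attached to a pool (for
 * example via "zpool add <pool> cache <device>").  The usable region
 * deliberately skips the vdev labels at the start of the device, and
 * the per-pass write size begins at l2arc_write_max, with
 * l2arc_write_boost added on top until the ARC is warm (see
 * l2arc_write_size()).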
05086 */ 05087 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 05088 adddev->l2ad_spa = spa; 05089 adddev->l2ad_vdev = vd; 05090 adddev->l2ad_write = l2arc_write_max; 05091 adddev->l2ad_boost = l2arc_write_boost; 05092 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 05093 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 05094 adddev->l2ad_hand = adddev->l2ad_start; 05095 adddev->l2ad_evict = adddev->l2ad_start; 05096 adddev->l2ad_first = B_TRUE; 05097 adddev->l2ad_writing = B_FALSE; 05098 ASSERT3U(adddev->l2ad_write, >, 0); 05099 05100 /* 05101 * This is a list of all ARC buffers that are still valid on the 05102 * device. 05103 */ 05104 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 05105 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 05106 offsetof(arc_buf_hdr_t, b_l2node)); 05107 05108 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 05109 05110 /* 05111 * Add device to global list 05112 */ 05113 mutex_enter(&l2arc_dev_mtx); 05114 list_insert_head(l2arc_dev_list, adddev); 05115 atomic_inc_64(&l2arc_ndev); 05116 mutex_exit(&l2arc_dev_mtx); 05117 } 05118 05122 void 05123 l2arc_remove_vdev(vdev_t *vd) 05124 { 05125 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 05126 05127 /* 05128 * Find the device by vdev 05129 */ 05130 mutex_enter(&l2arc_dev_mtx); 05131 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 05132 nextdev = list_next(l2arc_dev_list, dev); 05133 if (vd == dev->l2ad_vdev) { 05134 remdev = dev; 05135 break; 05136 } 05137 } 05138 ASSERT(remdev != NULL); 05139 05140 /* 05141 * Remove device from global list 05142 */ 05143 list_remove(l2arc_dev_list, remdev); 05144 l2arc_dev_last = NULL; /* may have been invalidated */ 05145 atomic_dec_64(&l2arc_ndev); 05146 mutex_exit(&l2arc_dev_mtx); 05147 05148 /* 05149 * Clear all buflists and ARC references. L2ARC device flush. 05150 */ 05151 l2arc_evict(remdev, 0, B_TRUE); 05152 list_destroy(remdev->l2ad_buflist); 05153 kmem_free(remdev->l2ad_buflist, sizeof (list_t)); 05154 kmem_free(remdev, sizeof (l2arc_dev_t)); 05155 } 05156 05157 void 05158 l2arc_init(void) 05159 { 05160 l2arc_thread_exit = 0; 05161 l2arc_ndev = 0; 05162 l2arc_writes_sent = 0; 05163 l2arc_writes_done = 0; 05164 05165 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 05166 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 05167 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 05168 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); 05169 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 05170 05171 l2arc_dev_list = &L2ARC_dev_list; 05172 l2arc_free_on_write = &L2ARC_free_on_write; 05173 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 05174 offsetof(l2arc_dev_t, l2ad_node)); 05175 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 05176 offsetof(l2arc_data_free_t, l2df_list_node)); 05177 } 05178 05179 void 05180 l2arc_fini(void) 05181 { 05182 /* 05183 * This is called from dmu_fini(), which is called from spa_fini(); 05184 * Because of this, we can assume that all l2arc devices have 05185 * already been removed when the pools themselves were removed. 
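 *
 * The one thing that may still be pending at this point is the
 * free-on-write list, which is why l2arc_do_free_on_write() is
 * drained below before the locks and lists are torn down.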
05186 */ 05187 05188 l2arc_do_free_on_write(); 05189 05190 mutex_destroy(&l2arc_feed_thr_lock); 05191 cv_destroy(&l2arc_feed_thr_cv); 05192 mutex_destroy(&l2arc_dev_mtx); 05193 mutex_destroy(&l2arc_buflist_mtx); 05194 mutex_destroy(&l2arc_free_on_write_mtx); 05195 05196 list_destroy(l2arc_dev_list); 05197 list_destroy(l2arc_free_on_write); 05198 } 05199 05200 void 05201 l2arc_start(void) 05202 { 05203 if (!(spa_mode_global & FWRITE)) 05204 return; 05205 05206 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 05207 TS_RUN, minclsyspri); 05208 } 05209 05210 void 05211 l2arc_stop(void) 05212 { 05213 if (!(spa_mode_global & FWRITE)) 05214 return; 05215 05216 mutex_enter(&l2arc_feed_thr_lock); 05217 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 05218 l2arc_thread_exit = 1; 05219 while (l2arc_thread_exit != 0) 05220 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 05221 mutex_exit(&l2arc_feed_thr_lock); 05222 }
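
/*
 * Usage note (illustrative, not part of the original source): the ARC
 * sizing knobs declared above are boot-time tunables, so they are
 * normally set from /boot/loader.conf rather than at run time.  A
 * minimal sketch, with byte values chosen purely as an example:
 *
 *	vfs.zfs.arc_max="4294967296"          # cap the ARC at 4 GB
 *	vfs.zfs.arc_min="1073741824"          # keep at least 1 GB
 *	vfs.zfs.arc_meta_limit="1073741824"   # cap ARC metadata at 1 GB
 *	vfs.zfs.prefetch_disable="0"          # force prefetch on
 *
 * arc_init() only honors these overrides when they look sane (for
 * instance arc_max must exceed 16 MB and fit within kmem_size()), so
 * out-of-range settings are silently ignored.
 */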