FreeBSD ZFS
The Zettabyte File System

arc.c

00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 /*
00022  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
00023  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
00024  * Copyright (c) 2011 by Delphix. All rights reserved.
00025  */
00026 
00125 #include <sys/spa.h>
00126 #include <sys/zio.h>
00127 #include <sys/zfs_context.h>
00128 #include <sys/arc.h>
00129 #include <sys/refcount.h>
00130 #include <sys/vdev.h>
00131 #include <sys/vdev_impl.h>
00132 #ifdef _KERNEL
00133 #include <sys/dnlc.h>
00134 #endif
00135 #include <sys/callb.h>
00136 #include <sys/kstat.h>
00137 #include <zfs_fletcher.h>
00138 #include <sys/sdt.h>
00139 
00140 #include <vm/vm_pageout.h>
00141 
00142 #ifdef illumos
00143 #ifndef _KERNEL
00144 
00145 boolean_t arc_watch = B_FALSE;
00146 int arc_procfd;
00147 #endif
00148 #endif /* illumos */
00149 
00150 static kmutex_t         arc_reclaim_thr_lock;
00151 static kcondvar_t       arc_reclaim_thr_cv;   
00152 static uint8_t          arc_thread_exit;
00153 
00154 extern int zfs_write_limit_shift;
00155 extern uint64_t zfs_write_limit_max;
00156 extern kmutex_t zfs_write_limit_lock;
00157 
00158 #define ARC_REDUCE_DNLC_PERCENT 3
00159 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
00160 
00161 typedef enum arc_reclaim_strategy {
00162         ARC_RECLAIM_AGGR,               
00163         ARC_RECLAIM_CONS                
00164 } arc_reclaim_strategy_t;
00165 
00167 static int              arc_grow_retry = 60;
00168 
00170 static int              arc_p_min_shift = 4;
00171 
00173 static int              arc_shrink_shift = 5;
00174 
00179 static int              arc_min_prefetch_lifespan;
00180 
00181 static int arc_dead;
00182 extern int zfs_prefetch_disable;
00183 
00187 static boolean_t arc_warm;
00188 
00189 /*
00190  * These tunables are for performance analysis.
00191  */
00196 uint64_t zfs_arc_max;
00197 uint64_t zfs_arc_min;
00198 uint64_t zfs_arc_meta_limit = 0;
00200 int zfs_arc_grow_retry = 0;
00201 int zfs_arc_shrink_shift = 0;
00202 int zfs_arc_p_min_shift = 0;
00203 int zfs_disable_dup_eviction = 0;
00204 
00205 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
00206 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
00207 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
00208 SYSCTL_DECL(_vfs_zfs);
00209 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
00210     "Maximum ARC size");
00211 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
00212     "Minimum ARC size");
00213 
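/*
 * arc_max and arc_min are boot-time tunables: TUNABLE_QUAD() picks them up
 * from the loader environment and CTLFLAG_RDTUN exposes them as read-only
 * sysctls once the kernel is running.  A hypothetical /boot/loader.conf
 * entry capping the ARC at 4 GB would look like:
 *
 *	vfs.zfs.arc_max="4294967296"
 */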
00250 #define ARCS_LOCK_PAD           CACHE_LINE_SIZE
00251 struct arcs_lock {
00252         kmutex_t        arcs_lock;
00253 #ifdef _KERNEL
00254         unsigned char   pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
00255 #endif
00256 };
00257 
00261 #define ARC_BUFC_NUMDATALISTS           16
00262 #define ARC_BUFC_NUMMETADATALISTS       16
00263 #define ARC_BUFC_NUMLISTS       (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
00264 
00265 typedef struct arc_state {
00266         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; 
00267         uint64_t arcs_size;     
00268         list_t  arcs_lists[ARC_BUFC_NUMLISTS]; 
00269         struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
00270 } arc_state_t;
00271 
00272 #define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock))
00273 
00274 /* The 6 states: */
00275 static arc_state_t ARC_anon;
00276 static arc_state_t ARC_mru;
00277 static arc_state_t ARC_mru_ghost;
00278 static arc_state_t ARC_mfu;
00279 static arc_state_t ARC_mfu_ghost;
00280 static arc_state_t ARC_l2c_only;
00281 
00282 typedef struct arc_stats {
00283         kstat_named_t arcstat_hits;
00284         kstat_named_t arcstat_misses;
00285         kstat_named_t arcstat_demand_data_hits;
00286         kstat_named_t arcstat_demand_data_misses;
00287         kstat_named_t arcstat_demand_metadata_hits;
00288         kstat_named_t arcstat_demand_metadata_misses;
00289         kstat_named_t arcstat_prefetch_data_hits;
00290         kstat_named_t arcstat_prefetch_data_misses;
00291         kstat_named_t arcstat_prefetch_metadata_hits;
00292         kstat_named_t arcstat_prefetch_metadata_misses;
00293         kstat_named_t arcstat_mru_hits;
00294         kstat_named_t arcstat_mru_ghost_hits;
00295         kstat_named_t arcstat_mfu_hits;
00296         kstat_named_t arcstat_mfu_ghost_hits;
00297         kstat_named_t arcstat_allocated;
00298         kstat_named_t arcstat_deleted;
00299         kstat_named_t arcstat_stolen;
00300         kstat_named_t arcstat_recycle_miss;
00301         kstat_named_t arcstat_mutex_miss;
00302         kstat_named_t arcstat_evict_skip;
00303         kstat_named_t arcstat_evict_l2_cached;
00304         kstat_named_t arcstat_evict_l2_eligible;
00305         kstat_named_t arcstat_evict_l2_ineligible;
00306         kstat_named_t arcstat_hash_elements;
00307         kstat_named_t arcstat_hash_elements_max;
00308         kstat_named_t arcstat_hash_collisions;
00309         kstat_named_t arcstat_hash_chains;
00310         kstat_named_t arcstat_hash_chain_max;
00311         kstat_named_t arcstat_p;
00312         kstat_named_t arcstat_c;
00313         kstat_named_t arcstat_c_min;
00314         kstat_named_t arcstat_c_max;
00315         kstat_named_t arcstat_size;
00316         kstat_named_t arcstat_hdr_size;
00317         kstat_named_t arcstat_data_size;
00318         kstat_named_t arcstat_other_size;
00319         kstat_named_t arcstat_l2_hits;
00320         kstat_named_t arcstat_l2_misses;
00321         kstat_named_t arcstat_l2_feeds;
00322         kstat_named_t arcstat_l2_rw_clash;
00323         kstat_named_t arcstat_l2_read_bytes;
00324         kstat_named_t arcstat_l2_write_bytes;
00325         kstat_named_t arcstat_l2_writes_sent;
00326         kstat_named_t arcstat_l2_writes_done;
00327         kstat_named_t arcstat_l2_writes_error;
00328         kstat_named_t arcstat_l2_writes_hdr_miss;
00329         kstat_named_t arcstat_l2_evict_lock_retry;
00330         kstat_named_t arcstat_l2_evict_reading;
00331         kstat_named_t arcstat_l2_free_on_write;
00332         kstat_named_t arcstat_l2_abort_lowmem;
00333         kstat_named_t arcstat_l2_cksum_bad;
00334         kstat_named_t arcstat_l2_io_error;
00335         kstat_named_t arcstat_l2_size;
00336         kstat_named_t arcstat_l2_hdr_size;
00337         kstat_named_t arcstat_l2_write_trylock_fail;
00338         kstat_named_t arcstat_l2_write_passed_headroom;
00339         kstat_named_t arcstat_l2_write_spa_mismatch;
00340         kstat_named_t arcstat_l2_write_in_l2;
00341         kstat_named_t arcstat_l2_write_hdr_io_in_progress;
00342         kstat_named_t arcstat_l2_write_not_cacheable;
00343         kstat_named_t arcstat_l2_write_full;
00344         kstat_named_t arcstat_l2_write_buffer_iter;
00345         kstat_named_t arcstat_l2_write_pios;
00346         kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
00347         kstat_named_t arcstat_l2_write_buffer_list_iter;
00348         kstat_named_t arcstat_l2_write_buffer_list_null_iter;
00349         kstat_named_t arcstat_memory_throttle_count;
00350         kstat_named_t arcstat_duplicate_buffers;
00351         kstat_named_t arcstat_duplicate_buffers_size;
00352         kstat_named_t arcstat_duplicate_reads;
00353 } arc_stats_t;
00354 
00355 static arc_stats_t arc_stats = {
00356         { "hits",                       KSTAT_DATA_UINT64 },
00357         { "misses",                     KSTAT_DATA_UINT64 },
00358         { "demand_data_hits",           KSTAT_DATA_UINT64 },
00359         { "demand_data_misses",         KSTAT_DATA_UINT64 },
00360         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
00361         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
00362         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
00363         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
00364         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
00365         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
00366         { "mru_hits",                   KSTAT_DATA_UINT64 },
00367         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
00368         { "mfu_hits",                   KSTAT_DATA_UINT64 },
00369         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
00370         { "allocated",                  KSTAT_DATA_UINT64 },
00371         { "deleted",                    KSTAT_DATA_UINT64 },
00372         { "stolen",                     KSTAT_DATA_UINT64 },
00373         { "recycle_miss",               KSTAT_DATA_UINT64 },
00374         { "mutex_miss",                 KSTAT_DATA_UINT64 },
00375         { "evict_skip",                 KSTAT_DATA_UINT64 },
00376         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
00377         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
00378         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
00379         { "hash_elements",              KSTAT_DATA_UINT64 },
00380         { "hash_elements_max",          KSTAT_DATA_UINT64 },
00381         { "hash_collisions",            KSTAT_DATA_UINT64 },
00382         { "hash_chains",                KSTAT_DATA_UINT64 },
00383         { "hash_chain_max",             KSTAT_DATA_UINT64 },
00384         { "p",                          KSTAT_DATA_UINT64 },
00385         { "c",                          KSTAT_DATA_UINT64 },
00386         { "c_min",                      KSTAT_DATA_UINT64 },
00387         { "c_max",                      KSTAT_DATA_UINT64 },
00388         { "size",                       KSTAT_DATA_UINT64 },
00389         { "hdr_size",                   KSTAT_DATA_UINT64 },
00390         { "data_size",                  KSTAT_DATA_UINT64 },
00391         { "other_size",                 KSTAT_DATA_UINT64 },
00392         { "l2_hits",                    KSTAT_DATA_UINT64 },
00393         { "l2_misses",                  KSTAT_DATA_UINT64 },
00394         { "l2_feeds",                   KSTAT_DATA_UINT64 },
00395         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
00396         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
00397         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
00398         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
00399         { "l2_writes_done",             KSTAT_DATA_UINT64 },
00400         { "l2_writes_error",            KSTAT_DATA_UINT64 },
00401         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
00402         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
00403         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
00404         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
00405         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
00406         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
00407         { "l2_io_error",                KSTAT_DATA_UINT64 },
00408         { "l2_size",                    KSTAT_DATA_UINT64 },
00409         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
00410         { "l2_write_trylock_fail",      KSTAT_DATA_UINT64 },
00411         { "l2_write_passed_headroom",   KSTAT_DATA_UINT64 },
00412         { "l2_write_spa_mismatch",      KSTAT_DATA_UINT64 },
00413         { "l2_write_in_l2",             KSTAT_DATA_UINT64 },
00414         { "l2_write_io_in_progress",    KSTAT_DATA_UINT64 },
00415         { "l2_write_not_cacheable",     KSTAT_DATA_UINT64 },
00416         { "l2_write_full",              KSTAT_DATA_UINT64 },
00417         { "l2_write_buffer_iter",       KSTAT_DATA_UINT64 },
00418         { "l2_write_pios",              KSTAT_DATA_UINT64 },
00419         { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
00420         { "l2_write_buffer_list_iter",  KSTAT_DATA_UINT64 },
00421         { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
00422         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
00423         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
00424         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
00425         { "duplicate_reads",            KSTAT_DATA_UINT64 }
00426 };
00427 
00428 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
00429 
00430 #define ARCSTAT_INCR(stat, val) \
00431         atomic_add_64(&arc_stats.stat.value.ui64, (val));
00432 
00433 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
00434 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
00435 
00436 #define ARCSTAT_MAX(stat, val) {                                        \
00437         uint64_t m;                                                     \
00438         while ((val) > (m = arc_stats.stat.value.ui64) &&               \
00439             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
00440                 continue;                                               \
00441 }
00442 
00443 #define ARCSTAT_MAXSTAT(stat) \
00444         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
00445 
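/*
 * ARCSTAT_MAX() records a running maximum without taking a lock: it rereads
 * the current maximum and retries the compare-and-swap until either the
 * stored value is already >= val or the CAS succeeds.  ARCSTAT_MAXSTAT(stat)
 * folds a statistic into its companion "_max" statistic; for example,
 *
 *	ARCSTAT_MAXSTAT(arcstat_hash_elements);
 *
 * expands to
 *
 *	ARCSTAT_MAX(arcstat_hash_elements_max,
 *	    arc_stats.arcstat_hash_elements.value.ui64);
 */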
00451 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
00452         if (cond1) {                                                    \
00453                 if (cond2) {                                            \
00454                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
00455                 } else {                                                \
00456                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
00457                 }                                                       \
00458         } else {                                                        \
00459                 if (cond2) {                                            \
00460                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
00461                 } else {                                                \
00462                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
00463                 }                                                       \
00464         }
00465 
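/*
 * ARCSTAT_CONDSTAT() bumps one of four counters based on two conditions.
 * For instance, the call in arc_buf_add_ref() below,
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * increments exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits, depending on whether the access was a
 * demand or prefetch read and whether the buffer holds data or metadata.
 */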
00466 kstat_t                 *arc_ksp;
00467 static arc_state_t      *arc_anon;
00468 static arc_state_t      *arc_mru;
00469 static arc_state_t      *arc_mru_ghost;
00470 static arc_state_t      *arc_mfu;
00471 static arc_state_t      *arc_mfu_ghost;
00472 static arc_state_t      *arc_l2c_only;
00473 
00474 /*
00475  * There are several ARC variables that are critical to export as kstats --
00476  * but we don't want to have to grovel around in the kstat whenever we wish to
00477  * manipulate them.  For these variables, we therefore define them to be in
00478  * terms of the statistic variable.  This assures that we are not introducing
00479  * the possibility of inconsistency by having shadow copies of the variables,
00480  * while still allowing the code to be readable.
00481  */
00482 #define arc_size        ARCSTAT(arcstat_size)   
00483 #define arc_p           ARCSTAT(arcstat_p)      
00484 #define arc_c           ARCSTAT(arcstat_c)      
00485 #define arc_c_min       ARCSTAT(arcstat_c_min)  
00486 #define arc_c_max       ARCSTAT(arcstat_c_max)  
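/*
 * For example, atomic_add_64(&arc_size, space) in arc_space_consume() below
 * updates arc_stats.arcstat_size.value.ui64 directly, so the value reported
 * through the kstat is always the value the ARC itself operates on.
 */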
00488 static int              arc_no_grow;    
00489 static uint64_t         arc_tempreserve;
00490 static uint64_t         arc_loaned_bytes;
00491 static uint64_t         arc_meta_used;
00492 static uint64_t         arc_meta_limit;
00493 static uint64_t         arc_meta_max = 0;
00494 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0,
00495     "ARC metadata used");
00496 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0,
00497     "ARC metadata limit");
00498 
00499 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
00500 
00501 typedef struct arc_callback arc_callback_t;
00502 
00503 struct arc_callback {
00504         void                    *acb_private;
00505         arc_done_func_t         *acb_done;
00506         arc_buf_t               *acb_buf;
00507         zio_t                   *acb_zio_dummy;
00508         arc_callback_t          *acb_next;
00509 };
00510 
00511 typedef struct arc_write_callback arc_write_callback_t;
00512 
00513 struct arc_write_callback {
00514         void            *awcb_private;
00515         arc_done_func_t *awcb_ready;
00516         arc_done_func_t *awcb_done;
00517         arc_buf_t       *awcb_buf;
00518 };
00519 
00520 struct arc_buf_hdr {
00521         /* protected by hash lock */
00522         dva_t                   b_dva;
00523         uint64_t                b_birth;
00524         uint64_t                b_cksum0;
00525 
00526         kmutex_t                b_freeze_lock;
00527         zio_cksum_t             *b_freeze_cksum;
00528         void                    *b_thawed;
00529 
00530         arc_buf_hdr_t           *b_hash_next;
00531         arc_buf_t               *b_buf;
00532         uint32_t                b_flags;
00533         uint32_t                b_datacnt;
00534 
00535         arc_callback_t          *b_acb;
00536         kcondvar_t              b_cv;
00537 
00538         /* immutable */
00539         arc_buf_contents_t      b_type;
00540         uint64_t                b_size;
00541         uint64_t                b_spa;
00542 
00543         /* protected by arc state mutex */
00544         arc_state_t             *b_state;
00545         list_node_t             b_arc_node;
00546 
00547         /* updated atomically */
00548         clock_t                 b_arc_access;
00549 
00550         /* self protecting */
00551         refcount_t              b_refcnt;
00552 
00553         l2arc_buf_hdr_t         *b_l2hdr;
00554         list_node_t             b_l2node;
00555 };
00556 
00557 static arc_buf_t *arc_eviction_list;
00558 static kmutex_t arc_eviction_mtx;
00559 static arc_buf_hdr_t arc_eviction_hdr;
00560 static void arc_get_data_buf(arc_buf_t *buf);
00561 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
00562 static int arc_evict_needed(arc_buf_contents_t type);
00563 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
00564 #ifdef illumos
00565 static void arc_buf_watch(arc_buf_t *buf);
00566 #endif /* illumos */
00567 
00568 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
00569 
00570 #define GHOST_STATE(state)      \
00571         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
00572         (state) == arc_l2c_only)
00573 
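/*
 * A ghost state (arc_mru_ghost, arc_mfu_ghost, arc_l2c_only) holds only
 * arc_buf_hdr_t's whose data is no longer resident in the ARC; b_datacnt is
 * 0 and b_buf is NULL, so the header accounts for b_size bytes of "ghost"
 * space rather than resident data.  GHOST_STATE() lets code such as
 * arc_change_state() treat those headers specially, e.g.:
 *
 *	if (GHOST_STATE(old_state) && ab->b_datacnt == 0)
 *		from_delta = ab->b_size;
 */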
00574 /*
00575  * Private ARC flags.  These flags are internal to the ARC and show up in
00576  * b_flags in the arc_buf_hdr_t.  Some flags are publicly declared and can be
00577  * passed in as arc_flags to functions such as arc_read().  The private flags
00578  * below, however, must never be passed in and are only set by ARC code itself.
00579  * When adding new public flags, make sure not to smash the private ones.
00580  */
00581 
00582 #define ARC_IN_HASH_TABLE       (1 << 9)        
00583 #define ARC_IO_IN_PROGRESS      (1 << 10)       
00584 #define ARC_IO_ERROR            (1 << 11)       
00585 #define ARC_FREED_IN_READ       (1 << 12)       
00586 #define ARC_BUF_AVAILABLE       (1 << 13)       
00587 #define ARC_INDIRECT            (1 << 14)       
00588 #define ARC_FREE_IN_PROGRESS    (1 << 15)       
00589 #define ARC_L2_WRITING          (1 << 16)       
00590 #define ARC_L2_EVICTED          (1 << 17)       
00591 #define ARC_L2_WRITE_HEAD       (1 << 18)       
00593 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
00594 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
00595 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
00596 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
00597 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
00598 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
00599 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
00600 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
00601 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
00602                                     (hdr)->b_l2hdr != NULL)
00603 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
00604 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
00605 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
00606 
00607 /*
00608  * Other sizes
00609  */
00610 
00611 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
00612 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
00613 
00614 /*
00615  * Hash table routines
00616  */
00617 
00618 #define HT_LOCK_PAD     CACHE_LINE_SIZE
00619 
00620 struct ht_lock {
00621         kmutex_t        ht_lock;
00622 #ifdef _KERNEL
00623         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
00624 #endif
00625 };
00626 
00627 #define BUF_LOCKS 256
00628 typedef struct buf_hash_table {
00629         uint64_t ht_mask;
00630         arc_buf_hdr_t **ht_table;
00631         struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
00632 } buf_hash_table_t;
00633 
00634 static buf_hash_table_t buf_hash_table;
00635 
00636 #define BUF_HASH_INDEX(spa, dva, birth) \
00637         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
00638 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
00639 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
00640 #define HDR_LOCK(hdr) \
00641         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
00642 
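/*
 * The buffer hash table is protected by BUF_LOCKS (256) cache-line-padded
 * mutexes rather than a single lock: BUF_HASH_LOCK() maps a bucket index to
 * its lock and HDR_LOCK() maps a header to the lock covering its bucket.
 * The usual lookup pattern, as in buf_hash_find() below, is:
 *
 *	idx = BUF_HASH_INDEX(spa, dva, birth);
 *	mutex_enter(BUF_HASH_LOCK(idx));
 *	... walk buf_hash_table.ht_table[idx] ...
 *	mutex_exit(BUF_HASH_LOCK(idx));
 */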
00643 uint64_t zfs_crc64_table[256];
00644 
00645 /*
00646  * Level 2 ARC
00647  */
00648 
00649 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       
00650 #define L2ARC_HEADROOM          2               
00651 #define L2ARC_FEED_SECS         1               
00652 #define L2ARC_FEED_MIN_MS       200             
00654 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
00655 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
00656 
00657 /* L2ARC Performance Tunables */
00662 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    
00663 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  
00664 uint64_t l2arc_headroom = L2ARC_HEADROOM;       
00665 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     
00666 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; 
00667 boolean_t l2arc_noprefetch = B_TRUE;            
00668 boolean_t l2arc_feed_again = B_TRUE;            
00669 boolean_t l2arc_norw = B_TRUE;                  
00672 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
00673     &l2arc_write_max, 0, "max write size");
00674 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
00675     &l2arc_write_boost, 0, "extra write during warmup");
00676 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
00677     &l2arc_headroom, 0, "number of dev writes");
00678 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
00679     &l2arc_feed_secs, 0, "interval seconds");
00680 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
00681     &l2arc_feed_min_ms, 0, "min interval milliseconds");
00682 
00683 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
00684     &l2arc_noprefetch, 0, "don't cache prefetch bufs");
00685 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
00686     &l2arc_feed_again, 0, "turbo warmup");
00687 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
00688     &l2arc_norw, 0, "no reads during writes");
00689 
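/*
 * All of the L2ARC knobs above are CTLFLAG_RW, so they can be adjusted on a
 * running system with sysctl(8).  l2arc_write_boost is the "extra write
 * during warmup" noted above: while the ARC is still warming up
 * (arc_warm == B_FALSE), each feed interval may write up to
 * l2arc_write_max + l2arc_write_boost bytes instead of just l2arc_write_max.
 */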
00690 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
00691     &ARC_anon.arcs_size, 0, "size of anonymous state");
00692 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
00693     &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in anonymous state");
00694 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
00695     &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in anonymous state");
00696 
00697 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
00698     &ARC_mru.arcs_size, 0, "size of mru state");
00699 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
00700     &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
00701 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
00702     &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
00703 
00704 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
00705     &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
00706 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
00707     &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
00708     "size of metadata in mru ghost state");
00709 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
00710     &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
00711     "size of data in mru ghost state");
00712 
00713 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
00714     &ARC_mfu.arcs_size, 0, "size of mfu state");
00715 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
00716     &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
00717 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
00718     &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
00719 
00720 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
00721     &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
00722 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
00723     &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
00724     "size of metadata in mfu ghost state");
00725 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
00726     &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
00727     "size of data in mfu ghost state");
00728 
00729 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
00730     &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");
00731 
00732 /*
00733  * L2ARC Internals
00734  */
00735 typedef struct l2arc_dev {
00736         vdev_t                  *l2ad_vdev;     
00737         spa_t                   *l2ad_spa;      
00738         uint64_t                l2ad_hand;      
00739         uint64_t                l2ad_write;     
00740         uint64_t                l2ad_boost;     
00741         uint64_t                l2ad_start;     
00742         uint64_t                l2ad_end;       
00743         uint64_t                l2ad_evict;     
00744         boolean_t               l2ad_first;     
00745         boolean_t               l2ad_writing;   
00746         list_t                  *l2ad_buflist;  
00747         list_node_t             l2ad_node;      
00748 } l2arc_dev_t;
00749 
00750 static list_t L2ARC_dev_list;                   
00751 static list_t *l2arc_dev_list;                  
00752 static kmutex_t l2arc_dev_mtx;                  
00753 static l2arc_dev_t *l2arc_dev_last;             
00754 static kmutex_t l2arc_buflist_mtx;              
00755 static list_t L2ARC_free_on_write;              
00756 static list_t *l2arc_free_on_write;             
00757 static kmutex_t l2arc_free_on_write_mtx;        
00758 static uint64_t l2arc_ndev;                     
00760 typedef struct l2arc_read_callback {
00761         arc_buf_t       *l2rcb_buf;             
00762         spa_t           *l2rcb_spa;             
00763         blkptr_t        l2rcb_bp;               
00764         zbookmark_t     l2rcb_zb;               
00765         int             l2rcb_flags;            
00766 } l2arc_read_callback_t;
00767 
00768 typedef struct l2arc_write_callback {
00769         l2arc_dev_t     *l2wcb_dev;             
00770         arc_buf_hdr_t   *l2wcb_head;            
00771 } l2arc_write_callback_t;
00772 
00773 struct l2arc_buf_hdr {
00774         /* protected by arc_buf_hdr mutex */
00775         l2arc_dev_t     *b_dev;                 
00776         uint64_t        b_daddr;                
00777 };
00778 
00779 typedef struct l2arc_data_free {
00780         /* protected by l2arc_free_on_write_mtx */
00781         void            *l2df_data;
00782         size_t          l2df_size;
00783         void            (*l2df_func)(void *, size_t);
00784         list_node_t     l2df_list_node;
00785 } l2arc_data_free_t;
00786 
00787 static kmutex_t l2arc_feed_thr_lock;
00788 static kcondvar_t l2arc_feed_thr_cv;
00789 static uint8_t l2arc_thread_exit;
00790 
00791 static void l2arc_read_done(zio_t *zio);
00792 static void l2arc_hdr_stat_add(void);
00793 static void l2arc_hdr_stat_remove(void);
00794 
00795 static uint64_t
00796 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
00797 {
00798         uint8_t *vdva = (uint8_t *)dva;
00799         uint64_t crc = -1ULL;
00800         int i;
00801 
00802         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
00803 
00804         for (i = 0; i < sizeof (dva_t); i++)
00805                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
00806 
00807         crc ^= (spa>>8) ^ birth;
00808 
00809         return (crc);
00810 }
00811 
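/*
 * The identity of an ARC buffer is the (spa guid, dva, birth txg) triple.
 * buf_hash() folds the DVA bytes through the ZFS CRC-64 table and then mixes
 * in the spa guid and birth txg; BUF_HASH_INDEX() reduces the result to a
 * bucket by masking with ht_mask (the table size is always a power of two),
 * e.g.:
 *
 *	idx = buf_hash(spa, dva, birth) & buf_hash_table.ht_mask;
 */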
00812 #define BUF_EMPTY(buf)                                          \
00813         ((buf)->b_dva.dva_word[0] == 0 &&                       \
00814         (buf)->b_dva.dva_word[1] == 0 &&                        \
00815         (buf)->b_birth == 0)
00816 
00817 #define BUF_EQUAL(spa, dva, birth, buf)                         \
00818         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
00819         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
00820         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
00821 
00822 static void
00823 buf_discard_identity(arc_buf_hdr_t *hdr)
00824 {
00825         hdr->b_dva.dva_word[0] = 0;
00826         hdr->b_dva.dva_word[1] = 0;
00827         hdr->b_birth = 0;
00828         hdr->b_cksum0 = 0;
00829 }
00830 
00831 static arc_buf_hdr_t *
00832 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
00833 {
00834         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
00835         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
00836         arc_buf_hdr_t *buf;
00837 
00838         mutex_enter(hash_lock);
00839         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
00840             buf = buf->b_hash_next) {
00841                 if (BUF_EQUAL(spa, dva, birth, buf)) {
00842                         *lockp = hash_lock;
00843                         return (buf);
00844                 }
00845         }
00846         mutex_exit(hash_lock);
00847         *lockp = NULL;
00848         return (NULL);
00849 }
00850 
00857 static arc_buf_hdr_t *
00858 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
00859 {
00860         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
00861         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
00862         arc_buf_hdr_t *fbuf;
00863         uint32_t i;
00864 
00865         ASSERT(!HDR_IN_HASH_TABLE(buf));
00866         *lockp = hash_lock;
00867         mutex_enter(hash_lock);
00868         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
00869             fbuf = fbuf->b_hash_next, i++) {
00870                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
00871                         return (fbuf);
00872         }
00873 
00874         buf->b_hash_next = buf_hash_table.ht_table[idx];
00875         buf_hash_table.ht_table[idx] = buf;
00876         buf->b_flags |= ARC_IN_HASH_TABLE;
00877 
00878         /* collect some hash table performance data */
00879         if (i > 0) {
00880                 ARCSTAT_BUMP(arcstat_hash_collisions);
00881                 if (i == 1)
00882                         ARCSTAT_BUMP(arcstat_hash_chains);
00883 
00884                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
00885         }
00886 
00887         ARCSTAT_BUMP(arcstat_hash_elements);
00888         ARCSTAT_MAXSTAT(arcstat_hash_elements);
00889 
00890         return (NULL);
00891 }
00892 
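/*
 * buf_hash_insert() either links the header into its bucket and returns NULL,
 * or returns the already-present header with the same (spa, dva, birth)
 * identity.  In both cases *lockp points at the bucket lock, which is left
 * held, so a typical caller looks roughly like:
 *
 *	exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL) {
 *		... another thread inserted the same identity first ...
 *	}
 *	mutex_exit(hash_lock);
 */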
00893 static void
00894 buf_hash_remove(arc_buf_hdr_t *buf)
00895 {
00896         arc_buf_hdr_t *fbuf, **bufp;
00897         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
00898 
00899         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
00900         ASSERT(HDR_IN_HASH_TABLE(buf));
00901 
00902         bufp = &buf_hash_table.ht_table[idx];
00903         while ((fbuf = *bufp) != buf) {
00904                 ASSERT(fbuf != NULL);
00905                 bufp = &fbuf->b_hash_next;
00906         }
00907         *bufp = buf->b_hash_next;
00908         buf->b_hash_next = NULL;
00909         buf->b_flags &= ~ARC_IN_HASH_TABLE;
00910 
00911         /* collect some hash table performance data */
00912         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
00913 
00914         if (buf_hash_table.ht_table[idx] &&
00915             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
00916                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
00917 }
00918 
00919 /*
00920  * Global data structures and functions for the buf kmem cache.
00921  */
00922 static kmem_cache_t *hdr_cache;
00923 static kmem_cache_t *buf_cache;
00924 
00925 static void
00926 buf_fini(void)
00927 {
00928         int i;
00929 
00930         kmem_free(buf_hash_table.ht_table,
00931             (buf_hash_table.ht_mask + 1) * sizeof (void *));
00932         for (i = 0; i < BUF_LOCKS; i++)
00933                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
00934         kmem_cache_destroy(hdr_cache);
00935         kmem_cache_destroy(buf_cache);
00936 }
00937 
00942 /* ARGSUSED */
00943 static int
00944 hdr_cons(void *vbuf, void *unused, int kmflag)
00945 {
00946         arc_buf_hdr_t *buf = vbuf;
00947 
00948         bzero(buf, sizeof (arc_buf_hdr_t));
00949         refcount_create(&buf->b_refcnt);
00950         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
00951         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
00952         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
00953 
00954         return (0);
00955 }
00956 
00957 /* ARGSUSED */
00958 static int
00959 buf_cons(void *vbuf, void *unused, int kmflag)
00960 {
00961         arc_buf_t *buf = vbuf;
00962 
00963         bzero(buf, sizeof (arc_buf_t));
00964         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
00965         rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
00966         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
00967 
00968         return (0);
00969 }
00970 
00975 /* ARGSUSED */
00976 static void
00977 hdr_dest(void *vbuf, void *unused)
00978 {
00979         arc_buf_hdr_t *buf = vbuf;
00980 
00981         ASSERT(BUF_EMPTY(buf));
00982         refcount_destroy(&buf->b_refcnt);
00983         cv_destroy(&buf->b_cv);
00984         mutex_destroy(&buf->b_freeze_lock);
00985         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
00986 }
00987 
00988 /* ARGSUSED */
00989 static void
00990 buf_dest(void *vbuf, void *unused)
00991 {
00992         arc_buf_t *buf = vbuf;
00993 
00994         mutex_destroy(&buf->b_evict_lock);
00995         rw_destroy(&buf->b_data_lock);
00996         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
00997 }
00998 
01002 /* ARGSUSED */
01003 static void
01004 hdr_recl(void *unused)
01005 {
01006         dprintf("hdr_recl called\n");
01007         /*
01008          * umem calls the reclaim func when we destroy the buf cache,
01009          * which is after we do arc_fini().
01010          */
01011         if (!arc_dead)
01012                 cv_signal(&arc_reclaim_thr_cv);
01013 }
01014 
01015 static void
01016 buf_init(void)
01017 {
01018         uint64_t *ct;
01019         uint64_t hsize = 1ULL << 12;
01020         int i, j;
01021 
01022         /*
01023          * The hash table is sized to hold every block in physical memory,
01024          * assuming an average block size of 64K.  The table will take up
01025          * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
01026          */
01027         while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
01028                 hsize <<= 1;
01029 retry:
01030         buf_hash_table.ht_mask = hsize - 1;
01031         buf_hash_table.ht_table =
01032             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
01033         if (buf_hash_table.ht_table == NULL) {
01034                 ASSERT(hsize > (1ULL << 8));
01035                 hsize >>= 1;
01036                 goto retry;
01037         }
01038 
01039         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
01040             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
01041         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
01042             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
01043 
01044         for (i = 0; i < 256; i++)
01045                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
01046                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
01047 
01048         for (i = 0; i < BUF_LOCKS; i++) {
01049                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
01050                     NULL, MUTEX_DEFAULT, NULL);
01051         }
01052 }
01053 
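/*
 * As a worked example of the sizing loop above: on a hypothetical machine
 * with 8 GB of physical memory and 4 KB pages, physmem * PAGESIZE is 2^33,
 * so the loop grows hsize from 2^12 until hsize * 65536 >= 2^33, i.e.
 * hsize = 2^17 = 131072 buckets.  With 8-byte pointers the table is then
 * 1 MB, matching the 128KB-per-GB estimate in the comment above.
 */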
01054 #define ARC_MINTIME     (hz>>4) /* 62 ms */
01055 
01056 static void
01057 arc_cksum_verify(arc_buf_t *buf)
01058 {
01059         zio_cksum_t zc;
01060 
01061         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
01062                 return;
01063 
01064         mutex_enter(&buf->b_hdr->b_freeze_lock);
01065         if (buf->b_hdr->b_freeze_cksum == NULL ||
01066             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
01067                 mutex_exit(&buf->b_hdr->b_freeze_lock);
01068                 return;
01069         }
01070         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
01071         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
01072                 panic("buffer modified while frozen!");
01073         mutex_exit(&buf->b_hdr->b_freeze_lock);
01074 }
01075 
01076 static int
01077 arc_cksum_equal(arc_buf_t *buf)
01078 {
01079         zio_cksum_t zc;
01080         int equal;
01081 
01082         mutex_enter(&buf->b_hdr->b_freeze_lock);
01083         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
01084         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
01085         mutex_exit(&buf->b_hdr->b_freeze_lock);
01086 
01087         return (equal);
01088 }
01089 
01090 static void
01091 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
01092 {
01093         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
01094                 return;
01095 
01096         mutex_enter(&buf->b_hdr->b_freeze_lock);
01097         if (buf->b_hdr->b_freeze_cksum != NULL) {
01098                 mutex_exit(&buf->b_hdr->b_freeze_lock);
01099                 return;
01100         }
01101         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
01102         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
01103             buf->b_hdr->b_freeze_cksum);
01104         mutex_exit(&buf->b_hdr->b_freeze_lock);
01105 #ifdef illumos
01106         arc_buf_watch(buf);
01107 #endif /* illumos */
01108 }
01109 
01110 #ifdef illumos
01111 #ifndef _KERNEL
01112 typedef struct procctl {
01113         long cmd;
01114         prwatch_t prwatch;
01115 } procctl_t;
01116 #endif
01117 
01118 /* ARGSUSED */
01119 static void
01120 arc_buf_unwatch(arc_buf_t *buf)
01121 {
01122 #ifndef _KERNEL
01123         if (arc_watch) {
01124                 int result;
01125                 procctl_t ctl;
01126                 ctl.cmd = PCWATCH;
01127                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
01128                 ctl.prwatch.pr_size = 0;
01129                 ctl.prwatch.pr_wflags = 0;
01130                 result = write(arc_procfd, &ctl, sizeof (ctl));
01131                 ASSERT3U(result, ==, sizeof (ctl));
01132         }
01133 #endif
01134 }
01135 
01136 /* ARGSUSED */
01137 static void
01138 arc_buf_watch(arc_buf_t *buf)
01139 {
01140 #ifndef _KERNEL
01141         if (arc_watch) {
01142                 int result;
01143                 procctl_t ctl;
01144                 ctl.cmd = PCWATCH;
01145                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
01146                 ctl.prwatch.pr_size = buf->b_hdr->b_size;
01147                 ctl.prwatch.pr_wflags = WA_WRITE;
01148                 result = write(arc_procfd, &ctl, sizeof (ctl));
01149                 ASSERT3U(result, ==, sizeof (ctl));
01150         }
01151 #endif
01152 }
01153 #endif /* illumos */
01154 
01155 void
01156 arc_buf_thaw(arc_buf_t *buf)
01157 {
01158         if (zfs_flags & ZFS_DEBUG_MODIFY) {
01159                 if (buf->b_hdr->b_state != arc_anon)
01160                         panic("modifying non-anon buffer!");
01161                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
01162                         panic("modifying buffer while i/o in progress!");
01163                 arc_cksum_verify(buf);
01164         }
01165 
01166         mutex_enter(&buf->b_hdr->b_freeze_lock);
01167         if (buf->b_hdr->b_freeze_cksum != NULL) {
01168                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
01169                 buf->b_hdr->b_freeze_cksum = NULL;
01170         }
01171 
01172         if (zfs_flags & ZFS_DEBUG_MODIFY) {
01173                 if (buf->b_hdr->b_thawed)
01174                         kmem_free(buf->b_hdr->b_thawed, 1);
01175                 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
01176         }
01177 
01178         mutex_exit(&buf->b_hdr->b_freeze_lock);
01179 
01180 #ifdef illumos
01181         arc_buf_unwatch(buf);
01182 #endif /* illumos */
01183 }
01184 
01185 void
01186 arc_buf_freeze(arc_buf_t *buf)
01187 {
01188         kmutex_t *hash_lock;
01189 
01190         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
01191                 return;
01192 
01193         hash_lock = HDR_LOCK(buf->b_hdr);
01194         mutex_enter(hash_lock);
01195 
01196         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
01197             buf->b_hdr->b_state == arc_anon);
01198         arc_cksum_compute(buf, B_FALSE);
01199         mutex_exit(hash_lock);
01200 
01201 }
01202 
01203 static void
01204 get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
01205 {
01206         uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
01207 
01208         if (ab->b_type == ARC_BUFC_METADATA)
01209                 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
01210         else {
01211                 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
01212                 buf_hashid += ARC_BUFC_NUMMETADATALISTS;
01213         }
01214 
01215         *list = &state->arcs_lists[buf_hashid];
01216         *lock = ARCS_LOCK(state, buf_hashid);
01217 }
01218 
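/*
 * get_buf_info() spreads buffers across a state's sublists by identity hash:
 * metadata buffers map to arcs_lists[0 .. ARC_BUFC_NUMMETADATALISTS - 1] and
 * data buffers to arcs_lists[ARC_BUFC_NUMMETADATALISTS .. ARC_BUFC_NUMLISTS - 1],
 * each paired with its ARCS_LOCK().  This keeps unrelated list operations
 * from serializing on a single lock.
 */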
01219 
01220 static void
01221 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
01222 {
01223         ASSERT(MUTEX_HELD(hash_lock));
01224 
01225         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
01226             (ab->b_state != arc_anon)) {
01227                 uint64_t delta = ab->b_size * ab->b_datacnt;
01228                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
01229                 list_t *list;
01230                 kmutex_t *lock;
01231 
01232                 get_buf_info(ab, ab->b_state, &list, &lock);
01233                 ASSERT(!MUTEX_HELD(lock));
01234                 mutex_enter(lock);
01235                 ASSERT(list_link_active(&ab->b_arc_node));
01236                 list_remove(list, ab);
01237                 if (GHOST_STATE(ab->b_state)) {
01238                         ASSERT0(ab->b_datacnt);
01239                         ASSERT3P(ab->b_buf, ==, NULL);
01240                         delta = ab->b_size;
01241                 }
01242                 ASSERT(delta > 0);
01243                 ASSERT3U(*size, >=, delta);
01244                 atomic_add_64(size, -delta);
01245                 mutex_exit(lock);
01246                 /* remove the prefetch flag if we get a reference */
01247                 if (ab->b_flags & ARC_PREFETCH)
01248                         ab->b_flags &= ~ARC_PREFETCH;
01249         }
01250 }
01251 
01252 static int
01253 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
01254 {
01255         int cnt;
01256         arc_state_t *state = ab->b_state;
01257 
01258         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
01259         ASSERT(!GHOST_STATE(state));
01260 
01261         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
01262             (state != arc_anon)) {
01263                 uint64_t *size = &state->arcs_lsize[ab->b_type];
01264                 list_t *list;
01265                 kmutex_t *lock;
01266 
01267                 get_buf_info(ab, state, &list, &lock);
01268                 ASSERT(!MUTEX_HELD(lock));
01269                 mutex_enter(lock);
01270                 ASSERT(!list_link_active(&ab->b_arc_node));
01271                 list_insert_head(list, ab);
01272                 ASSERT(ab->b_datacnt > 0);
01273                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
01274                 mutex_exit(lock);
01275         }
01276         return (cnt);
01277 }
01278 
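/*
 * Together, add_reference() and remove_reference() keep the eviction lists
 * consistent with the reference counts: taking the first reference on a
 * buffer in a non-anonymous state unlinks it from its sublist (so it cannot
 * be evicted) and subtracts its bytes from arcs_lsize[], while dropping the
 * last reference re-inserts it at the head of the sublist and adds the bytes
 * back.  arcs_lsize[] therefore tracks only evictable bytes.
 */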
01283 static void
01284 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
01285 {
01286         arc_state_t *old_state = ab->b_state;
01287         int64_t refcnt = refcount_count(&ab->b_refcnt);
01288         uint64_t from_delta, to_delta;
01289         list_t *list;
01290         kmutex_t *lock;
01291 
01292         ASSERT(MUTEX_HELD(hash_lock));
01293         ASSERT(new_state != old_state);
01294         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
01295         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
01296         ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
01297 
01298         from_delta = to_delta = ab->b_datacnt * ab->b_size;
01299 
01300         /*
01301          * If this buffer is evictable, transfer it from the
01302          * old state list to the new state list.
01303          */
01304         if (refcnt == 0) {
01305                 if (old_state != arc_anon) {
01306                         int use_mutex;
01307                         uint64_t *size = &old_state->arcs_lsize[ab->b_type];
01308 
01309                         get_buf_info(ab, old_state, &list, &lock);
01310                         use_mutex = !MUTEX_HELD(lock);
01311                         if (use_mutex)
01312                                 mutex_enter(lock);
01313 
01314                         ASSERT(list_link_active(&ab->b_arc_node));
01315                         list_remove(list, ab);
01316 
01317                         /*
01318                          * If prefetching out of the ghost cache,
01319                          * we will have a non-zero datacnt.
01320                          */
01321                         if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
01322                                 /* ghost elements have a ghost size */
01323                                 ASSERT(ab->b_buf == NULL);
01324                                 from_delta = ab->b_size;
01325                         }
01326                         ASSERT3U(*size, >=, from_delta);
01327                         atomic_add_64(size, -from_delta);
01328 
01329                         if (use_mutex)
01330                                 mutex_exit(lock);
01331                 }
01332                 if (new_state != arc_anon) {
01333                         int use_mutex;
01334                         uint64_t *size = &new_state->arcs_lsize[ab->b_type];
01335 
01336                         get_buf_info(ab, new_state, &list, &lock);
01337                         use_mutex = !MUTEX_HELD(lock);
01338                         if (use_mutex)
01339                                 mutex_enter(lock);
01340 
01341                         list_insert_head(list, ab);
01342 
01343                         /* ghost elements have a ghost size */
01344                         if (GHOST_STATE(new_state)) {
01345                                 ASSERT(ab->b_datacnt == 0);
01346                                 ASSERT(ab->b_buf == NULL);
01347                                 to_delta = ab->b_size;
01348                         }
01349                         atomic_add_64(size, to_delta);
01350 
01351                         if (use_mutex)
01352                                 mutex_exit(lock);
01353                 }
01354         }
01355 
01356         ASSERT(!BUF_EMPTY(ab));
01357         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
01358                 buf_hash_remove(ab);
01359 
01360         /* adjust state sizes */
01361         if (to_delta)
01362                 atomic_add_64(&new_state->arcs_size, to_delta);
01363         if (from_delta) {
01364                 ASSERT3U(old_state->arcs_size, >=, from_delta);
01365                 atomic_add_64(&old_state->arcs_size, -from_delta);
01366         }
01367         ab->b_state = new_state;
01368 
01369         /* adjust l2arc hdr stats */
01370         if (new_state == arc_l2c_only)
01371                 l2arc_hdr_stat_add();
01372         else if (old_state == arc_l2c_only)
01373                 l2arc_hdr_stat_remove();
01374 }
01375 
01376 void
01377 arc_space_consume(uint64_t space, arc_space_type_t type)
01378 {
01379         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
01380 
01381         switch (type) {
01382         case ARC_SPACE_DATA:
01383                 ARCSTAT_INCR(arcstat_data_size, space);
01384                 break;
01385         case ARC_SPACE_OTHER:
01386                 ARCSTAT_INCR(arcstat_other_size, space);
01387                 break;
01388         case ARC_SPACE_HDRS:
01389                 ARCSTAT_INCR(arcstat_hdr_size, space);
01390                 break;
01391         case ARC_SPACE_L2HDRS:
01392                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
01393                 break;
01394         }
01395 
01396         atomic_add_64(&arc_meta_used, space);
01397         atomic_add_64(&arc_size, space);
01398 }
01399 
01400 void
01401 arc_space_return(uint64_t space, arc_space_type_t type)
01402 {
01403         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
01404 
01405         switch (type) {
01406         case ARC_SPACE_DATA:
01407                 ARCSTAT_INCR(arcstat_data_size, -space);
01408                 break;
01409         case ARC_SPACE_OTHER:
01410                 ARCSTAT_INCR(arcstat_other_size, -space);
01411                 break;
01412         case ARC_SPACE_HDRS:
01413                 ARCSTAT_INCR(arcstat_hdr_size, -space);
01414                 break;
01415         case ARC_SPACE_L2HDRS:
01416                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
01417                 break;
01418         }
01419 
01420         ASSERT(arc_meta_used >= space);
01421         if (arc_meta_max < arc_meta_used)
01422                 arc_meta_max = arc_meta_used;
01423         atomic_add_64(&arc_meta_used, -space);
01424         ASSERT(arc_size >= space);
01425         atomic_add_64(&arc_size, -space);
01426 }
01427 
01428 void *
01429 arc_data_buf_alloc(uint64_t size)
01430 {
01431         if (arc_evict_needed(ARC_BUFC_DATA))
01432                 cv_signal(&arc_reclaim_thr_cv);
01433         atomic_add_64(&arc_size, size);
01434         return (zio_data_buf_alloc(size));
01435 }
01436 
01437 void
01438 arc_data_buf_free(void *buf, uint64_t size)
01439 {
01440         zio_data_buf_free(buf, size);
01441         ASSERT(arc_size >= size);
01442         atomic_add_64(&arc_size, -size);
01443 }
01444 
01445 arc_buf_t *
01446 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
01447 {
01448         arc_buf_hdr_t *hdr;
01449         arc_buf_t *buf;
01450 
01451         ASSERT3U(size, >, 0);
01452         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
01453         ASSERT(BUF_EMPTY(hdr));
01454         hdr->b_size = size;
01455         hdr->b_type = type;
01456         hdr->b_spa = spa_load_guid(spa);
01457         hdr->b_state = arc_anon;
01458         hdr->b_arc_access = 0;
01459         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
01460         buf->b_hdr = hdr;
01461         buf->b_data = NULL;
01462         buf->b_efunc = NULL;
01463         buf->b_private = NULL;
01464         buf->b_next = NULL;
01465         hdr->b_buf = buf;
01466         arc_get_data_buf(buf);
01467         hdr->b_datacnt = 1;
01468         hdr->b_flags = 0;
01469         ASSERT(refcount_is_zero(&hdr->b_refcnt));
01470         (void) refcount_add(&hdr->b_refcnt, tag);
01471 
01472         return (buf);
01473 }
01474 
01475 static char *arc_onloan_tag = "onloan";
01476 
01483 arc_buf_t *
01484 arc_loan_buf(spa_t *spa, int size)
01485 {
01486         arc_buf_t *buf;
01487 
01488         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
01489 
01490         atomic_add_64(&arc_loaned_bytes, size);
01491         return (buf);
01492 }
01493 
01497 void
01498 arc_return_buf(arc_buf_t *buf, void *tag)
01499 {
01500         arc_buf_hdr_t *hdr = buf->b_hdr;
01501 
01502         ASSERT(buf->b_data != NULL);
01503         (void) refcount_add(&hdr->b_refcnt, tag);
01504         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
01505 
01506         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
01507 }
01508 
01512 void
01513 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
01514 {
01515         arc_buf_hdr_t *hdr;
01516 
01517         ASSERT(buf->b_data != NULL);
01518         hdr = buf->b_hdr;
01519         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
01520         (void) refcount_remove(&hdr->b_refcnt, tag);
01521         buf->b_efunc = NULL;
01522         buf->b_private = NULL;
01523 
01524         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
01525 }
01526 
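/*
 * A loaned buffer is a normal anonymous ARC buffer whose reference is held
 * by the generic arc_onloan_tag instead of a caller tag, with its size
 * tracked in arc_loaned_bytes.  A hypothetical caller that borrows a buffer,
 * fills it, and hands it back would do roughly:
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, size);
 *	... fill buf->b_data ...
 *	arc_return_buf(buf, tag);
 *
 * with arc_loan_inuse_buf() covering the opposite direction, turning a
 * buffer the caller already owns into a loaned one.
 */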
01527 static arc_buf_t *
01528 arc_buf_clone(arc_buf_t *from)
01529 {
01530         arc_buf_t *buf;
01531         arc_buf_hdr_t *hdr = from->b_hdr;
01532         uint64_t size = hdr->b_size;
01533 
01534         ASSERT(hdr->b_state != arc_anon);
01535 
01536         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
01537         buf->b_hdr = hdr;
01538         buf->b_data = NULL;
01539         buf->b_efunc = NULL;
01540         buf->b_private = NULL;
01541         buf->b_next = hdr->b_buf;
01542         hdr->b_buf = buf;
01543         arc_get_data_buf(buf);
01544         bcopy(from->b_data, buf->b_data, size);
01545 
01546         /*
01547          * This buffer already exists in the arc so create a duplicate
01548          * copy for the caller.  If the buffer is associated with user data
01549          * then track the size and number of duplicates.  These stats will be
01550          * updated as duplicate buffers are created and destroyed.
01551          */
01552         if (hdr->b_type == ARC_BUFC_DATA) {
01553                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
01554                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
01555         }
01556         hdr->b_datacnt += 1;
01557         return (buf);
01558 }
01559 
01560 void
01561 arc_buf_add_ref(arc_buf_t *buf, void* tag)
01562 {
01563         arc_buf_hdr_t *hdr;
01564         kmutex_t *hash_lock;
01565 
01566         /*
01567          * Check to see if this buffer is evicted.  Callers
01568          * must verify b_data != NULL to know if the add_ref
01569          * was successful.
01570          */
01571         mutex_enter(&buf->b_evict_lock);
01572         if (buf->b_data == NULL) {
01573                 mutex_exit(&buf->b_evict_lock);
01574                 return;
01575         }
01576         hash_lock = HDR_LOCK(buf->b_hdr);
01577         mutex_enter(hash_lock);
01578         hdr = buf->b_hdr;
01579         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
01580         mutex_exit(&buf->b_evict_lock);
01581 
01582         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
01583         add_reference(hdr, hash_lock, tag);
01584         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
01585         arc_access(hdr, hash_lock);
01586         mutex_exit(hash_lock);
01587         ARCSTAT_BUMP(arcstat_hits);
01588         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
01589             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
01590             data, metadata, hits);
01591 }
01592 
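/*
 * Illustrative sketch (assumption, not part of arc.c): because
 * arc_buf_add_ref() returns silently when the buffer has already been
 * evicted, callers check b_data afterwards to learn whether a reference
 * was actually taken, roughly as follows.
 */
static boolean_t
example_add_ref_checked(arc_buf_t *buf, void *tag)
{
        arc_buf_add_ref(buf, tag);
        if (buf->b_data == NULL)
                return (B_FALSE);       /* evicted; no reference was taken */
        return (B_TRUE);
}
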
01597 static void
01598 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
01599 {
01600         arc_buf_hdr_t *hdr = buf->b_hdr;
01601 
01602         if (HDR_L2_WRITING(hdr)) {
01603                 l2arc_data_free_t *df;
01604                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
01605                 df->l2df_data = buf->b_data;
01606                 df->l2df_size = hdr->b_size;
01607                 df->l2df_func = free_func;
01608                 mutex_enter(&l2arc_free_on_write_mtx);
01609                 list_insert_head(l2arc_free_on_write, df);
01610                 mutex_exit(&l2arc_free_on_write_mtx);
01611                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
01612         } else {
01613                 free_func(buf->b_data, hdr->b_size);
01614         }
01615 }
01616 
01617 static void
01618 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
01619 {
01620         arc_buf_t **bufp;
01621 
01622         /* free up data associated with the buf */
01623         if (buf->b_data) {
01624                 arc_state_t *state = buf->b_hdr->b_state;
01625                 uint64_t size = buf->b_hdr->b_size;
01626                 arc_buf_contents_t type = buf->b_hdr->b_type;
01627 
01628                 arc_cksum_verify(buf);
01629 #ifdef illumos
01630                 arc_buf_unwatch(buf);
01631 #endif /* illumos */
01632 
01633                 if (!recycle) {
01634                         if (type == ARC_BUFC_METADATA) {
01635                                 arc_buf_data_free(buf, zio_buf_free);
01636                                 arc_space_return(size, ARC_SPACE_DATA);
01637                         } else {
01638                                 ASSERT(type == ARC_BUFC_DATA);
01639                                 arc_buf_data_free(buf, zio_data_buf_free);
01640                                 ARCSTAT_INCR(arcstat_data_size, -size);
01641                                 atomic_add_64(&arc_size, -size);
01642                         }
01643                 }
01644                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
01645                         uint64_t *cnt = &state->arcs_lsize[type];
01646 
01647                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
01648                         ASSERT(state != arc_anon);
01649 
01650                         ASSERT3U(*cnt, >=, size);
01651                         atomic_add_64(cnt, -size);
01652                 }
01653                 ASSERT3U(state->arcs_size, >=, size);
01654                 atomic_add_64(&state->arcs_size, -size);
01655                 buf->b_data = NULL;
01656 
01657                 /*
01658                  * If we're destroying a duplicate buffer make sure
01659                  * that the appropriate statistics are updated.
01660                  */
01661                 if (buf->b_hdr->b_datacnt > 1 &&
01662                     buf->b_hdr->b_type == ARC_BUFC_DATA) {
01663                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
01664                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
01665                 }
01666                 ASSERT(buf->b_hdr->b_datacnt > 0);
01667                 buf->b_hdr->b_datacnt -= 1;
01668         }
01669 
01670         /* only remove the buf if requested */
01671         if (!all)
01672                 return;
01673 
01674         /* remove the buf from the hdr list */
01675         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
01676                 continue;
01677         *bufp = buf->b_next;
01678         buf->b_next = NULL;
01679 
01680         ASSERT(buf->b_efunc == NULL);
01681 
01682         /* clean up the buf */
01683         buf->b_hdr = NULL;
01684         kmem_cache_free(buf_cache, buf);
01685 }
01686 
01687 static void
01688 arc_hdr_destroy(arc_buf_hdr_t *hdr)
01689 {
01690         ASSERT(refcount_is_zero(&hdr->b_refcnt));
01691         ASSERT3P(hdr->b_state, ==, arc_anon);
01692         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
01693         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
01694 
01695         if (l2hdr != NULL) {
01696                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
01697                 /*
01698                  * To prevent arc_free() and l2arc_evict() from
01699                  * attempting to free the same buffer at the same time,
01700                  * a FREE_IN_PROGRESS flag is given to arc_free() to
01701                  * give it priority.  l2arc_evict() can't destroy this
01702                  * header while we are waiting on l2arc_buflist_mtx.
01703                  *
01704                  * The hdr may be removed from l2ad_buflist before we
01705                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
01706                  */
01707                 if (!buflist_held) {
01708                         mutex_enter(&l2arc_buflist_mtx);
01709                         l2hdr = hdr->b_l2hdr;
01710                 }
01711 
01712                 if (l2hdr != NULL) {
01713                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
01714                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
01715                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
01716                         if (hdr->b_state == arc_l2c_only)
01717                                 l2arc_hdr_stat_remove();
01718                         hdr->b_l2hdr = NULL;
01719                 }
01720 
01721                 if (!buflist_held)
01722                         mutex_exit(&l2arc_buflist_mtx);
01723         }
01724 
01725         if (!BUF_EMPTY(hdr)) {
01726                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
01727                 buf_discard_identity(hdr);
01728         }
01729         while (hdr->b_buf) {
01730                 arc_buf_t *buf = hdr->b_buf;
01731 
01732                 if (buf->b_efunc) {
01733                         mutex_enter(&arc_eviction_mtx);
01734                         mutex_enter(&buf->b_evict_lock);
01735                         ASSERT(buf->b_hdr != NULL);
01736                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
01737                         hdr->b_buf = buf->b_next;
01738                         buf->b_hdr = &arc_eviction_hdr;
01739                         buf->b_next = arc_eviction_list;
01740                         arc_eviction_list = buf;
01741                         mutex_exit(&buf->b_evict_lock);
01742                         mutex_exit(&arc_eviction_mtx);
01743                 } else {
01744                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
01745                 }
01746         }
01747         if (hdr->b_freeze_cksum != NULL) {
01748                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
01749                 hdr->b_freeze_cksum = NULL;
01750         }
01751         if (hdr->b_thawed) {
01752                 kmem_free(hdr->b_thawed, 1);
01753                 hdr->b_thawed = NULL;
01754         }
01755 
01756         ASSERT(!list_link_active(&hdr->b_arc_node));
01757         ASSERT3P(hdr->b_hash_next, ==, NULL);
01758         ASSERT3P(hdr->b_acb, ==, NULL);
01759         kmem_cache_free(hdr_cache, hdr);
01760 }
01761 
01762 void
01763 arc_buf_free(arc_buf_t *buf, void *tag)
01764 {
01765         arc_buf_hdr_t *hdr = buf->b_hdr;
01766         int hashed = hdr->b_state != arc_anon;
01767 
01768         ASSERT(buf->b_efunc == NULL);
01769         ASSERT(buf->b_data != NULL);
01770 
01771         if (hashed) {
01772                 kmutex_t *hash_lock = HDR_LOCK(hdr);
01773 
01774                 mutex_enter(hash_lock);
01775                 hdr = buf->b_hdr;
01776                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
01777 
01778                 (void) remove_reference(hdr, hash_lock, tag);
01779                 if (hdr->b_datacnt > 1) {
01780                         arc_buf_destroy(buf, FALSE, TRUE);
01781                 } else {
01782                         ASSERT(buf == hdr->b_buf);
01783                         ASSERT(buf->b_efunc == NULL);
01784                         hdr->b_flags |= ARC_BUF_AVAILABLE;
01785                 }
01786                 mutex_exit(hash_lock);
01787         } else if (HDR_IO_IN_PROGRESS(hdr)) {
01788                 int destroy_hdr;
01789                 /*
01790                  * We are in the middle of an async write.  Don't destroy
01791                  * this buffer unless the write completes before we finish
01792                  * decrementing the reference count.
01793                  */
01794                 mutex_enter(&arc_eviction_mtx);
01795                 (void) remove_reference(hdr, NULL, tag);
01796                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
01797                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
01798                 mutex_exit(&arc_eviction_mtx);
01799                 if (destroy_hdr)
01800                         arc_hdr_destroy(hdr);
01801         } else {
01802                 if (remove_reference(hdr, NULL, tag) > 0)
01803                         arc_buf_destroy(buf, FALSE, TRUE);
01804                 else
01805                         arc_hdr_destroy(hdr);
01806         }
01807 }
01808 
01809 int
01810 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
01811 {
01812         arc_buf_hdr_t *hdr = buf->b_hdr;
01813         kmutex_t *hash_lock = HDR_LOCK(hdr);
01814         int no_callback = (buf->b_efunc == NULL);
01815 
01816         if (hdr->b_state == arc_anon) {
01817                 ASSERT(hdr->b_datacnt == 1);
01818                 arc_buf_free(buf, tag);
01819                 return (no_callback);
01820         }
01821 
01822         mutex_enter(hash_lock);
01823         hdr = buf->b_hdr;
01824         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
01825         ASSERT(hdr->b_state != arc_anon);
01826         ASSERT(buf->b_data != NULL);
01827 
01828         (void) remove_reference(hdr, hash_lock, tag);
01829         if (hdr->b_datacnt > 1) {
01830                 if (no_callback)
01831                         arc_buf_destroy(buf, FALSE, TRUE);
01832         } else if (no_callback) {
01833                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
01834                 ASSERT(buf->b_efunc == NULL);
01835                 hdr->b_flags |= ARC_BUF_AVAILABLE;
01836         }
01837         ASSERT(no_callback || hdr->b_datacnt > 1 ||
01838             refcount_is_zero(&hdr->b_refcnt));
01839         mutex_exit(hash_lock);
01840         return (no_callback);
01841 }
01842 
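/*
 * Illustrative sketch (assumption, not part of arc.c): a typical read
 * completion callback consumes the data and then drops its hold with
 * arc_buf_remove_ref(), exactly as arc_bcopy_func() and arc_getbuf_func()
 * do further below.
 */
static void
example_read_done(zio_t *zio, arc_buf_t *buf, void *private)
{
        if (zio == NULL || zio->io_error == 0) {
                /* ... consume buf->b_data here ... */
        }
        (void) arc_buf_remove_ref(buf, private);
}
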
01843 int
01844 arc_buf_size(arc_buf_t *buf)
01845 {
01846         return (buf->b_hdr->b_size);
01847 }
01848 
01855 boolean_t
01856 arc_buf_eviction_needed(arc_buf_t *buf)
01857 {
01858         arc_buf_hdr_t *hdr;
01859         boolean_t evict_needed = B_FALSE;
01860 
01861         if (zfs_disable_dup_eviction)
01862                 return (B_FALSE);
01863 
01864         mutex_enter(&buf->b_evict_lock);
01865         hdr = buf->b_hdr;
01866         if (hdr == NULL) {
01867                 /*
01868                  * We are in arc_do_user_evicts(); let that function
01869                  * perform the eviction.
01870                  */
01871                 ASSERT(buf->b_data == NULL);
01872                 mutex_exit(&buf->b_evict_lock);
01873                 return (B_FALSE);
01874         } else if (buf->b_data == NULL) {
01875                 /*
01876                  * We have already been added to the arc eviction list;
01877                  * recommend eviction.
01878                  */
01879                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
01880                 mutex_exit(&buf->b_evict_lock);
01881                 return (B_TRUE);
01882         }
01883 
01884         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
01885                 evict_needed = B_TRUE;
01886 
01887         mutex_exit(&buf->b_evict_lock);
01888         return (evict_needed);
01889 }
01890 
01904 static void *
01905 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
01906     arc_buf_contents_t type)
01907 {
01908         arc_state_t *evicted_state;
01909         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
01910         int64_t bytes_remaining;
01911         arc_buf_hdr_t *ab, *ab_prev = NULL;
01912         list_t *evicted_list, *list, *evicted_list_start, *list_start;
01913         kmutex_t *lock, *evicted_lock;
01914         kmutex_t *hash_lock;
01915         boolean_t have_lock;
01916         void *stolen = NULL;
01917         static int evict_metadata_offset, evict_data_offset;
01918         int i, idx, offset, list_count, count;
01919 
01920         ASSERT(state == arc_mru || state == arc_mfu);
01921 
01922         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
01923 
01924         if (type == ARC_BUFC_METADATA) {
01925                 offset = 0;
01926                 list_count = ARC_BUFC_NUMMETADATALISTS;
01927                 list_start = &state->arcs_lists[0];
01928                 evicted_list_start = &evicted_state->arcs_lists[0];
01929                 idx = evict_metadata_offset;
01930         } else {
01931                 offset = ARC_BUFC_NUMMETADATALISTS;
01932                 list_start = &state->arcs_lists[offset];
01933                 evicted_list_start = &evicted_state->arcs_lists[offset];
01934                 list_count = ARC_BUFC_NUMDATALISTS;
01935                 idx = evict_data_offset;
01936         }
01937         bytes_remaining = evicted_state->arcs_lsize[type];
01938         count = 0;
01939 
01940 evict_start:
01941         list = &list_start[idx];
01942         evicted_list = &evicted_list_start[idx];
01943         lock = ARCS_LOCK(state, (offset + idx));
01944         evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
01945 
01946         mutex_enter(lock);
01947         mutex_enter(evicted_lock);
01948 
01949         for (ab = list_tail(list); ab; ab = ab_prev) {
01950                 ab_prev = list_prev(list, ab);
01951                 bytes_remaining -= (ab->b_size * ab->b_datacnt);
01952                 /* prefetch buffers have a minimum lifespan */
01953                 if (HDR_IO_IN_PROGRESS(ab) ||
01954                     (spa && ab->b_spa != spa) ||
01955                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
01956                     ddi_get_lbolt() - ab->b_arc_access <
01957                     arc_min_prefetch_lifespan)) {
01958                         skipped++;
01959                         continue;
01960                 }
01961                 /* "lookahead" for better eviction candidate */
01962                 if (recycle && ab->b_size != bytes &&
01963                     ab_prev && ab_prev->b_size == bytes)
01964                         continue;
01965                 hash_lock = HDR_LOCK(ab);
01966                 have_lock = MUTEX_HELD(hash_lock);
01967                 if (have_lock || mutex_tryenter(hash_lock)) {
01968                         ASSERT0(refcount_count(&ab->b_refcnt));
01969                         ASSERT(ab->b_datacnt > 0);
01970                         while (ab->b_buf) {
01971                                 arc_buf_t *buf = ab->b_buf;
01972                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
01973                                         missed += 1;
01974                                         break;
01975                                 }
01976                                 if (buf->b_data) {
01977                                         bytes_evicted += ab->b_size;
01978                                         if (recycle && ab->b_type == type &&
01979                                             ab->b_size == bytes &&
01980                                             !HDR_L2_WRITING(ab)) {
01981                                                 stolen = buf->b_data;
01982                                                 recycle = FALSE;
01983                                         }
01984                                 }
01985                                 if (buf->b_efunc) {
01986                                         mutex_enter(&arc_eviction_mtx);
01987                                         arc_buf_destroy(buf,
01988                                             buf->b_data == stolen, FALSE);
01989                                         ab->b_buf = buf->b_next;
01990                                         buf->b_hdr = &arc_eviction_hdr;
01991                                         buf->b_next = arc_eviction_list;
01992                                         arc_eviction_list = buf;
01993                                         mutex_exit(&arc_eviction_mtx);
01994                                         mutex_exit(&buf->b_evict_lock);
01995                                 } else {
01996                                         mutex_exit(&buf->b_evict_lock);
01997                                         arc_buf_destroy(buf,
01998                                             buf->b_data == stolen, TRUE);
01999                                 }
02000                         }
02001 
02002                         if (ab->b_l2hdr) {
02003                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
02004                                     ab->b_size);
02005                         } else {
02006                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
02007                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
02008                                             ab->b_size);
02009                                 } else {
02010                                         ARCSTAT_INCR(
02011                                             arcstat_evict_l2_ineligible,
02012                                             ab->b_size);
02013                                 }
02014                         }
02015 
02016                         if (ab->b_datacnt == 0) {
02017                                 arc_change_state(evicted_state, ab, hash_lock);
02018                                 ASSERT(HDR_IN_HASH_TABLE(ab));
02019                                 ab->b_flags |= ARC_IN_HASH_TABLE;
02020                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
02021                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
02022                         }
02023                         if (!have_lock)
02024                                 mutex_exit(hash_lock);
02025                         if (bytes >= 0 && bytes_evicted >= bytes)
02026                                 break;
02027                         if (bytes_remaining > 0) {
02028                                 mutex_exit(evicted_lock);
02029                                 mutex_exit(lock);
02030                                 idx  = ((idx + 1) & (list_count - 1));
02031                                 count++;
02032                                 goto evict_start;
02033                         }
02034                 } else {
02035                         missed += 1;
02036                 }
02037         }
02038 
02039         mutex_exit(evicted_lock);
02040         mutex_exit(lock);
02041 
02042         idx  = ((idx + 1) & (list_count - 1));
02043         count++;
02044 
02045         if (bytes_evicted < bytes) {
02046                 if (count < list_count)
02047                         goto evict_start;
02048                 else
02049                         dprintf("only evicted %lld bytes from %p",
02050                             (longlong_t)bytes_evicted, state);
02051         }
02052         if (type == ARC_BUFC_METADATA)
02053                 evict_metadata_offset = idx;
02054         else
02055                 evict_data_offset = idx;
02056 
02057         /*
02058          * Number of buffers skipped because they have I/O in progress or
02059          * are indirect prefetch buffers that have not lived long enough.
02060          */
02061         if (skipped)
02062                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
02063 
02064         /*
02065          * Number of buffers that could not be evicted because something
02066          * else is using them.
02067          */
02068         if (missed)
02069                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
02070 
02071         /*
02072          * We have just evicted some data into the ghost state; make
02073          * sure we also adjust the ghost state size if necessary.
02074          */
02075         if (arc_no_grow &&
02076             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
02077                 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
02078                     arc_mru_ghost->arcs_size - arc_c;
02079 
02080                 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
02081                         int64_t todelete =
02082                             MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
02083                         arc_evict_ghost(arc_mru_ghost, 0, todelete);
02084                 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
02085                         int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
02086                             arc_mru_ghost->arcs_size +
02087                             arc_mfu_ghost->arcs_size - arc_c);
02088                         arc_evict_ghost(arc_mfu_ghost, 0, todelete);
02089                 }
02090         }
02091         if (stolen)
02092                 ARCSTAT_BUMP(arcstat_stolen);
02093 
02094         return (stolen);
02095 }
02096 
02101 static void
02102 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
02103 {
02104         arc_buf_hdr_t *ab, *ab_prev;
02105         arc_buf_hdr_t marker = { 0 };
02106         list_t *list, *list_start;
02107         kmutex_t *hash_lock, *lock;
02108         uint64_t bytes_deleted = 0;
02109         uint64_t bufs_skipped = 0;
02110         static int evict_offset;
02111         int list_count, idx = evict_offset;
02112         int offset, count = 0;
02113 
02114         ASSERT(GHOST_STATE(state));
02115 
02116         /*
02117          * Data lists come after metadata lists.
02118          */
02119         list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
02120         list_count = ARC_BUFC_NUMDATALISTS;
02121         offset = ARC_BUFC_NUMMETADATALISTS;
02122 
02123 evict_start:
02124         list = &list_start[idx];
02125         lock = ARCS_LOCK(state, idx + offset);
02126 
02127         mutex_enter(lock);
02128         for (ab = list_tail(list); ab; ab = ab_prev) {
02129                 ab_prev = list_prev(list, ab);
02130                 if (spa && ab->b_spa != spa)
02131                         continue;
02132 
02133                 /* ignore markers */
02134                 if (ab->b_spa == 0)
02135                         continue;
02136 
02137                 hash_lock = HDR_LOCK(ab);
02138                 /* caller may be trying to modify this buffer, skip it */
02139                 if (MUTEX_HELD(hash_lock))
02140                         continue;
02141                 if (mutex_tryenter(hash_lock)) {
02142                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
02143                         ASSERT(ab->b_buf == NULL);
02144                         ARCSTAT_BUMP(arcstat_deleted);
02145                         bytes_deleted += ab->b_size;
02146 
02147                         if (ab->b_l2hdr != NULL) {
02148                                 /*
02149                                  * This buffer is cached on the 2nd Level ARC;
02150                                  * don't destroy the header.
02151                                  */
02152                                 arc_change_state(arc_l2c_only, ab, hash_lock);
02153                                 mutex_exit(hash_lock);
02154                         } else {
02155                                 arc_change_state(arc_anon, ab, hash_lock);
02156                                 mutex_exit(hash_lock);
02157                                 arc_hdr_destroy(ab);
02158                         }
02159 
02160                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
02161                         if (bytes >= 0 && bytes_deleted >= bytes)
02162                                 break;
02163                 } else if (bytes < 0) {
02164                         /*
02165                          * Insert a list marker and then wait for the
02166                          * hash lock to become available. Once it's
02167                          * available, restart from where we left off.
02168                          */
02169                         list_insert_after(list, ab, &marker);
02170                         mutex_exit(lock);
02171                         mutex_enter(hash_lock);
02172                         mutex_exit(hash_lock);
02173                         mutex_enter(lock);
02174                         ab_prev = list_prev(list, &marker);
02175                         list_remove(list, &marker);
02176                 } else
02177                         bufs_skipped += 1;
02178         }
02179         mutex_exit(lock);
02180         idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
02181         count++;
02182 
02183         if (count < list_count)
02184                 goto evict_start;
02185 
02186         evict_offset = idx;
02187         if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
02188             (bytes < 0 || bytes_deleted < bytes)) {
02189                 list_start = &state->arcs_lists[0];
02190                 list_count = ARC_BUFC_NUMMETADATALISTS;
02191                 offset = count = 0;
02192                 goto evict_start;
02193         }
02194 
02195         /* Number of buffers we could not obtain the hash lock for */
02196         if (bufs_skipped) {
02197                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
02198                 ASSERT(bytes >= 0);
02199         }
02200 
02201         if (bytes_deleted < bytes)
02202                 dprintf("only deleted %lld bytes from %p",
02203                     (longlong_t)bytes_deleted, state);
02204 }
02205 
02206 static void
02207 arc_adjust(void)
02208 {
02209         int64_t adjustment, delta;
02210 
02211         /*
02212          * Adjust MRU size
02213          */
02214 
02215         adjustment = MIN((int64_t)(arc_size - arc_c),
02216             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
02217             arc_p));
02218 
02219         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
02220                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
02221                 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
02222                 adjustment -= delta;
02223         }
02224 
02225         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
02226                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
02227                 (void) arc_evict(arc_mru, 0, delta, FALSE,
02228                     ARC_BUFC_METADATA);
02229         }
02230 
02231         /*
02232          * Adjust MFU size
02233          */
02234 
02235         adjustment = arc_size - arc_c;
02236 
02237         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
02238                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
02239                 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
02240                 adjustment -= delta;
02241         }
02242 
02243         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
02244                 int64_t delta = MIN(adjustment,
02245                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
02246                 (void) arc_evict(arc_mfu, 0, delta, FALSE,
02247                     ARC_BUFC_METADATA);
02248         }
02249 
02250         /*
02251          * Adjust ghost lists
02252          */
02253 
02254         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
02255 
02256         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
02257                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
02258                 arc_evict_ghost(arc_mru_ghost, 0, delta);
02259         }
02260 
02261         adjustment =
02262             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
02263 
02264         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
02265                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
02266                 arc_evict_ghost(arc_mfu_ghost, 0, delta);
02267         }
02268 }
02269 
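/*
 * Worked example for the MRU adjustment above (all values assumed):
 * with arc_size = 10 GiB, arc_c = 8 GiB, arc_p = 4 GiB,
 * arc_anon + arc_mru = 5 GiB and arc_meta_used = 1 GiB,
 *
 *      adjustment = MIN(10 - 8, (5 + 1) - 4) GiB = 2 GiB,
 *
 * so up to 2 GiB is evicted from the MRU lists (data first, then
 * metadata) before the MFU and ghost lists are considered.
 */
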
02270 static void
02271 arc_do_user_evicts(void)
02272 {
02273         static arc_buf_t *tmp_arc_eviction_list;
02274 
02275         /*
02276          * Move the list aside to avoid a lock order reversal (LOR).
02277          */
02278 restart:
02279         mutex_enter(&arc_eviction_mtx);
02280         tmp_arc_eviction_list = arc_eviction_list;
02281         arc_eviction_list = NULL;
02282         mutex_exit(&arc_eviction_mtx);
02283 
02284         while (tmp_arc_eviction_list != NULL) {
02285                 arc_buf_t *buf = tmp_arc_eviction_list;
02286                 tmp_arc_eviction_list = buf->b_next;
02287                 mutex_enter(&buf->b_evict_lock);
02288                 buf->b_hdr = NULL;
02289                 mutex_exit(&buf->b_evict_lock);
02290 
02291                 if (buf->b_efunc != NULL)
02292                         VERIFY(buf->b_efunc(buf) == 0);
02293 
02294                 buf->b_efunc = NULL;
02295                 buf->b_private = NULL;
02296                 kmem_cache_free(buf_cache, buf);
02297         }
02298 
02299         if (arc_eviction_list != NULL)
02300                 goto restart;
02301 }
02302 
02308 void
02309 arc_flush(spa_t *spa)
02310 {
02311         uint64_t guid = 0;
02312 
02313         if (spa)
02314                 guid = spa_load_guid(spa);
02315 
02316         while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
02317                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
02318                 if (spa)
02319                         break;
02320         }
02321         while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
02322                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
02323                 if (spa)
02324                         break;
02325         }
02326         while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
02327                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
02328                 if (spa)
02329                         break;
02330         }
02331         while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
02332                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
02333                 if (spa)
02334                         break;
02335         }
02336 
02337         arc_evict_ghost(arc_mru_ghost, guid, -1);
02338         arc_evict_ghost(arc_mfu_ghost, guid, -1);
02339 
02340         mutex_enter(&arc_reclaim_thr_lock);
02341         arc_do_user_evicts();
02342         mutex_exit(&arc_reclaim_thr_lock);
02343         ASSERT(spa || arc_eviction_list == NULL);
02344 }
02345 
02346 void
02347 arc_shrink(void)
02348 {
02349         if (arc_c > arc_c_min) {
02350                 uint64_t to_free;
02351 
02352 #ifdef _KERNEL
02353                 to_free = arc_c >> arc_shrink_shift;
02354 #else
02355                 to_free = arc_c >> arc_shrink_shift;
02356 #endif
02357                 if (arc_c > arc_c_min + to_free)
02358                         atomic_add_64(&arc_c, -to_free);
02359                 else
02360                         arc_c = arc_c_min;
02361 
02362                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
02363                 if (arc_c > arc_size)
02364                         arc_c = MAX(arc_size, arc_c_min);
02365                 if (arc_p > arc_c)
02366                         arc_p = (arc_c >> 1);
02367                 ASSERT(arc_c >= arc_c_min);
02368                 ASSERT((int64_t)arc_p >= 0);
02369         }
02370 
02371         if (arc_size > arc_c)
02372                 arc_adjust();
02373 }
02374 
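/*
 * Worked example (assumed values): with arc_c = 4 GiB, arc_c_min = 512 MiB
 * and arc_shrink_shift = 5, one call to arc_shrink() computes
 * to_free = arc_c >> 5 = 128 MiB, lowering the target size arc_c to
 * 3.875 GiB and shrinking arc_p by arc_p >> 5 as well; arc_adjust() then
 * evicts buffers if arc_size still exceeds the new arc_c.
 */
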
02375 static int needfree = 0;
02376 
02377 static int
02378 arc_reclaim_needed(void)
02379 {
02380 
02381 #ifdef _KERNEL
02382 
02383         if (needfree)
02384                 return (1);
02385 
02386         /*
02387          * Cooperate with pagedaemon when it's time for it to scan
02388          * and reclaim some pages.
02389          */
02390         if (vm_paging_needed())
02391                 return (1);
02392 
02393 #ifdef sun
02394         /*
02395          * take 'desfree' extra pages, so we reclaim sooner, rather than later
02396          */
02397         extra = desfree;
02398 
02399         /*
02400          * check that we're out of range of the pageout scanner.  It starts to
02401          * schedule paging if freemem is less than lotsfree and needfree.
02402          * lotsfree is the high-water mark for pageout, and needfree is the
02403          * number of needed free pages.  We add extra pages here to make sure
02404          * the scanner doesn't start up while we're freeing memory.
02405          */
02406         if (freemem < lotsfree + needfree + extra)
02407                 return (1);
02408 
02409         /*
02410          * check to make sure that swapfs has enough space so that anon
02411          * reservations can still succeed. anon_resvmem() checks that the
02412          * availrmem is greater than swapfs_minfree, and the number of reserved
02413          * swap pages.  We also add a bit of extra here just to prevent
02414          * circumstances from getting really dire.
02415          */
02416         if (availrmem < swapfs_minfree + swapfs_reserve + extra)
02417                 return (1);
02418 
02419 #if defined(__i386)
02420         /*
02421          * If we're on an i386 platform, it's possible that we'll exhaust the
02422          * kernel heap space before we ever run out of available physical
02423          * memory.  Most checks of the size of the heap_area compare against
02424          * tune.t_minarmem, which is the minimum available real memory that we
02425          * can have in the system.  However, this is generally fixed at 25 pages
02426          * which is so low that it's useless.  In this comparison, we seek to
02427          * calculate the total heap-size, and reclaim if more than 3/4ths of the
02428          * heap is allocated.  (Or, in the calculation, if less than 1/4th is
02429          * free)
02430          */
02431         if (btop(vmem_size(heap_arena, VMEM_FREE)) <
02432             (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
02433                 return (1);
02434 #endif
02435 #else   /* !sun */
02436         if (kmem_used() > (kmem_size() * 3) / 4)
02437                 return (1);
02438 #endif  /* sun */
02439 
02440 #else   /* !_KERNEL */
02441         if (spa_get_random(100) == 0)
02442                 return (1);
02443 #endif
02444         return (0);
02445 }
02446 
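/*
 * Worked example for the FreeBSD branch above (assumed value): with a
 * kmem arena of kmem_size() = 1 GiB, reclamation is requested as soon as
 * kmem_used() exceeds (1 GiB * 3) / 4 = 768 MiB, in addition to the
 * pagedaemon and needfree checks earlier in the function.
 */
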
02447 extern kmem_cache_t     *zio_buf_cache[];
02448 extern kmem_cache_t     *zio_data_buf_cache[];
02449 
02450 static void
02451 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
02452 {
02453         size_t                  i;
02454         kmem_cache_t            *prev_cache = NULL;
02455         kmem_cache_t            *prev_data_cache = NULL;
02456 
02457 #ifdef _KERNEL
02458         if (arc_meta_used >= arc_meta_limit) {
02459                 /*
02460                  * We are exceeding our meta-data cache limit.
02461                  * Purge some DNLC entries to release holds on meta-data.
02462                  */
02463                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
02464         }
02465 #if defined(__i386)
02466         /*
02467          * Reclaim unused memory from all kmem caches.
02468          */
02469         kmem_reap();
02470 #endif
02471 #endif
02472 
02473         /*
02474          * An aggressive reclamation will shrink the cache size as well as
02475          * reap free buffers from the arc kmem caches.
02476          */
02477         if (strat == ARC_RECLAIM_AGGR)
02478                 arc_shrink();
02479 
02480         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
02481                 if (zio_buf_cache[i] != prev_cache) {
02482                         prev_cache = zio_buf_cache[i];
02483                         kmem_cache_reap_now(zio_buf_cache[i]);
02484                 }
02485                 if (zio_data_buf_cache[i] != prev_data_cache) {
02486                         prev_data_cache = zio_data_buf_cache[i];
02487                         kmem_cache_reap_now(zio_data_buf_cache[i]);
02488                 }
02489         }
02490         kmem_cache_reap_now(buf_cache);
02491         kmem_cache_reap_now(hdr_cache);
02492 }
02493 
02494 static void
02495 arc_reclaim_thread(void *dummy __unused)
02496 {
02497         clock_t                 growtime = 0;
02498         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
02499         callb_cpr_t             cpr;
02500 
02501         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
02502 
02503         mutex_enter(&arc_reclaim_thr_lock);
02504         while (arc_thread_exit == 0) {
02505                 if (arc_reclaim_needed()) {
02506 
02507                         if (arc_no_grow) {
02508                                 if (last_reclaim == ARC_RECLAIM_CONS) {
02509                                         last_reclaim = ARC_RECLAIM_AGGR;
02510                                 } else {
02511                                         last_reclaim = ARC_RECLAIM_CONS;
02512                                 }
02513                         } else {
02514                                 arc_no_grow = TRUE;
02515                                 last_reclaim = ARC_RECLAIM_AGGR;
02516                                 membar_producer();
02517                         }
02518 
02519                         /* reset the growth delay for every reclaim */
02520                         growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
02521 
02522                         if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
02523                                 /*
02524                                  * If needfree is TRUE, our vm_lowmem hook
02525                                  * was called; we must free some memory,
02526                                  * so switch to aggressive mode.
02527                                  */
02528                                 arc_no_grow = TRUE;
02529                                 last_reclaim = ARC_RECLAIM_AGGR;
02530                         }
02531                         arc_kmem_reap_now(last_reclaim);
02532                         arc_warm = B_TRUE;
02533 
02534                 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
02535                         arc_no_grow = FALSE;
02536                 }
02537 
02538                 arc_adjust();
02539 
02540                 if (arc_eviction_list != NULL)
02541                         arc_do_user_evicts();
02542 
02543 #ifdef _KERNEL
02544                 if (needfree) {
02545                         needfree = 0;
02546                         wakeup(&needfree);
02547                 }
02548 #endif
02549 
02550                 /* block until needed, or one second, whichever is shorter */
02551                 CALLB_CPR_SAFE_BEGIN(&cpr);
02552                 (void) cv_timedwait(&arc_reclaim_thr_cv,
02553                     &arc_reclaim_thr_lock, hz);
02554                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
02555         }
02556 
02557         arc_thread_exit = 0;
02558         cv_broadcast(&arc_reclaim_thr_cv);
02559         CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_thr_lock */
02560         thread_exit();
02561 }
02562 
02568 static void
02569 arc_adapt(int bytes, arc_state_t *state)
02570 {
02571         int mult;
02572         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
02573 
02574         if (state == arc_l2c_only)
02575                 return;
02576 
02577         ASSERT(bytes > 0);
02578         /*
02579          * Adapt the target size of the MRU list:
02580          *      - if we just hit in the MRU ghost list, then increase
02581          *        the target size of the MRU list.
02582          *      - if we just hit in the MFU ghost list, then increase
02583          *        the target size of the MFU list by decreasing the
02584          *        target size of the MRU list.
02585          */
02586         if (state == arc_mru_ghost) {
02587                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
02588                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
02589                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
02590 
02591                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
02592         } else if (state == arc_mfu_ghost) {
02593                 uint64_t delta;
02594 
02595                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
02596                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
02597                 mult = MIN(mult, 10);
02598 
02599                 delta = MIN(bytes * mult, arc_p);
02600                 arc_p = MAX(arc_p_min, arc_p - delta);
02601         }
02602         ASSERT((int64_t)arc_p >= 0);
02603 
02604         if (arc_reclaim_needed()) {
02605                 cv_signal(&arc_reclaim_thr_cv);
02606                 return;
02607         }
02608 
02609         if (arc_no_grow)
02610                 return;
02611 
02612         if (arc_c >= arc_c_max)
02613                 return;
02614 
02615         /*
02616          * If we're within (2 * maxblocksize) bytes of the target
02617          * cache size, increment the target cache size.
02618          */
02619         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
02620                 atomic_add_64(&arc_c, (int64_t)bytes);
02621                 if (arc_c > arc_c_max)
02622                         arc_c = arc_c_max;
02623                 else if (state == arc_anon)
02624                         atomic_add_64(&arc_p, (int64_t)bytes);
02625                 if (arc_p > arc_c)
02626                         arc_p = arc_c;
02627         }
02628         ASSERT((int64_t)arc_p >= 0);
02629 }
02630 
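/*
 * Worked example (assumed values): after a 128 KiB hit in the MRU ghost
 * list with arc_mru_ghost->arcs_size = 1 GiB and
 * arc_mfu_ghost->arcs_size = 3 GiB, the multiplier is 3 GiB / 1 GiB = 3,
 * so arc_p grows by 3 * 128 KiB = 384 KiB (never past arc_c - arc_p_min).
 * A hit in the MFU ghost list moves arc_p down by the analogous amount,
 * never below arc_p_min.
 */
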
02635 static int
02636 arc_evict_needed(arc_buf_contents_t type)
02637 {
02638         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
02639                 return (1);
02640 
02641 #ifdef sun
02642 #ifdef _KERNEL
02643         /*
02644          * If zio data pages are being allocated out of a separate heap segment,
02645          * then enforce that the size of available vmem for this area remains
02646          * above about 1/32nd free.
02647          */
02648         if (type == ARC_BUFC_DATA && zio_arena != NULL &&
02649             vmem_size(zio_arena, VMEM_FREE) <
02650             (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
02651                 return (1);
02652 #endif
02653 #endif  /* sun */
02654 
02655         if (arc_reclaim_needed())
02656                 return (1);
02657 
02658         return (arc_size > arc_c);
02659 }
02660 
02679 static void
02680 arc_get_data_buf(arc_buf_t *buf)
02681 {
02682         arc_state_t             *state = buf->b_hdr->b_state;
02683         uint64_t                size = buf->b_hdr->b_size;
02684         arc_buf_contents_t      type = buf->b_hdr->b_type;
02685 
02686         arc_adapt(size, state);
02687 
02688         /*
02689          * If we have not yet reached the maximum cache size,
02690          * just allocate a new buffer.
02691          */
02692         if (!arc_evict_needed(type)) {
02693                 if (type == ARC_BUFC_METADATA) {
02694                         buf->b_data = zio_buf_alloc(size);
02695                         arc_space_consume(size, ARC_SPACE_DATA);
02696                 } else {
02697                         ASSERT(type == ARC_BUFC_DATA);
02698                         buf->b_data = zio_data_buf_alloc(size);
02699                         ARCSTAT_INCR(arcstat_data_size, size);
02700                         atomic_add_64(&arc_size, size);
02701                 }
02702                 goto out;
02703         }
02704 
02705         /*
02706          * If we are prefetching from the mfu ghost list, this buffer
02707          * will end up on the mru list, so steal space from there.
02708          */
02709         if (state == arc_mfu_ghost)
02710                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
02711         else if (state == arc_mru_ghost)
02712                 state = arc_mru;
02713 
02714         if (state == arc_mru || state == arc_anon) {
02715                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
02716                 state = (arc_mfu->arcs_lsize[type] >= size &&
02717                     arc_p > mru_used) ? arc_mfu : arc_mru;
02718         } else {
02719                 /* MFU cases */
02720                 uint64_t mfu_space = arc_c - arc_p;
02721                 state =  (arc_mru->arcs_lsize[type] >= size &&
02722                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
02723         }
02724         if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
02725                 if (type == ARC_BUFC_METADATA) {
02726                         buf->b_data = zio_buf_alloc(size);
02727                         arc_space_consume(size, ARC_SPACE_DATA);
02728                 } else {
02729                         ASSERT(type == ARC_BUFC_DATA);
02730                         buf->b_data = zio_data_buf_alloc(size);
02731                         ARCSTAT_INCR(arcstat_data_size, size);
02732                         atomic_add_64(&arc_size, size);
02733                 }
02734                 ARCSTAT_BUMP(arcstat_recycle_miss);
02735         }
02736         ASSERT(buf->b_data != NULL);
02737 out:
02738         /*
02739          * Update the state size.  Note that ghost states have a
02740          * "ghost size" and so don't need to be updated.
02741          */
02742         if (!GHOST_STATE(buf->b_hdr->b_state)) {
02743                 arc_buf_hdr_t *hdr = buf->b_hdr;
02744 
02745                 atomic_add_64(&hdr->b_state->arcs_size, size);
02746                 if (list_link_active(&hdr->b_arc_node)) {
02747                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
02748                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
02749                 }
02750                 /*
02751                  * If we are growing the cache, and we are adding anonymous
02752                  * data, and we have outgrown arc_p, update arc_p
02753                  */
02754                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
02755                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
02756                         arc_p = MIN(arc_c, arc_p + size);
02757         }
02758         ARCSTAT_BUMP(arcstat_allocated);
02759 }
02760 
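/*
 * Worked example for the list selection above (assumed values): a 16 KiB
 * metadata buffer being revived from arc_mru_ghost is treated as an MRU
 * allocation.  If arc_anon + arc_mru is still below the arc_p target and
 * arc_mfu holds at least 16 KiB of evictable metadata, the space is
 * recycled from arc_mfu so the MRU side can keep growing toward arc_p;
 * otherwise it is taken from arc_mru itself.
 */
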
02766 static void
02767 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
02768 {
02769         clock_t now;
02770 
02771         ASSERT(MUTEX_HELD(hash_lock));
02772 
02773         if (buf->b_state == arc_anon) {
02774                 /*
02775                  * This buffer is not in the cache, and does not
02776                  * appear in our "ghost" list.  Add the new buffer
02777                  * to the MRU state.
02778                  */
02779 
02780                 ASSERT(buf->b_arc_access == 0);
02781                 buf->b_arc_access = ddi_get_lbolt();
02782                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
02783                 arc_change_state(arc_mru, buf, hash_lock);
02784 
02785         } else if (buf->b_state == arc_mru) {
02786                 now = ddi_get_lbolt();
02787 
02788                 /*
02789                  * If this buffer is here because of a prefetch, then either:
02790                  * - clear the flag if this is a "referencing" read
02791                  *   (any subsequent access will bump this into the MFU state).
02792                  * or
02793                  * - move the buffer to the head of the list if this is
02794                  *   another prefetch (to make it less likely to be evicted).
02795                  */
02796                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
02797                         if (refcount_count(&buf->b_refcnt) == 0) {
02798                                 ASSERT(list_link_active(&buf->b_arc_node));
02799                         } else {
02800                                 buf->b_flags &= ~ARC_PREFETCH;
02801                                 ARCSTAT_BUMP(arcstat_mru_hits);
02802                         }
02803                         buf->b_arc_access = now;
02804                         return;
02805                 }
02806 
02807                 /*
02808                  * This buffer has been "accessed" only once so far,
02809                  * but it is still in the cache. Move it to the MFU
02810                  * state.
02811                  */
02812                 if (now > buf->b_arc_access + ARC_MINTIME) {
02813                         /*
02814                          * More than 125ms have passed since we
02815                          * instantiated this buffer.  Move it to the
02816                          * most frequently used state.
02817                          */
02818                         buf->b_arc_access = now;
02819                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
02820                         arc_change_state(arc_mfu, buf, hash_lock);
02821                 }
02822                 ARCSTAT_BUMP(arcstat_mru_hits);
02823         } else if (buf->b_state == arc_mru_ghost) {
02824                 arc_state_t     *new_state;
02825                 /*
02826                  * This buffer has been "accessed" recently, but
02827                  * was evicted from the cache.  Move it to the
02828                  * MFU state.
02829                  */
02830 
02831                 if (buf->b_flags & ARC_PREFETCH) {
02832                         new_state = arc_mru;
02833                         if (refcount_count(&buf->b_refcnt) > 0)
02834                                 buf->b_flags &= ~ARC_PREFETCH;
02835                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
02836                 } else {
02837                         new_state = arc_mfu;
02838                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
02839                 }
02840 
02841                 buf->b_arc_access = ddi_get_lbolt();
02842                 arc_change_state(new_state, buf, hash_lock);
02843 
02844                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
02845         } else if (buf->b_state == arc_mfu) {
02846                 /*
02847                  * This buffer has been accessed more than once and is
02848                  * still in the cache.  Keep it in the MFU state.
02849                  *
02850                  * NOTE: an add_reference() that occurred when we did
02851                  * the arc_read() will have kicked this off the list.
02852                  * If it was a prefetch, we will explicitly move it to
02853                  * the head of the list now.
02854                  */
02855                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
02856                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
02857                         ASSERT(list_link_active(&buf->b_arc_node));
02858                 }
02859                 ARCSTAT_BUMP(arcstat_mfu_hits);
02860                 buf->b_arc_access = ddi_get_lbolt();
02861         } else if (buf->b_state == arc_mfu_ghost) {
02862                 arc_state_t     *new_state = arc_mfu;
02863                 /*
02864                  * This buffer has been accessed more than once but has
02865                  * been evicted from the cache.  Move it back to the
02866                  * MFU state.
02867                  */
02868 
02869                 if (buf->b_flags & ARC_PREFETCH) {
02870                         /*
02871                          * This is a prefetch access...
02872                          * move this block back to the MRU state.
02873                          */
02874                         ASSERT0(refcount_count(&buf->b_refcnt));
02875                         new_state = arc_mru;
02876                 }
02877 
02878                 buf->b_arc_access = ddi_get_lbolt();
02879                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
02880                 arc_change_state(new_state, buf, hash_lock);
02881 
02882                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
02883         } else if (buf->b_state == arc_l2c_only) {
02884                 /*
02885                  * This buffer is on the 2nd Level ARC.
02886                  */
02887 
02888                 buf->b_arc_access = ddi_get_lbolt();
02889                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
02890                 arc_change_state(arc_mfu, buf, hash_lock);
02891         } else {
02892                 ASSERT(!"invalid arc state");
02893         }
02894 }
02895 
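/*
 * Worked example of the transitions above (timings assumed): a block read
 * once lands in MRU; a second access more than ARC_MINTIME (roughly 125ms)
 * after the first promotes it to MFU.  If the buffer is evicted and later
 * re-read, the hit lands in the corresponding ghost list, the header is
 * revived into MRU or MFU, and arc_adapt() uses the ghost hit to resize
 * arc_p when the new data buffer is allocated.
 */
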
02899 /* ARGSUSED */
02900 void
02901 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
02902 {
02903         if (zio == NULL || zio->io_error == 0)
02904                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
02905         VERIFY(arc_buf_remove_ref(buf, arg) == 1);
02906 }
02907 
02911 void
02912 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
02913 {
02914         arc_buf_t **bufp = arg;
02915         if (zio && zio->io_error) {
02916                 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
02917                 *bufp = NULL;
02918         } else {
02919                 *bufp = buf;
02920                 ASSERT(buf->b_data);
02921         }
02922 }
02923 
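/*
 * Illustrative sketch (assumption, not part of arc.c): arc_getbuf_func()
 * is typically paired with a waiting arc_read() to fetch a single block
 * synchronously; the callback fills in the buffer pointer on success and
 * NULLs it on error.  Passing a NULL pbuf falls through to
 * arc_read_nolock(), as the code below shows.
 */
static int
example_read_block(spa_t *spa, const blkptr_t *bp, const zbookmark_t *zb,
    arc_buf_t **abufp)
{
        uint32_t aflags = ARC_WAIT;

        *abufp = NULL;
        return (arc_read(NULL, spa, bp, NULL, arc_getbuf_func, abufp,
            ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb));
}
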
02924 static void
02925 arc_read_done(zio_t *zio)
02926 {
02927         arc_buf_hdr_t   *hdr, *found;
02928         arc_buf_t       *buf;
02929         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
02930         kmutex_t        *hash_lock;
02931         arc_callback_t  *callback_list, *acb;
02932         int             freeable = FALSE;
02933 
02934         buf = zio->io_private;
02935         hdr = buf->b_hdr;
02936 
02937         /*
02938          * The hdr was inserted into the hash table and removed from lists
02939          * prior to starting I/O.  We should find this header, since
02940          * it's in the hash table, and it should be legit since it's
02941          * not possible to evict it during the I/O.  The only possible
02942          * reason for it not to be found is if we were freed during the
02943          * read.
02944          */
02945         found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
02946             &hash_lock);
02947 
02948         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
02949             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
02950             (found == hdr && HDR_L2_READING(hdr)));
02951 
02952         hdr->b_flags &= ~ARC_L2_EVICTED;
02953         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
02954                 hdr->b_flags &= ~ARC_L2CACHE;
02955 
02956         /* byteswap if necessary */
02957         callback_list = hdr->b_acb;
02958         ASSERT(callback_list != NULL);
02959         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
02960                 dmu_object_byteswap_t bswap =
02961                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
02962                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
02963                     byteswap_uint64_array :
02964                     dmu_ot_byteswap[bswap].ob_func;
02965                 func(buf->b_data, hdr->b_size);
02966         }
02967 
02968         arc_cksum_compute(buf, B_FALSE);
02969 #ifdef illumos
02970         arc_buf_watch(buf);
02971 #endif /* illumos */
02972 
02973         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
02974                 /*
02975                  * Only call arc_access on anonymous buffers.  This is because
02976                  * if we've issued an I/O for an evicted buffer, we've already
02977                  * called arc_access (to prevent any simultaneous readers from
02978                  * getting confused).
02979                  */
02980                 arc_access(hdr, hash_lock);
02981         }
02982 
02983         /* create copies of the data buffer for the callers */
02984         abuf = buf;
02985         for (acb = callback_list; acb; acb = acb->acb_next) {
02986                 if (acb->acb_done) {
02987                         if (abuf == NULL) {
02988                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
02989                                 abuf = arc_buf_clone(buf);
02990                         }
02991                         acb->acb_buf = abuf;
02992                         abuf = NULL;
02993                 }
02994         }
02995         hdr->b_acb = NULL;
02996         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
02997         ASSERT(!HDR_BUF_AVAILABLE(hdr));
02998         if (abuf == buf) {
02999                 ASSERT(buf->b_efunc == NULL);
03000                 ASSERT(hdr->b_datacnt == 1);
03001                 hdr->b_flags |= ARC_BUF_AVAILABLE;
03002         }
03003 
03004         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
03005 
03006         if (zio->io_error != 0) {
03007                 hdr->b_flags |= ARC_IO_ERROR;
03008                 if (hdr->b_state != arc_anon)
03009                         arc_change_state(arc_anon, hdr, hash_lock);
03010                 if (HDR_IN_HASH_TABLE(hdr))
03011                         buf_hash_remove(hdr);
03012                 freeable = refcount_is_zero(&hdr->b_refcnt);
03013         }
03014 
03015         /*
03016          * Broadcast before we drop the hash_lock to avoid the possibility
03017          * that the hdr (and hence the cv) might be freed before we get to
03018          * the cv_broadcast().
03019          */
03020         cv_broadcast(&hdr->b_cv);
03021 
03022         if (hash_lock) {
03023                 mutex_exit(hash_lock);
03024         } else {
03025                 /*
03026                  * This block was freed while we waited for the read to
03027                  * complete.  It has been removed from the hash table and
03028                  * moved to the anonymous state (so that it won't show up
03029                  * in the cache).
03030                  */
03031                 ASSERT3P(hdr->b_state, ==, arc_anon);
03032                 freeable = refcount_is_zero(&hdr->b_refcnt);
03033         }
03034 
03035         /* execute each callback and free its structure */
03036         while ((acb = callback_list) != NULL) {
03037                 if (acb->acb_done)
03038                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
03039 
03040                 if (acb->acb_zio_dummy != NULL) {
03041                         acb->acb_zio_dummy->io_error = zio->io_error;
03042                         zio_nowait(acb->acb_zio_dummy);
03043                 }
03044 
03045                 callback_list = acb->acb_next;
03046                 kmem_free(acb, sizeof (arc_callback_t));
03047         }
03048 
03049         if (freeable)
03050                 arc_hdr_destroy(hdr);
03051 }
03052 
03075 int
03076 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
03077     arc_done_func_t *done, void *private, int priority, int zio_flags,
03078     uint32_t *arc_flags, const zbookmark_t *zb)
03079 {
03080         int err;
03081 
03082         if (pbuf == NULL) {
03083                 /*
03084                  * XXX This happens from traverse callback funcs, for
03085                  * the objset_phys_t block.
03086                  */
03087                 return (arc_read_nolock(pio, spa, bp, done, private, priority,
03088                     zio_flags, arc_flags, zb));
03089         }
03090 
03091         ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
03092         ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
03093         rw_enter(&pbuf->b_data_lock, RW_READER);
03094 
03095         err = arc_read_nolock(pio, spa, bp, done, private, priority,
03096             zio_flags, arc_flags, zb);
03097         rw_exit(&pbuf->b_data_lock);
03098 
03099         return (err);
03100 }
03101 
03102 int
03103 arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
03104     arc_done_func_t *done, void *private, int priority, int zio_flags,
03105     uint32_t *arc_flags, const zbookmark_t *zb)
03106 {
03107         arc_buf_hdr_t *hdr;
03108         arc_buf_t *buf;
03109         kmutex_t *hash_lock;
03110         zio_t *rzio;
03111         uint64_t guid = spa_load_guid(spa);
03112 
03113 top:
03114         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
03115             &hash_lock);
03116         if (hdr && hdr->b_datacnt > 0) {
03117 
03118                 *arc_flags |= ARC_CACHED;
03119 
03120                 if (HDR_IO_IN_PROGRESS(hdr)) {
03121 
03122                         if (*arc_flags & ARC_WAIT) {
03123                                 cv_wait(&hdr->b_cv, hash_lock);
03124                                 mutex_exit(hash_lock);
03125                                 goto top;
03126                         }
03127                         ASSERT(*arc_flags & ARC_NOWAIT);
03128 
03129                         if (done) {
03130                                 arc_callback_t  *acb = NULL;
03131 
03132                                 acb = kmem_zalloc(sizeof (arc_callback_t),
03133                                     KM_SLEEP);
03134                                 acb->acb_done = done;
03135                                 acb->acb_private = private;
03136                                 if (pio != NULL)
03137                                         acb->acb_zio_dummy = zio_null(pio,
03138                                             spa, NULL, NULL, NULL, zio_flags);
03139 
03140                                 ASSERT(acb->acb_done != NULL);
03141                                 acb->acb_next = hdr->b_acb;
03142                                 hdr->b_acb = acb;
03143                                 add_reference(hdr, hash_lock, private);
03144                                 mutex_exit(hash_lock);
03145                                 return (0);
03146                         }
03147                         mutex_exit(hash_lock);
03148                         return (0);
03149                 }
03150 
03151                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
03152 
03153                 if (done) {
03154                         add_reference(hdr, hash_lock, private);
03155                         /*
03156                          * If this block is already in use, create a new
03157                          * copy of the data so that we will be guaranteed
03158                          * that arc_release() will always succeed.
03159                          */
03160                         buf = hdr->b_buf;
03161                         ASSERT(buf);
03162                         ASSERT(buf->b_data);
03163                         if (HDR_BUF_AVAILABLE(hdr)) {
03164                                 ASSERT(buf->b_efunc == NULL);
03165                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
03166                         } else {
03167                                 buf = arc_buf_clone(buf);
03168                         }
03169 
03170                 } else if (*arc_flags & ARC_PREFETCH &&
03171                     refcount_count(&hdr->b_refcnt) == 0) {
03172                         hdr->b_flags |= ARC_PREFETCH;
03173                 }
03174                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
03175                 arc_access(hdr, hash_lock);
03176                 if (*arc_flags & ARC_L2CACHE)
03177                         hdr->b_flags |= ARC_L2CACHE;
03178                 mutex_exit(hash_lock);
03179                 ARCSTAT_BUMP(arcstat_hits);
03180                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
03181                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
03182                     data, metadata, hits);
03183 
03184                 if (done)
03185                         done(NULL, buf, private);
03186         } else {
03187                 uint64_t size = BP_GET_LSIZE(bp);
03188                 arc_callback_t  *acb;
03189                 vdev_t *vd = NULL;
03190                 uint64_t addr;
03191                 boolean_t devw = B_FALSE;
03192 
03193                 if (hdr == NULL) {
03194                         /* this block is not in the cache */
03195                         arc_buf_hdr_t   *exists;
03196                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
03197                         buf = arc_buf_alloc(spa, size, private, type);
03198                         hdr = buf->b_hdr;
03199                         hdr->b_dva = *BP_IDENTITY(bp);
03200                         hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
03201                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
03202                         exists = buf_hash_insert(hdr, &hash_lock);
03203                         if (exists) {
03204                                 /* somebody beat us to the hash insert */
03205                                 mutex_exit(hash_lock);
03206                                 buf_discard_identity(hdr);
03207                                 (void) arc_buf_remove_ref(buf, private);
03208                                 goto top; /* restart the IO request */
03209                         }
03210                         /* if this is a prefetch, we don't have a reference */
03211                         if (*arc_flags & ARC_PREFETCH) {
03212                                 (void) remove_reference(hdr, hash_lock,
03213                                     private);
03214                                 hdr->b_flags |= ARC_PREFETCH;
03215                         }
03216                         if (*arc_flags & ARC_L2CACHE)
03217                                 hdr->b_flags |= ARC_L2CACHE;
03218                         if (BP_GET_LEVEL(bp) > 0)
03219                                 hdr->b_flags |= ARC_INDIRECT;
03220                 } else {
03221                         /* this block is in the ghost cache */
03222                         ASSERT(GHOST_STATE(hdr->b_state));
03223                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
03224                         ASSERT0(refcount_count(&hdr->b_refcnt));
03225                         ASSERT(hdr->b_buf == NULL);
03226 
03227                         /* if this is a prefetch, we don't have a reference */
03228                         if (*arc_flags & ARC_PREFETCH)
03229                                 hdr->b_flags |= ARC_PREFETCH;
03230                         else
03231                                 add_reference(hdr, hash_lock, private);
03232                         if (*arc_flags & ARC_L2CACHE)
03233                                 hdr->b_flags |= ARC_L2CACHE;
03234                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
03235                         buf->b_hdr = hdr;
03236                         buf->b_data = NULL;
03237                         buf->b_efunc = NULL;
03238                         buf->b_private = NULL;
03239                         buf->b_next = NULL;
03240                         hdr->b_buf = buf;
03241                         ASSERT(hdr->b_datacnt == 0);
03242                         hdr->b_datacnt = 1;
03243                         arc_get_data_buf(buf);
03244                         arc_access(hdr, hash_lock);
03245                 }
03246 
03247                 ASSERT(!GHOST_STATE(hdr->b_state));
03248 
03249                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
03250                 acb->acb_done = done;
03251                 acb->acb_private = private;
03252 
03253                 ASSERT(hdr->b_acb == NULL);
03254                 hdr->b_acb = acb;
03255                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
03256 
03257                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
03258                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
03259                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
03260                         addr = hdr->b_l2hdr->b_daddr;
03261                         /*
03262                          * Lock out device removal.
03263                          */
03264                         if (vdev_is_dead(vd) ||
03265                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
03266                                 vd = NULL;
03267                 }
03268 
03269                 mutex_exit(hash_lock);
03270 
03271                 /*
03272                  * At this point, we have a level 1 cache miss.  Try again in
03273                  * L2ARC if possible.
03274                  */
03275                 ASSERT3U(hdr->b_size, ==, size);
03276                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
03277                     uint64_t, size, zbookmark_t *, zb);
03278                 ARCSTAT_BUMP(arcstat_misses);
03279                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
03280                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
03281                     data, metadata, misses);
03282 #ifdef _KERNEL
03283                 curthread->td_ru.ru_inblock++;
03284 #endif
03285 
03286                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
03287                         /*
03288                          * Read from the L2ARC if the following are true:
03289                          * 1. The L2ARC vdev was previously cached.
03290                          * 2. This buffer still has L2ARC metadata.
03291                          * 3. This buffer isn't currently writing to the L2ARC.
03292                          * 4. The L2ARC entry wasn't evicted, which may
03293                          *    also have invalidated the vdev.
03294                          * 5. This isn't a prefetch with l2arc_noprefetch enabled.
03295                          */
03296                         if (hdr->b_l2hdr != NULL &&
03297                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
03298                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
03299                                 l2arc_read_callback_t *cb;
03300 
03301                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
03302                                 ARCSTAT_BUMP(arcstat_l2_hits);
03303 
03304                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
03305                                     KM_SLEEP);
03306                                 cb->l2rcb_buf = buf;
03307                                 cb->l2rcb_spa = spa;
03308                                 cb->l2rcb_bp = *bp;
03309                                 cb->l2rcb_zb = *zb;
03310                                 cb->l2rcb_flags = zio_flags;
03311 
03312                                 /*
03313                                  * l2arc read.  The SCL_L2ARC lock will be
03314                                  * released by l2arc_read_done().
03315                                  */
03316                                 rzio = zio_read_phys(pio, vd, addr, size,
03317                                     buf->b_data, ZIO_CHECKSUM_OFF,
03318                                     l2arc_read_done, cb, priority, zio_flags |
03319                                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
03320                                     ZIO_FLAG_DONT_PROPAGATE |
03321                                     ZIO_FLAG_DONT_RETRY, B_FALSE);
03322                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
03323                                     zio_t *, rzio);
03324                                 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
03325 
03326                                 if (*arc_flags & ARC_NOWAIT) {
03327                                         zio_nowait(rzio);
03328                                         return (0);
03329                                 }
03330 
03331                                 ASSERT(*arc_flags & ARC_WAIT);
03332                                 if (zio_wait(rzio) == 0)
03333                                         return (0);
03334 
03335                                 /* l2arc read error; fall back to zio_read() below */
03336                         } else {
03337                                 DTRACE_PROBE1(l2arc__miss,
03338                                     arc_buf_hdr_t *, hdr);
03339                                 ARCSTAT_BUMP(arcstat_l2_misses);
03340                                 if (HDR_L2_WRITING(hdr))
03341                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
03342                                 spa_config_exit(spa, SCL_L2ARC, vd);
03343                         }
03344                 } else {
03345                         if (vd != NULL)
03346                                 spa_config_exit(spa, SCL_L2ARC, vd);
03347                         if (l2arc_ndev != 0) {
03348                                 DTRACE_PROBE1(l2arc__miss,
03349                                     arc_buf_hdr_t *, hdr);
03350                                 ARCSTAT_BUMP(arcstat_l2_misses);
03351                         }
03352                 }
03353 
03354                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
03355                     arc_read_done, buf, priority, zio_flags, zb);
03356 
03357                 if (*arc_flags & ARC_WAIT)
03358                         return (zio_wait(rzio));
03359 
03360                 ASSERT(*arc_flags & ARC_NOWAIT);
03361                 zio_nowait(rzio);
03362         }
03363         return (0);
03364 }
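
Usage illustration (not part of arc.c): a prefetch, by contrast, is issued
with no done callback and ARC_NOWAIT, so the caller never touches the buffer
and the ARC is free to evict it later.  A sketch using the same assumed zio
constants as above:

        uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;

        (void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
            ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
            &aflags, zb);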
03365 
03366 void
03367 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
03368 {
03369         ASSERT(buf->b_hdr != NULL);
03370         ASSERT(buf->b_hdr->b_state != arc_anon);
03371         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
03372         ASSERT(buf->b_efunc == NULL);
03373         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
03374 
03375         buf->b_efunc = func;
03376         buf->b_private = private;
03377 }
03378 
03384 int
03385 arc_buf_evict(arc_buf_t *buf)
03386 {
03387         arc_buf_hdr_t *hdr;
03388         kmutex_t *hash_lock;
03389         arc_buf_t **bufp;
03390         list_t *list, *evicted_list;
03391         kmutex_t *lock, *evicted_lock;
03392 
03393         mutex_enter(&buf->b_evict_lock);
03394         hdr = buf->b_hdr;
03395         if (hdr == NULL) {
03396                 /*
03397                  * We are in arc_do_user_evicts().
03398                  */
03399                 ASSERT(buf->b_data == NULL);
03400                 mutex_exit(&buf->b_evict_lock);
03401                 return (0);
03402         } else if (buf->b_data == NULL) {
03403                 arc_buf_t copy = *buf; /* structure assignment */
03404                 /*
03405                  * We are on the eviction list; process this buffer now
03406                  * but let arc_do_user_evicts() do the reaping.
03407                  */
03408                 buf->b_efunc = NULL;
03409                 mutex_exit(&buf->b_evict_lock);
03410                 VERIFY(copy.b_efunc(&copy) == 0);
03411                 return (1);
03412         }
03413         hash_lock = HDR_LOCK(hdr);
03414         mutex_enter(hash_lock);
03415         hdr = buf->b_hdr;
03416         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
03417 
03418         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
03419         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
03420 
03421         /*
03422          * Pull this buffer off of the hdr
03423          */
03424         bufp = &hdr->b_buf;
03425         while (*bufp != buf)
03426                 bufp = &(*bufp)->b_next;
03427         *bufp = buf->b_next;
03428 
03429         ASSERT(buf->b_data != NULL);
03430         arc_buf_destroy(buf, FALSE, FALSE);
03431 
03432         if (hdr->b_datacnt == 0) {
03433                 arc_state_t *old_state = hdr->b_state;
03434                 arc_state_t *evicted_state;
03435 
03436                 ASSERT(hdr->b_buf == NULL);
03437                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
03438 
03439                 evicted_state =
03440                     (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
03441 
03442                 get_buf_info(hdr, old_state, &list, &lock);
03443                 get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
03444                 mutex_enter(lock);
03445                 mutex_enter(evicted_lock);
03446 
03447                 arc_change_state(evicted_state, hdr, hash_lock);
03448                 ASSERT(HDR_IN_HASH_TABLE(hdr));
03449                 hdr->b_flags |= ARC_IN_HASH_TABLE;
03450                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
03451 
03452                 mutex_exit(evicted_lock);
03453                 mutex_exit(lock);
03454         }
03455         mutex_exit(hash_lock);
03456         mutex_exit(&buf->b_evict_lock);
03457 
03458         VERIFY(buf->b_efunc(buf) == 0);
03459         buf->b_efunc = NULL;
03460         buf->b_private = NULL;
03461         buf->b_hdr = NULL;
03462         buf->b_next = NULL;
03463         kmem_cache_free(buf_cache, buf);
03464         return (1);
03465 }
03466 
03473 void
03474 arc_release(arc_buf_t *buf, void *tag)
03475 {
03476         arc_buf_hdr_t *hdr;
03477         kmutex_t *hash_lock = NULL;
03478         l2arc_buf_hdr_t *l2hdr;
03479         uint64_t buf_size;
03480 
03481         /*
03482          * It would be nice to assert that if it's DMU metadata (level >
03483          * 0 || it's the dnode file), then it must be syncing context.
03484          * But we don't know that information at this level.
03485          */
03486 
03487         mutex_enter(&buf->b_evict_lock);
03488         hdr = buf->b_hdr;
03489 
03490         /* this buffer is not on any list */
03491         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
03492 
03493         if (hdr->b_state == arc_anon) {
03494                 /* this buffer is already released */
03495                 ASSERT(buf->b_efunc == NULL);
03496         } else {
03497                 hash_lock = HDR_LOCK(hdr);
03498                 mutex_enter(hash_lock);
03499                 hdr = buf->b_hdr;
03500                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
03501         }
03502 
03503         l2hdr = hdr->b_l2hdr;
03504         if (l2hdr) {
03505                 mutex_enter(&l2arc_buflist_mtx);
03506                 hdr->b_l2hdr = NULL;
03507                 buf_size = hdr->b_size;
03508         }
03509 
03510         /*
03511          * Do we have more than one buf?
03512          */
03513         if (hdr->b_datacnt > 1) {
03514                 arc_buf_hdr_t *nhdr;
03515                 arc_buf_t **bufp;
03516                 uint64_t blksz = hdr->b_size;
03517                 uint64_t spa = hdr->b_spa;
03518                 arc_buf_contents_t type = hdr->b_type;
03519                 uint32_t flags = hdr->b_flags;
03520 
03521                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
03522                 /*
03523                  * Pull the data off of this hdr and attach it to
03524                  * a new anonymous hdr.
03525                  */
03526                 (void) remove_reference(hdr, hash_lock, tag);
03527                 bufp = &hdr->b_buf;
03528                 while (*bufp != buf)
03529                         bufp = &(*bufp)->b_next;
03530                 *bufp = buf->b_next;
03531                 buf->b_next = NULL;
03532 
03533                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
03534                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
03535                 if (refcount_is_zero(&hdr->b_refcnt)) {
03536                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
03537                         ASSERT3U(*size, >=, hdr->b_size);
03538                         atomic_add_64(size, -hdr->b_size);
03539                 }
03540 
03541                 /*
03542                  * We're releasing a duplicate user data buffer, update
03543                  * our statistics accordingly.
03544                  */
03545                 if (hdr->b_type == ARC_BUFC_DATA) {
03546                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
03547                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
03548                             -hdr->b_size);
03549                 }
03550                 hdr->b_datacnt -= 1;
03551                 arc_cksum_verify(buf);
03552 #ifdef illumos
03553                 arc_buf_unwatch(buf);
03554 #endif /* illumos */
03555 
03556                 mutex_exit(hash_lock);
03557 
03558                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
03559                 nhdr->b_size = blksz;
03560                 nhdr->b_spa = spa;
03561                 nhdr->b_type = type;
03562                 nhdr->b_buf = buf;
03563                 nhdr->b_state = arc_anon;
03564                 nhdr->b_arc_access = 0;
03565                 nhdr->b_flags = flags & ARC_L2_WRITING;
03566                 nhdr->b_l2hdr = NULL;
03567                 nhdr->b_datacnt = 1;
03568                 nhdr->b_freeze_cksum = NULL;
03569                 (void) refcount_add(&nhdr->b_refcnt, tag);
03570                 buf->b_hdr = nhdr;
03571                 mutex_exit(&buf->b_evict_lock);
03572                 atomic_add_64(&arc_anon->arcs_size, blksz);
03573         } else {
03574                 mutex_exit(&buf->b_evict_lock);
03575                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
03576                 ASSERT(!list_link_active(&hdr->b_arc_node));
03577                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
03578                 if (hdr->b_state != arc_anon)
03579                         arc_change_state(arc_anon, hdr, hash_lock);
03580                 hdr->b_arc_access = 0;
03581                 if (hash_lock)
03582                         mutex_exit(hash_lock);
03583 
03584                 buf_discard_identity(hdr);
03585                 arc_buf_thaw(buf);
03586         }
03587         buf->b_efunc = NULL;
03588         buf->b_private = NULL;
03589 
03590         if (l2hdr) {
03591                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
03592                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
03593                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
03594                 mutex_exit(&l2arc_buflist_mtx);
03595         }
03596 }
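
Usage illustration (not part of arc.c): a caller that wants to modify a
cached, possibly shared buffer in place first detaches it into the anonymous
state (db_buf and db_tag are hypothetical names):

        arc_release(db_buf, db_tag);
        ASSERT(arc_released(db_buf));
        /* db_buf->b_data may now be overwritten without affecting other readers */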
03597 
03602 /* ARGSUSED */
03603 int
03604 arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
03605     zbookmark_t *zb)
03606 {
03607         arc_release(buf, tag);
03608         return (0);
03609 }
03610 
03611 int
03612 arc_released(arc_buf_t *buf)
03613 {
03614         int released;
03615 
03616         mutex_enter(&buf->b_evict_lock);
03617         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
03618         mutex_exit(&buf->b_evict_lock);
03619         return (released);
03620 }
03621 
03622 int
03623 arc_has_callback(arc_buf_t *buf)
03624 {
03625         int callback;
03626 
03627         mutex_enter(&buf->b_evict_lock);
03628         callback = (buf->b_efunc != NULL);
03629         mutex_exit(&buf->b_evict_lock);
03630         return (callback);
03631 }
03632 
03633 #ifdef ZFS_DEBUG
03634 int
03635 arc_referenced(arc_buf_t *buf)
03636 {
03637         int referenced;
03638 
03639         mutex_enter(&buf->b_evict_lock);
03640         referenced = (refcount_count(&buf->b_hdr->b_refcnt));
03641         mutex_exit(&buf->b_evict_lock);
03642         return (referenced);
03643 }
03644 #endif
03645 
03646 static void
03647 arc_write_ready(zio_t *zio)
03648 {
03649         arc_write_callback_t *callback = zio->io_private;
03650         arc_buf_t *buf = callback->awcb_buf;
03651         arc_buf_hdr_t *hdr = buf->b_hdr;
03652 
03653         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
03654         callback->awcb_ready(zio, buf, callback->awcb_private);
03655 
03656         /*
03657          * If the IO is already in progress, then this is a re-write
03658          * attempt, so we need to thaw and re-compute the cksum.
03659          * It is the responsibility of the callback to handle the
03660          * accounting for any re-write attempt.
03661          */
03662         if (HDR_IO_IN_PROGRESS(hdr)) {
03663                 mutex_enter(&hdr->b_freeze_lock);
03664                 if (hdr->b_freeze_cksum != NULL) {
03665                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
03666                         hdr->b_freeze_cksum = NULL;
03667                 }
03668                 mutex_exit(&hdr->b_freeze_lock);
03669         }
03670         arc_cksum_compute(buf, B_FALSE);
03671         hdr->b_flags |= ARC_IO_IN_PROGRESS;
03672 }
03673 
03674 static void
03675 arc_write_done(zio_t *zio)
03676 {
03677         arc_write_callback_t *callback = zio->io_private;
03678         arc_buf_t *buf = callback->awcb_buf;
03679         arc_buf_hdr_t *hdr = buf->b_hdr;
03680 
03681         ASSERT(hdr->b_acb == NULL);
03682 
03683         if (zio->io_error == 0) {
03684                 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
03685                 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
03686                 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
03687         } else {
03688                 ASSERT(BUF_EMPTY(hdr));
03689         }
03690 
03691         /*
03692          * If the block to be written was all-zero, we may have
03693          * compressed it away.  In this case no write was performed
03694          * so there will be no dva/birth/checksum.  The buffer must
03695          * therefore remain anonymous (and uncached).
03696          */
03697         if (!BUF_EMPTY(hdr)) {
03698                 arc_buf_hdr_t *exists;
03699                 kmutex_t *hash_lock;
03700 
03701                 ASSERT(zio->io_error == 0);
03702 
03703                 arc_cksum_verify(buf);
03704 
03705                 exists = buf_hash_insert(hdr, &hash_lock);
03706                 if (exists) {
03707                         /*
03708                          * This can only happen if we overwrite for
03709                          * sync-to-convergence, because we remove
03710                          * buffers from the hash table when we arc_free().
03711                          */
03712                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
03713                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
03714                                         panic("bad overwrite, hdr=%p exists=%p",
03715                                             (void *)hdr, (void *)exists);
03716                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
03717                                 arc_change_state(arc_anon, exists, hash_lock);
03718                                 mutex_exit(hash_lock);
03719                                 arc_hdr_destroy(exists);
03720                                 exists = buf_hash_insert(hdr, &hash_lock);
03721                                 ASSERT3P(exists, ==, NULL);
03722                         } else {
03723                                 /* Dedup */
03724                                 ASSERT(hdr->b_datacnt == 1);
03725                                 ASSERT(hdr->b_state == arc_anon);
03726                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
03727                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
03728                         }
03729                 }
03730                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
03731                 /* if it's not anon, we are doing a scrub */
03732                 if (!exists && hdr->b_state == arc_anon)
03733                         arc_access(hdr, hash_lock);
03734                 mutex_exit(hash_lock);
03735         } else {
03736                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
03737         }
03738 
03739         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
03740         callback->awcb_done(zio, buf, callback->awcb_private);
03741 
03742         kmem_free(callback, sizeof (arc_write_callback_t));
03743 }
03744 
03745 zio_t *
03746 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
03747     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
03748     arc_done_func_t *ready, arc_done_func_t *done, void *private,
03749     int priority, int zio_flags, const zbookmark_t *zb)
03750 {
03751         arc_buf_hdr_t *hdr = buf->b_hdr;
03752         arc_write_callback_t *callback;
03753         zio_t *zio;
03754 
03755         ASSERT(ready != NULL);
03756         ASSERT(done != NULL);
03757         ASSERT(!HDR_IO_ERROR(hdr));
03758         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
03759         ASSERT(hdr->b_acb == NULL);
03760         if (l2arc)
03761                 hdr->b_flags |= ARC_L2CACHE;
03762         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
03763         callback->awcb_ready = ready;
03764         callback->awcb_done = done;
03765         callback->awcb_private = private;
03766         callback->awcb_buf = buf;
03767 
03768         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
03769             arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
03770 
03771         return (zio);
03772 }
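
Usage illustration (not part of arc.c): the ready callback fires once the
buffer contents are final (arc_write_ready() above), the done callback after
the block has been written out (arc_write_done()).  A hedged sketch; the zio
priority/flag constants and the lower-case identifiers are assumptions, not
definitions from this file:

        static void
        my_write_ready(zio_t *zio, arc_buf_t *buf, void *arg)
        {
                /* buffer contents are final; record bookkeeping here */
        }

        static void
        my_write_done(zio_t *zio, arc_buf_t *buf, void *arg)
        {
                /* zio->io_error reports whether the block reached stable storage */
        }

        /* ... in a hypothetical syncing-context function ... */
        zio_t *wzio = arc_write(pio, spa, txg, bp, buf, B_TRUE /* l2arc */,
            &zp, my_write_ready, my_write_done, my_state,
            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
        zio_nowait(wzio);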
03773 
03774 static int
03775 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
03776 {
03777 #ifdef _KERNEL
03778         uint64_t available_memory =
03779             ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
03780         static uint64_t page_load = 0;
03781         static uint64_t last_txg = 0;
03782 
03783 #ifdef sun
03784 #if defined(__i386)
03785         available_memory =
03786             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
03787 #endif
03788 #endif  /* sun */
03789         if (available_memory >= zfs_write_limit_max)
03790                 return (0);
03791 
03792         if (txg > last_txg) {
03793                 last_txg = txg;
03794                 page_load = 0;
03795         }
03796         /*
03797          * If we are in pageout, we know that memory is already tight,
03798          * the arc is already going to be evicting, so we just want to
03799          * continue to let page writes occur as quickly as possible.
03800          */
03801         if (curproc == pageproc) {
03802                 if (page_load > available_memory / 4)
03803                         return (ERESTART);
03804                 /* Note: reserve is inflated, so we deflate */
03805                 page_load += reserve / 8;
03806                 return (0);
03807         } else if (page_load > 0 && arc_reclaim_needed()) {
03808                 /* memory is low, delay before restarting */
03809                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
03810                 return (EAGAIN);
03811         }
03812         page_load = 0;
03813 
03814         if (arc_size > arc_c_min) {
03815                 uint64_t evictable_memory =
03816                     arc_mru->arcs_lsize[ARC_BUFC_DATA] +
03817                     arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
03818                     arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
03819                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
03820                 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
03821         }
03822 
03823         if (inflight_data > available_memory / 4) {
03824                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
03825                 return (ERESTART);
03826         }
03827 #endif
03828         return (0);
03829 }
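
Worked example (numbers illustrative): with 1GB of free+cache pages and
zfs_write_limit_max above that, a non-pageout writer is throttled with
ERESTART once in-flight dirty data exceeds 256MB, i.e. one quarter of
available memory, where "available" also counts evictable MRU/MFU data while
the ARC is above arc_c_min; the pageout process instead tracks its own
page_load and backs off at the same quarter-of-memory point.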
03830 
03831 void
03832 arc_tempreserve_clear(uint64_t reserve)
03833 {
03834         atomic_add_64(&arc_tempreserve, -reserve);
03835         ASSERT((int64_t)arc_tempreserve >= 0);
03836 }
03837 
03838 int
03839 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
03840 {
03841         int error;
03842         uint64_t anon_size;
03843 
03844 #ifdef ZFS_DEBUG
03845         /*
03846          * Once in a while, fail for no reason.  Everything should cope.
03847          */
03848         if (spa_get_random(10000) == 0) {
03849                 dprintf("forcing random failure\n");
03850                 return (ERESTART);
03851         }
03852 #endif
03853         if (reserve > arc_c/4 && !arc_no_grow)
03854                 arc_c = MIN(arc_c_max, reserve * 4);
03855         if (reserve > arc_c)
03856                 return (ENOMEM);
03857 
03858         /*
03859          * Don't count loaned bufs as in flight dirty data to prevent long
03860          * network delays from blocking transactions that are ready to be
03861          * assigned to a txg.
03862          */
03863         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
03864 
03865         /*
03866          * Writes will almost always require additional memory allocations
03867          * in order to compress/encrypt/etc. the data.  We therefore need to
03868          * make sure that there is sufficient available memory for this.
03869          */
03870         if (error = arc_memory_throttle(reserve, anon_size, txg))
03871                 return (error);
03872 
03873         /*
03874          * Throttle writes when the amount of dirty data in the cache
03875          * gets too large.  We try to keep the cache less than half full
03876          * of dirty blocks so that our sync times don't grow too large.
03877          * Note: if two requests come in concurrently, we might let them
03878          * both succeed, when one of them should fail.  Not a huge deal.
03879          */
03880 
03881         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
03882             anon_size > arc_c / 4) {
03883                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
03884                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
03885                     arc_tempreserve>>10,
03886                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
03887                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
03888                     reserve>>10, arc_c>>10);
03889                 return (ERESTART);
03890         }
03891         atomic_add_64(&arc_tempreserve, reserve);
03892         return (0);
03893 }
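
Usage illustration (not part of arc.c): reservations are made per transaction
and must be released with arc_tempreserve_clear() for the same byte count once
the dirty data has been accounted for.  A sketch modeled on the DMU
transaction path (nbytes and tx->tx_txg are assumed caller state):

        int err = arc_tempreserve_space(nbytes, tx->tx_txg);
        if (err != 0)
                return (err);   /* ERESTART/EAGAIN/ENOMEM: caller backs off */
        /* ... dirty the buffers belonging to this transaction ... */
        arc_tempreserve_clear(nbytes);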
03894 
03895 static kmutex_t arc_lowmem_lock;
03896 #ifdef _KERNEL
03897 static eventhandler_tag arc_event_lowmem = NULL;
03898 
03899 static void
03900 arc_lowmem(void *arg __unused, int howto __unused)
03901 {
03902 
03903         /* Serialize access via arc_lowmem_lock. */
03904         mutex_enter(&arc_lowmem_lock);
03905         mutex_enter(&arc_reclaim_thr_lock);
03906         needfree = 1;
03907         cv_signal(&arc_reclaim_thr_cv);
03908 
03909         /*
03910          * It is unsafe to block here in arbitrary threads, because we can come
03911          * here from ARC itself and may hold ARC locks and thus risk a deadlock
03912          * with the ARC reclaim thread.
03913          */
03914         if (curproc == pageproc) {
03915                 while (needfree)
03916                         msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
03917         }
03918         mutex_exit(&arc_reclaim_thr_lock);
03919         mutex_exit(&arc_lowmem_lock);
03920 }
03921 #endif
03922 
03923 void
03924 arc_init(void)
03925 {
03926         int i, prefetch_tunable_set = 0;
03927 
03928         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
03929         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
03930         mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
03931 
03932         /* Convert seconds to clock ticks */
03933         arc_min_prefetch_lifespan = 1 * hz;
03934 
03935         /* Start out with 1/8 of all memory */
03936         arc_c = kmem_size() / 8;
03937 
03938 #ifdef sun
03939 #ifdef _KERNEL
03940         /*
03941          * On architectures where the physical memory can be larger
03942          * than the addressable space (intel in 32-bit mode), we may
03943          * need to limit the cache to 1/8 of VM size.
03944          */
03945         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
03946 #endif
03947 #endif  /* sun */
03948         /* set min cache to 1/32 of all memory, or 16MB, whichever is more */
03949         arc_c_min = MAX(arc_c / 4, 64<<18);
03950         /* set max to 5/8 of all memory, or all but 1GB, whichever is more */
03951         if (arc_c * 8 >= 1<<30)
03952                 arc_c_max = (arc_c * 8) - (1<<30);
03953         else
03954                 arc_c_max = arc_c_min;
03955         arc_c_max = MAX(arc_c * 5, arc_c_max);
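        /*
         * Worked example (illustrative, assuming 16GB of kmem): arc_c starts
         * at 2GB, so arc_c_min = MAX(512MB, 16MB) = 512MB, and arc_c_max =
         * MAX(5 * 2GB, 16GB - 1GB) = 15GB before the tunable overrides below.
         */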
03956 
03957 #ifdef _KERNEL
03958         /*
03959          * Allow the tunables to override our calculations if they are
03960          * reasonable (i.e., over 16MB)
03961          */
03962         if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
03963                 arc_c_max = zfs_arc_max;
03964         if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
03965                 arc_c_min = zfs_arc_min;
03966 #endif
03967 
03968         arc_c = arc_c_max;
03969         arc_p = (arc_c >> 1);
03970 
03971         /* limit meta-data to 1/4 of the arc capacity */
03972         arc_meta_limit = arc_c_max / 4;
03973 
03974         /* Allow the tunable to override if it is reasonable */
03975         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
03976                 arc_meta_limit = zfs_arc_meta_limit;
03977 
03978         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
03979                 arc_c_min = arc_meta_limit / 2;
03980 
03981         if (zfs_arc_grow_retry > 0)
03982                 arc_grow_retry = zfs_arc_grow_retry;
03983 
03984         if (zfs_arc_shrink_shift > 0)
03985                 arc_shrink_shift = zfs_arc_shrink_shift;
03986 
03987         if (zfs_arc_p_min_shift > 0)
03988                 arc_p_min_shift = zfs_arc_p_min_shift;
03989 
03990         /* if kmem_flags are set, let's try to use less memory */
03991         if (kmem_debugging())
03992                 arc_c = arc_c / 2;
03993         if (arc_c < arc_c_min)
03994                 arc_c = arc_c_min;
03995 
03996         zfs_arc_min = arc_c_min;
03997         zfs_arc_max = arc_c_max;
03998 
03999         arc_anon = &ARC_anon;
04000         arc_mru = &ARC_mru;
04001         arc_mru_ghost = &ARC_mru_ghost;
04002         arc_mfu = &ARC_mfu;
04003         arc_mfu_ghost = &ARC_mfu_ghost;
04004         arc_l2c_only = &ARC_l2c_only;
04005         arc_size = 0;
04006 
04007         for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
04008                 mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
04009                     NULL, MUTEX_DEFAULT, NULL);
04010                 mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
04011                     NULL, MUTEX_DEFAULT, NULL);
04012                 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
04013                     NULL, MUTEX_DEFAULT, NULL);
04014                 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
04015                     NULL, MUTEX_DEFAULT, NULL);
04016                 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
04017                     NULL, MUTEX_DEFAULT, NULL);
04018                 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
04019                     NULL, MUTEX_DEFAULT, NULL);
04020 
04021                 list_create(&arc_mru->arcs_lists[i],
04022                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
04023                 list_create(&arc_mru_ghost->arcs_lists[i],
04024                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
04025                 list_create(&arc_mfu->arcs_lists[i],
04026                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
04027                 list_create(&arc_mfu_ghost->arcs_lists[i],
04028                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
04031                 list_create(&arc_l2c_only->arcs_lists[i],
04032                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
04033         }
04034 
04035         buf_init();
04036 
04037         arc_thread_exit = 0;
04038         arc_eviction_list = NULL;
04039         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
04040         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
04041 
04042         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
04043             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
04044 
04045         if (arc_ksp != NULL) {
04046                 arc_ksp->ks_data = &arc_stats;
04047                 kstat_install(arc_ksp);
04048         }
04049 
04050         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
04051             TS_RUN, minclsyspri);
04052 
04053 #ifdef _KERNEL
04054         arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
04055             EVENTHANDLER_PRI_FIRST);
04056 #endif
04057 
04058         arc_dead = FALSE;
04059         arc_warm = B_FALSE;
04060 
04061         if (zfs_write_limit_max == 0)
04062                 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
04063         else
04064                 zfs_write_limit_shift = 0;
04065         mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
04066 
04067 #ifdef _KERNEL
04068         if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
04069                 prefetch_tunable_set = 1;
04070 
04071 #ifdef __i386__
04072         if (prefetch_tunable_set == 0) {
04073                 printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
04074                     "-- to enable,\n");
04075                 printf("            add \"vfs.zfs.prefetch_disable=0\" "
04076                     "to /boot/loader.conf.\n");
04077                 zfs_prefetch_disable = 1;
04078         }
04079 #else
04080         if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
04081             prefetch_tunable_set == 0) {
04082                 printf("ZFS NOTICE: Prefetch is disabled by default if less "
04083                     "than 4GB of RAM is present;\n"
04084                     "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
04085                     "to /boot/loader.conf.\n");
04086                 zfs_prefetch_disable = 1;
04087         }
04088 #endif
04089         /* Warn about ZFS memory and address space requirements. */
04090         if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
04091                 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
04092                     "expect unstable behavior.\n");
04093         }
04094         if (kmem_size() < 512 * (1 << 20)) {
04095                 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
04096                     "expect unstable behavior.\n");
04097                 printf("             Consider tuning vm.kmem_size and "
04098                     "vm.kmem_size_max\n");
04099                 printf("             in /boot/loader.conf.\n");
04100         }
04101 #endif
04102 }
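
Configuration illustration (not part of arc.c): the tunables referenced in
the notices above are read from /boot/loader.conf at boot; an illustrative
fragment with arbitrarily chosen byte values:

        # /boot/loader.conf -- example values only
        vfs.zfs.prefetch_disable=0
        vfs.zfs.arc_max="4294967296"    # 4GB; must be below the kmem size to take effect
        vfs.zfs.arc_min="536870912"     # 512MB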
04103 
04104 void
04105 arc_fini(void)
04106 {
04107         int i;
04108 
04109         mutex_enter(&arc_reclaim_thr_lock);
04110         arc_thread_exit = 1;
04111         cv_signal(&arc_reclaim_thr_cv);
04112         while (arc_thread_exit != 0)
04113                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
04114         mutex_exit(&arc_reclaim_thr_lock);
04115 
04116         arc_flush(NULL);
04117 
04118         arc_dead = TRUE;
04119 
04120         if (arc_ksp != NULL) {
04121                 kstat_delete(arc_ksp);
04122                 arc_ksp = NULL;
04123         }
04124 
04125         mutex_destroy(&arc_eviction_mtx);
04126         mutex_destroy(&arc_reclaim_thr_lock);
04127         cv_destroy(&arc_reclaim_thr_cv);
04128 
04129         for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
04130                 list_destroy(&arc_mru->arcs_lists[i]);
04131                 list_destroy(&arc_mru_ghost->arcs_lists[i]);
04132                 list_destroy(&arc_mfu->arcs_lists[i]);
04133                 list_destroy(&arc_mfu_ghost->arcs_lists[i]);
04134                 list_destroy(&arc_l2c_only->arcs_lists[i]);
04135 
04136                 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
04137                 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
04138                 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
04139                 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
04140                 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
04141                 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
04142         }
04143 
04144         mutex_destroy(&zfs_write_limit_lock);
04145 
04146         buf_fini();
04147 
04148         ASSERT(arc_loaned_bytes == 0);
04149 
04150         mutex_destroy(&arc_lowmem_lock);
04151 #ifdef _KERNEL
04152         if (arc_event_lowmem != NULL)
04153                 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
04154 #endif
04155 }
04156 
04288 static boolean_t
04289 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
04290 {
04291         /*
04292          * A buffer is *not* eligible for the L2ARC if it:
04293          * 1. belongs to a different spa.
04294          * 2. is already cached on the L2ARC.
04295          * 3. has an I/O in progress (it may be an incomplete read).
04296          * 4. is flagged not eligible (zfs property).
04297          */
04298         if (ab->b_spa != spa_guid) {
04299                 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
04300                 return (B_FALSE);
04301         }
04302         if (ab->b_l2hdr != NULL) {
04303                 ARCSTAT_BUMP(arcstat_l2_write_in_l2);
04304                 return (B_FALSE);
04305         }
04306         if (HDR_IO_IN_PROGRESS(ab)) {
04307                 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
04308                 return (B_FALSE);
04309         }
04310         if (!HDR_L2CACHE(ab)) {
04311                 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
04312                 return (B_FALSE);
04313         }
04314 
04315         return (B_TRUE);
04316 }
04317 
04318 static uint64_t
04319 l2arc_write_size(l2arc_dev_t *dev)
04320 {
04321         uint64_t size;
04322 
04323         size = dev->l2ad_write;
04324 
04325         if (arc_warm == B_FALSE)
04326                 size += dev->l2ad_boost;
04327 
04328         return (size);
04329 
04330 }
04331 
04332 static clock_t
04333 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
04334 {
04335         clock_t interval, next, now;
04336 
04337         /*
04338          * If the ARC lists are busy, increase our write rate; if the
04339          * lists are stale, idle back.  This is achieved by checking
04340          * how much we previously wrote - if it was more than half of
04341          * what we wanted, schedule the next write much sooner.
04342          */
04343         if (l2arc_feed_again && wrote > (wanted / 2))
04344                 interval = (hz * l2arc_feed_min_ms) / 1000;
04345         else
04346                 interval = hz * l2arc_feed_secs;
04347 
04348         now = ddi_get_lbolt();
04349         next = MAX(now, MIN(now + interval, began + interval));
04350 
04351         return (next);
04352 }
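
Worked example (values assumed, not taken from this file): with hz = 1000,
l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200, a feed cycle that wrote more
than half of what it wanted is rescheduled 200 ticks after it began, otherwise
1000 ticks; and because of the MAX(now, ...) clamp, a cycle that itself ran
longer than the interval is followed immediately.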
04353 
04354 static void
04355 l2arc_hdr_stat_add(void)
04356 {
04357         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
04358         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
04359 }
04360 
04361 static void
04362 l2arc_hdr_stat_remove(void)
04363 {
04364         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
04365         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
04366 }
04367 
04372 static l2arc_dev_t *
04373 l2arc_dev_get_next(void)
04374 {
04375         l2arc_dev_t *first, *next = NULL;
04376 
04377         /*
04378          * Lock out the removal of spas (spa_namespace_lock), then removal
04379          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
04380          * both locks will be dropped and a spa config lock held instead.
04381          */
04382         mutex_enter(&spa_namespace_lock);
04383         mutex_enter(&l2arc_dev_mtx);
04384 
04385         /* if there are no vdevs, there is nothing to do */
04386         if (l2arc_ndev == 0)
04387                 goto out;
04388 
04389         first = NULL;
04390         next = l2arc_dev_last;
04391         do {
04392                 /* loop around the list looking for a non-faulted vdev */
04393                 if (next == NULL) {
04394                         next = list_head(l2arc_dev_list);
04395                 } else {
04396                         next = list_next(l2arc_dev_list, next);
04397                         if (next == NULL)
04398                                 next = list_head(l2arc_dev_list);
04399                 }
04400 
04401                 /* if we have come back to the start, bail out */
04402                 if (first == NULL)
04403                         first = next;
04404                 else if (next == first)
04405                         break;
04406 
04407         } while (vdev_is_dead(next->l2ad_vdev));
04408 
04409         /* if we were unable to find any usable vdevs, return NULL */
04410         if (vdev_is_dead(next->l2ad_vdev))
04411                 next = NULL;
04412 
04413         l2arc_dev_last = next;
04414 
04415 out:
04416         mutex_exit(&l2arc_dev_mtx);
04417 
04418         /*
04419          * Grab the config lock to prevent the 'next' device from being
04420          * removed while we are writing to it.
04421          */
04422         if (next != NULL)
04423                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
04424         mutex_exit(&spa_namespace_lock);
04425 
04426         return (next);
04427 }
04428 
04432 static void
04433 l2arc_do_free_on_write()
04434 {
04435         list_t *buflist;
04436         l2arc_data_free_t *df, *df_prev;
04437 
04438         mutex_enter(&l2arc_free_on_write_mtx);
04439         buflist = l2arc_free_on_write;
04440 
04441         for (df = list_tail(buflist); df; df = df_prev) {
04442                 df_prev = list_prev(buflist, df);
04443                 ASSERT(df->l2df_data != NULL);
04444                 ASSERT(df->l2df_func != NULL);
04445                 df->l2df_func(df->l2df_data, df->l2df_size);
04446                 list_remove(buflist, df);
04447                 kmem_free(df, sizeof (l2arc_data_free_t));
04448         }
04449 
04450         mutex_exit(&l2arc_free_on_write_mtx);
04451 }
04452 
04457 static void
04458 l2arc_write_done(zio_t *zio)
04459 {
04460         l2arc_write_callback_t *cb;
04461         l2arc_dev_t *dev;
04462         list_t *buflist;
04463         arc_buf_hdr_t *head, *ab, *ab_prev;
04464         l2arc_buf_hdr_t *abl2;
04465         kmutex_t *hash_lock;
04466 
04467         cb = zio->io_private;
04468         ASSERT(cb != NULL);
04469         dev = cb->l2wcb_dev;
04470         ASSERT(dev != NULL);
04471         head = cb->l2wcb_head;
04472         ASSERT(head != NULL);
04473         buflist = dev->l2ad_buflist;
04474         ASSERT(buflist != NULL);
04475         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
04476             l2arc_write_callback_t *, cb);
04477 
04478         if (zio->io_error != 0)
04479                 ARCSTAT_BUMP(arcstat_l2_writes_error);
04480 
04481         mutex_enter(&l2arc_buflist_mtx);
04482 
04483         /*
04484          * All writes completed, or an error was hit.
04485          */
04486         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
04487                 ab_prev = list_prev(buflist, ab);
04488 
04489                 hash_lock = HDR_LOCK(ab);
04490                 if (!mutex_tryenter(hash_lock)) {
04491                         /*
04492                          * This buffer misses out.  It may be in the middle
04493                          * of being evicted.  Its ARC_L2_WRITING flag will be
04494                          * left set, denying reads to this buffer.
04495                          */
04496                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
04497                         continue;
04498                 }
04499 
04500                 if (zio->io_error != 0) {
04501                         /*
04502                          * Error - drop L2ARC entry.
04503                          */
04504                         list_remove(buflist, ab);
04505                         abl2 = ab->b_l2hdr;
04506                         ab->b_l2hdr = NULL;
04507                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
04508                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
04509                 }
04510 
04511                 /*
04512                  * Allow ARC to begin reads to this L2ARC entry.
04513                  */
04514                 ab->b_flags &= ~ARC_L2_WRITING;
04515 
04516                 mutex_exit(hash_lock);
04517         }
04518 
04519         atomic_inc_64(&l2arc_writes_done);
04520         list_remove(buflist, head);
04521         kmem_cache_free(hdr_cache, head);
04522         mutex_exit(&l2arc_buflist_mtx);
04523 
04524         l2arc_do_free_on_write();
04525 
04526         kmem_free(cb, sizeof (l2arc_write_callback_t));
04527 }
04528 
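04529 /*
04530  * A read to a cache device completed.  Validate buffer contents before
04531  * handing over to the regular ARC routines.
04532  */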
04533 static void
04534 l2arc_read_done(zio_t *zio)
04535 {
04536         l2arc_read_callback_t *cb;
04537         arc_buf_hdr_t *hdr;
04538         arc_buf_t *buf;
04539         kmutex_t *hash_lock;
04540         int equal;
04541 
04542         ASSERT(zio->io_vd != NULL);
04543         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
04544 
04545         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
04546 
04547         cb = zio->io_private;
04548         ASSERT(cb != NULL);
04549         buf = cb->l2rcb_buf;
04550         ASSERT(buf != NULL);
04551 
04552         hash_lock = HDR_LOCK(buf->b_hdr);
04553         mutex_enter(hash_lock);
04554         hdr = buf->b_hdr;
04555         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
04556 
04557         /*
04558          * Check this survived the L2ARC journey.
04559          */
04560         equal = arc_cksum_equal(buf);
04561         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
04562                 mutex_exit(hash_lock);
04563                 zio->io_private = buf;
04564                 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
04565                 zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
04566                 arc_read_done(zio);
04567         } else {
04568                 mutex_exit(hash_lock);
04569                 /*
04570                  * Buffer didn't survive caching.  Increment stats and
04571                  * reissue to the original storage device.
04572                  */
04573                 if (zio->io_error != 0) {
04574                         ARCSTAT_BUMP(arcstat_l2_io_error);
04575                 } else {
04576                         zio->io_error = EIO;
04577                 }
04578                 if (!equal)
04579                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
04580 
04581                 /*
04582                  * If there's no waiter, issue an async i/o to the primary
04583                  * storage now.  If there *is* a waiter, the caller must
04584                  * issue the i/o in a context where it's OK to block.
04585                  */
04586                 if (zio->io_waiter == NULL) {
04587                         zio_t *pio = zio_unique_parent(zio);
04588 
04589                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
04590 
04591                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
04592                             buf->b_data, zio->io_size, arc_read_done, buf,
04593                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
04594                 }
04595         }
04596 
04597         kmem_free(cb, sizeof (l2arc_read_callback_t));
04598 }
04599 
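04600 /*
04601  * This is the list priority from which the L2ARC will search for pages to
04602  * cache.  This is used within loops (0..3) to cycle through lists in the
04603  * desired order.  This order can have a significant effect on cache
04604  * performance.
04605  *
04606  * Currently the metadata lists are hit first, MFU then MRU, followed by
04607  * the data lists.  This function returns a locked list, and also returns
04608  * the lock pointer.
04609  */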
04610 static list_t *
04611 l2arc_list_locked(int list_num, kmutex_t **lock)
04612 {
04613         list_t *list;
04614         int idx;
04615 
04616         ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
04617 
04618         if (list_num < ARC_BUFC_NUMMETADATALISTS) {
04619                 idx = list_num;
04620                 list = &arc_mfu->arcs_lists[idx];
04621                 *lock = ARCS_LOCK(arc_mfu, idx);
04622         } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
04623                 idx = list_num - ARC_BUFC_NUMMETADATALISTS;
04624                 list = &arc_mru->arcs_lists[idx];
04625                 *lock = ARCS_LOCK(arc_mru, idx);
04626         } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
04627                 ARC_BUFC_NUMDATALISTS)) {
04628                 idx = list_num - ARC_BUFC_NUMMETADATALISTS;
04629                 list = &arc_mfu->arcs_lists[idx];
04630                 *lock = ARCS_LOCK(arc_mfu, idx);
04631         } else {
04632                 idx = list_num - ARC_BUFC_NUMLISTS;
04633                 list = &arc_mru->arcs_lists[idx];
04634                 *lock = ARCS_LOCK(arc_mru, idx);
04635         }
04636 
04637         ASSERT(!(MUTEX_HELD(*lock)));
04638         mutex_enter(*lock);
04639         return (list);
04640 }
04641 
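04642 /*
04643  * Evict buffers from the device write hand to the distance specified in
04644  * bytes.  This distance may span populated buffers, it may span nothing.
04645  * This is clearing a region on the L2ARC device ready for writing.
04646  * If the 'all' boolean is set, every buffer is evicted.
04647  */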
04648 static void
04649 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
04650 {
04651         list_t *buflist;
04652         l2arc_buf_hdr_t *abl2;
04653         arc_buf_hdr_t *ab, *ab_prev;
04654         kmutex_t *hash_lock;
04655         uint64_t taddr;
04656 
04657         buflist = dev->l2ad_buflist;
04658 
04659         if (buflist == NULL)
04660                 return;
04661 
04662         if (!all && dev->l2ad_first) {
04663                 /*
04664                  * This is the first sweep through the device.  There is
04665                  * nothing to evict.
04666                  */
04667                 return;
04668         }
04669 
04670         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
04671                 /*
04672                  * When nearing the end of the device, evict to the end
04673                  * before the device write hand jumps to the start.
04674                  */
04675                 taddr = dev->l2ad_end;
04676         } else {
04677                 taddr = dev->l2ad_hand + distance;
04678         }
04679         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
04680             uint64_t, taddr, boolean_t, all);
04681 
04682 top:
04683         mutex_enter(&l2arc_buflist_mtx);
04684         for (ab = list_tail(buflist); ab; ab = ab_prev) {
04685                 ab_prev = list_prev(buflist, ab);
04686 
04687                 hash_lock = HDR_LOCK(ab);
04688                 if (!mutex_tryenter(hash_lock)) {
04689                         /*
04690                          * Missed the hash lock.  Retry.
04691                          */
04692                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
04693                         mutex_exit(&l2arc_buflist_mtx);
04694                         mutex_enter(hash_lock);
04695                         mutex_exit(hash_lock);
04696                         goto top;
04697                 }
04698 
04699                 if (HDR_L2_WRITE_HEAD(ab)) {
04700                         /*
04701                          * We hit a write head node.  Leave it for
04702                          * l2arc_write_done().
04703                          */
04704                         list_remove(buflist, ab);
04705                         mutex_exit(hash_lock);
04706                         continue;
04707                 }
04708 
04709                 if (!all && ab->b_l2hdr != NULL &&
04710                     (ab->b_l2hdr->b_daddr > taddr ||
04711                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
04712                         /*
04713                          * We've evicted to the target address,
04714                          * or the end of the device.
04715                          */
04716                         mutex_exit(hash_lock);
04717                         break;
04718                 }
04719 
04720                 if (HDR_FREE_IN_PROGRESS(ab)) {
04721                         /*
04722                          * Already on the path to destruction.
04723                          */
04724                         mutex_exit(hash_lock);
04725                         continue;
04726                 }
04727 
04728                 if (ab->b_state == arc_l2c_only) {
04729                         ASSERT(!HDR_L2_READING(ab));
04730                         /*
04731                          * This doesn't exist in the ARC.  Destroy.
04732                          * arc_hdr_destroy() will call list_remove()
04733                          * and decrement arcstat_l2_size.
04734                          */
04735                         arc_change_state(arc_anon, ab, hash_lock);
04736                         arc_hdr_destroy(ab);
04737                 } else {
04738                         /*
04739                          * Invalidate issued or about to be issued
04740                          * reads, since we may be about to write
04741                          * over this location.
04742                          */
04743                         if (HDR_L2_READING(ab)) {
04744                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
04745                                 ab->b_flags |= ARC_L2_EVICTED;
04746                         }
04747 
04748                         /*
04749                          * Tell ARC this no longer exists in L2ARC.
04750                          */
04751                         if (ab->b_l2hdr != NULL) {
04752                                 abl2 = ab->b_l2hdr;
04753                                 ab->b_l2hdr = NULL;
04754                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
04755                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
04756                         }
04757                         list_remove(buflist, ab);
04758 
04759                         /*
04760                          * This may have been left over after a
04761                          * failed write.
04762                          */
04763                         ab->b_flags &= ~ARC_L2_WRITING;
04764                 }
04765                 mutex_exit(hash_lock);
04766         }
04767         mutex_exit(&l2arc_buflist_mtx);
04768 
04769         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
04770         dev->l2ad_evict = taddr;
04771 }
04772 
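04773 /*
04774  * Find and write ARC buffers to the L2ARC device.
04775  *
04776  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
04777  * for reading until they have completed writing.
04778  */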
04779 static uint64_t
04780 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
04781 {
04782         arc_buf_hdr_t *ab, *ab_prev, *head;
04783         l2arc_buf_hdr_t *hdrl2;
04784         list_t *list;
04785         uint64_t passed_sz, write_sz, buf_sz, headroom;
04786         void *buf_data;
04787         kmutex_t *hash_lock, *list_lock;
04788         boolean_t have_lock, full;
04789         l2arc_write_callback_t *cb;
04790         zio_t *pio, *wzio;
04791         uint64_t guid = spa_load_guid(spa);
04792         int try;
04793 
04794         ASSERT(dev->l2ad_vdev != NULL);
04795 
04796         pio = NULL;
04797         write_sz = 0;
04798         full = B_FALSE;
04799         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
04800         head->b_flags |= ARC_L2_WRITE_HEAD;
04801 
04802         ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
04803         /*
04804          * Copy buffers for L2ARC writing.
04805          */
04806         mutex_enter(&l2arc_buflist_mtx);
04807         for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
04808                 list = l2arc_list_locked(try, &list_lock);
04809                 passed_sz = 0;
04810                 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
04811 
04812                 /*
04813                  * L2ARC fast warmup.
04814                  *
04815                  * Until the ARC is warm and starts to evict, read from the
04816                  * head of the ARC lists rather than the tail.
04817                  */
04818                 headroom = target_sz * l2arc_headroom;
04819                 if (arc_warm == B_FALSE)
04820                         ab = list_head(list);
04821                 else
04822                         ab = list_tail(list);
04823                 if (ab == NULL)
04824                         ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
04825 
04826                 for (; ab; ab = ab_prev) {
04827                         if (arc_warm == B_FALSE)
04828                                 ab_prev = list_next(list, ab);
04829                         else
04830                                 ab_prev = list_prev(list, ab);
04831                         ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
04832 
04833                         hash_lock = HDR_LOCK(ab);
04834                         have_lock = MUTEX_HELD(hash_lock);
04835                         if (!have_lock && !mutex_tryenter(hash_lock)) {
04836                                 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
04837                                 /*
04838                                  * Skip this buffer rather than waiting.
04839                                  */
04840                                 continue;
04841                         }
04842 
04843                         passed_sz += ab->b_size;
04844                         if (passed_sz > headroom) {
04845                                 /*
04846                                  * Searched too far.
04847                                  */
04848                                 mutex_exit(hash_lock);
04849                                 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
04850                                 break;
04851                         }
04852 
04853                         if (!l2arc_write_eligible(guid, ab)) {
04854                                 mutex_exit(hash_lock);
04855                                 continue;
04856                         }
04857 
04858                         if ((write_sz + ab->b_size) > target_sz) {
04859                                 full = B_TRUE;
04860                                 mutex_exit(hash_lock);
04861                                 ARCSTAT_BUMP(arcstat_l2_write_full);
04862                                 break;
04863                         }
04864 
04865                         if (pio == NULL) {
04866                                 /*
04867                                  * Insert a dummy header on the buflist so
04868                                  * l2arc_write_done() can find where the
04869                                  * write buffers begin without searching.
04870                                  */
04871                                 list_insert_head(dev->l2ad_buflist, head);
04872 
04873                                 cb = kmem_alloc(
04874                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
04875                                 cb->l2wcb_dev = dev;
04876                                 cb->l2wcb_head = head;
04877                                 pio = zio_root(spa, l2arc_write_done, cb,
04878                                     ZIO_FLAG_CANFAIL);
04879                                 ARCSTAT_BUMP(arcstat_l2_write_pios);
04880                         }
04881 
04882                         /*
04883                          * Create and add a new L2ARC header.
04884                          */
04885                         hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
04886                         hdrl2->b_dev = dev;
04887                         hdrl2->b_daddr = dev->l2ad_hand;
04888 
04889                         ab->b_flags |= ARC_L2_WRITING;
04890                         ab->b_l2hdr = hdrl2;
04891                         list_insert_head(dev->l2ad_buflist, ab);
04892                         buf_data = ab->b_buf->b_data;
04893                         buf_sz = ab->b_size;
04894 
04895                         /*
04896                          * Compute and store the buffer cksum before
04897                          * writing.  On debug builds the cksum is verified first.
04898                          */
04899                         arc_cksum_verify(ab->b_buf);
04900                         arc_cksum_compute(ab->b_buf, B_TRUE);
04901 
04902                         mutex_exit(hash_lock);
04903 
04904                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
04905                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
04906                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
04907                             ZIO_FLAG_CANFAIL, B_FALSE);
04908 
04909                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
04910                             zio_t *, wzio);
04911                         (void) zio_nowait(wzio);
04912 
04913                         /*
04914                          * Keep the clock hand suitably device-aligned.
04915                          */
04916                         buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
04917 
04918                         write_sz += buf_sz;
04919                         dev->l2ad_hand += buf_sz;
04920                 }
04921 
04922                 mutex_exit(list_lock);
04923 
04924                 if (full == B_TRUE)
04925                         break;
04926         }
04927         mutex_exit(&l2arc_buflist_mtx);
04928 
04929         if (pio == NULL) {
04930                 ASSERT0(write_sz);
04931                 kmem_cache_free(hdr_cache, head);
04932                 return (0);
04933         }
04934 
04935         ASSERT3U(write_sz, <=, target_sz);
04936         ARCSTAT_BUMP(arcstat_l2_writes_sent);
04937         ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
04938         ARCSTAT_INCR(arcstat_l2_size, write_sz);
04939         vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
04940 
04941         /*
04942          * Bump device hand to the device start if it is approaching the end.
04943          * l2arc_evict() will already have evicted ahead for this case.
04944          */
04945         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
04946                 vdev_space_update(dev->l2ad_vdev,
04947                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
04948                 dev->l2ad_hand = dev->l2ad_start;
04949                 dev->l2ad_evict = dev->l2ad_start;
04950                 dev->l2ad_first = B_FALSE;
04951         }
04952 
04953         dev->l2ad_writing = B_TRUE;
04954         (void) zio_wait(pio);
04955         dev->l2ad_writing = B_FALSE;
04956 
04957         return (write_sz);
04958 }
04959 
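04960 /*
04961  * This thread feeds the L2ARC at regular intervals.  This is the beating
04962  * heart of the L2ARC.
04963  */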
04964 static void
04965 l2arc_feed_thread(void *dummy __unused)
04966 {
04967         callb_cpr_t cpr;
04968         l2arc_dev_t *dev;
04969         spa_t *spa;
04970         uint64_t size, wrote;
04971         clock_t begin, next = ddi_get_lbolt();
04972 
04973         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
04974 
04975         mutex_enter(&l2arc_feed_thr_lock);
04976 
04977         while (l2arc_thread_exit == 0) {
04978                 CALLB_CPR_SAFE_BEGIN(&cpr);
04979                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
04980                     next - ddi_get_lbolt());
04981                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
04982                 next = ddi_get_lbolt() + hz;
04983 
04984                 /*
04985                  * Quick check for L2ARC devices.
04986                  */
04987                 mutex_enter(&l2arc_dev_mtx);
04988                 if (l2arc_ndev == 0) {
04989                         mutex_exit(&l2arc_dev_mtx);
04990                         continue;
04991                 }
04992                 mutex_exit(&l2arc_dev_mtx);
04993                 begin = ddi_get_lbolt();
04994 
04995                 /*
04996                  * This selects the next l2arc device to write to, and in
04997                  * doing so the next spa to feed from: dev->l2ad_spa.   This
04998                  * will return NULL if there are now no l2arc devices or if
04999                  * they are all faulted.
05000                  *
05001                  * If a device is returned, its spa's config lock is also
05002                  * held to prevent device removal.  l2arc_dev_get_next()
05003                  * will grab and release l2arc_dev_mtx.
05004                  */
05005                 if ((dev = l2arc_dev_get_next()) == NULL)
05006                         continue;
05007 
05008                 spa = dev->l2ad_spa;
05009                 ASSERT(spa != NULL);
05010 
05011                 /*
05012                  * If the pool is read-only then force the feed thread to
05013                  * sleep a little longer.
05014                  */
05015                 if (!spa_writeable(spa)) {
05016                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
05017                         spa_config_exit(spa, SCL_L2ARC, dev);
05018                         continue;
05019                 }
05020 
05021                 /*
05022                  * Avoid contributing to memory pressure.
05023                  */
05024                 if (arc_reclaim_needed()) {
05025                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
05026                         spa_config_exit(spa, SCL_L2ARC, dev);
05027                         continue;
05028                 }
05029 
05030                 ARCSTAT_BUMP(arcstat_l2_feeds);
05031 
05032                 size = l2arc_write_size(dev);
05033 
05034                 /*
05035                  * Evict L2ARC buffers that will be overwritten.
05036                  */
05037                 l2arc_evict(dev, size, B_FALSE);
05038 
05039                 /*
05040                  * Write ARC buffers.
05041                  */
05042                 wrote = l2arc_write_buffers(spa, dev, size);
05043 
05044                 /*
05045                  * Calculate interval between writes.
05046                  */
05047                 next = l2arc_write_interval(begin, size, wrote);
05048                 spa_config_exit(spa, SCL_L2ARC, dev);
05049         }
05050 
05051         l2arc_thread_exit = 0;
05052         cv_broadcast(&l2arc_feed_thr_cv);
05053         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
05054         thread_exit();
05055 }
05056 
05057 boolean_t
05058 l2arc_vdev_present(vdev_t *vd)
05059 {
05060         l2arc_dev_t *dev;
05061 
05062         mutex_enter(&l2arc_dev_mtx);
05063         for (dev = list_head(l2arc_dev_list); dev != NULL;
05064             dev = list_next(l2arc_dev_list, dev)) {
05065                 if (dev->l2ad_vdev == vd)
05066                         break;
05067         }
05068         mutex_exit(&l2arc_dev_mtx);
05069 
05070         return (dev != NULL);
05071 }
05072 
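05073 /*
05074  * Add a vdev for use by the L2ARC.  By this point the spa has already
05075  * validated the vdev and opened it.
05076  */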
05077 void
05078 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
05079 {
05080         l2arc_dev_t *adddev;
05081 
05082         ASSERT(!l2arc_vdev_present(vd));
05083 
05084         /*
05085          * Create a new l2arc device entry.
05086          */
05087         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
05088         adddev->l2ad_spa = spa;
05089         adddev->l2ad_vdev = vd;
05090         adddev->l2ad_write = l2arc_write_max;
05091         adddev->l2ad_boost = l2arc_write_boost;
05092         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
05093         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
05094         adddev->l2ad_hand = adddev->l2ad_start;
05095         adddev->l2ad_evict = adddev->l2ad_start;
05096         adddev->l2ad_first = B_TRUE;
05097         adddev->l2ad_writing = B_FALSE;
05098         ASSERT3U(adddev->l2ad_write, >, 0);
05099 
05100         /*
05101          * This is a list of all ARC buffers that are still valid on the
05102          * device.
05103          */
05104         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
05105         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
05106             offsetof(arc_buf_hdr_t, b_l2node));
05107 
05108         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
05109 
05110         /*
05111          * Add device to global list
05112          */
05113         mutex_enter(&l2arc_dev_mtx);
05114         list_insert_head(l2arc_dev_list, adddev);
05115         atomic_inc_64(&l2arc_ndev);
05116         mutex_exit(&l2arc_dev_mtx);
05117 }
05118 
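05119 /*
05120  * Remove a vdev from the L2ARC.
05121  */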
05122 void
05123 l2arc_remove_vdev(vdev_t *vd)
05124 {
05125         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
05126 
05127         /*
05128          * Find the device by vdev
05129          */
05130         mutex_enter(&l2arc_dev_mtx);
05131         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
05132                 nextdev = list_next(l2arc_dev_list, dev);
05133                 if (vd == dev->l2ad_vdev) {
05134                         remdev = dev;
05135                         break;
05136                 }
05137         }
05138         ASSERT(remdev != NULL);
05139 
05140         /*
05141          * Remove device from global list
05142          */
05143         list_remove(l2arc_dev_list, remdev);
05144         l2arc_dev_last = NULL;          /* may have been invalidated */
05145         atomic_dec_64(&l2arc_ndev);
05146         mutex_exit(&l2arc_dev_mtx);
05147 
05148         /*
05149          * Clear all buflists and ARC references.  L2ARC device flush.
05150          */
05151         l2arc_evict(remdev, 0, B_TRUE);
05152         list_destroy(remdev->l2ad_buflist);
05153         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
05154         kmem_free(remdev, sizeof (l2arc_dev_t));
05155 }
05156 
05157 void
05158 l2arc_init(void)
05159 {
05160         l2arc_thread_exit = 0;
05161         l2arc_ndev = 0;
05162         l2arc_writes_sent = 0;
05163         l2arc_writes_done = 0;
05164 
05165         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
05166         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
05167         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
05168         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
05169         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
05170 
05171         l2arc_dev_list = &L2ARC_dev_list;
05172         l2arc_free_on_write = &L2ARC_free_on_write;
05173         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
05174             offsetof(l2arc_dev_t, l2ad_node));
05175         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
05176             offsetof(l2arc_data_free_t, l2df_list_node));
05177 }
05178 
05179 void
05180 l2arc_fini(void)
05181 {
05182         /*
05183          * This is called from dmu_fini(), which is called from spa_fini().
05184          * Because of this, we can assume that all l2arc devices have
05185          * already been removed when the pools themselves were removed.
05186          */
05187 
05188         l2arc_do_free_on_write();
05189 
05190         mutex_destroy(&l2arc_feed_thr_lock);
05191         cv_destroy(&l2arc_feed_thr_cv);
05192         mutex_destroy(&l2arc_dev_mtx);
05193         mutex_destroy(&l2arc_buflist_mtx);
05194         mutex_destroy(&l2arc_free_on_write_mtx);
05195 
05196         list_destroy(l2arc_dev_list);
05197         list_destroy(l2arc_free_on_write);
05198 }
05199 
05200 void
05201 l2arc_start(void)
05202 {
05203         if (!(spa_mode_global & FWRITE))
05204                 return;
05205 
05206         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
05207             TS_RUN, minclsyspri);
05208 }
05209 
05210 void
05211 l2arc_stop(void)
05212 {
05213         if (!(spa_mode_global & FWRITE))
05214                 return;
05215 
05216         mutex_enter(&l2arc_feed_thr_lock);
05217         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
05218         l2arc_thread_exit = 1;
05219         while (l2arc_thread_exit != 0)
05220                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
05221         mutex_exit(&l2arc_feed_thr_lock);
05222 }