FreeBSD ZFS
The Zettabyte File System

dbuf.c — DMU buffer (dbuf) cache implementation

Generated source listing; see the repository for the authoritative file.
00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 /*
00022  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
00023  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
00024  * Copyright (c) 2012 by Delphix. All rights reserved.
00025  */
00026 
00027 #include <sys/zfs_context.h>
00028 #include <sys/dmu.h>
00029 #include <sys/dmu_impl.h>
00030 #include <sys/dbuf.h>
00031 #include <sys/dmu_objset.h>
00032 #include <sys/dsl_dataset.h>
00033 #include <sys/dsl_dir.h>
00034 #include <sys/dmu_tx.h>
00035 #include <sys/spa.h>
00036 #include <sys/zio.h>
00037 #include <sys/dmu_zfetch.h>
00038 #include <sys/sa.h>
00039 #include <sys/sa_impl.h>
00040 
/* Forward declarations of routines defined later in this file. */
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

/* kmem cache backing all dmu_buf_impl_t allocations. */
static kmem_cache_t *dbuf_cache;
00049 
00050 /* ARGSUSED */
00051 static int
00052 dbuf_cons(void *vdb, void *unused, int kmflag)
00053 {
00054         dmu_buf_impl_t *db = vdb;
00055         bzero(db, sizeof (dmu_buf_impl_t));
00056 
00057         mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
00058         cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
00059         refcount_create(&db->db_holds);
00060         return (0);
00061 }
00062 
00063 /* ARGSUSED */
00064 static void
00065 dbuf_dest(void *vdb, void *unused)
00066 {
00067         dmu_buf_impl_t *db = vdb;
00068         mutex_destroy(&db->db_mtx);
00069         cv_destroy(&db->db_changed);
00070         refcount_destroy(&db->db_holds);
00071 }
00072 
/* Global hash table of all dbufs, keyed by (objset, object, level, blkid). */
static dbuf_hash_table_t dbuf_hash_table;

/* Number of dbufs currently present in dbuf_hash_table. */
static uint64_t dbuf_hash_count;
/*
 * Hash a dbuf identity (objset, object, level, blkid) into a 64-bit
 * value used to index dbuf_hash_table.  Only a few low-order bytes of
 * each field are run through the CRC64 step; the remaining high bits
 * are folded in with a final XOR — cheaper than a full CRC64 and
 * adequate for bucket selection.
 */
static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	/* Sanity-check that the shared CRC64 table has been initialized. */
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	/* Fold in the bits not covered by the CRC steps above. */
	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);

	return (crc);
}
00098 
/*
 * Hash a dbuf identity.  Note: the expansion deliberately has NO
 * trailing semicolon, so the macro behaves as an expression at every
 * use site (the former stray ';' made it statement-context only).
 * All arguments are parenthesized against operator-precedence surprises.
 */
#define	DBUF_HASH(os, obj, level, blkid) \
	dbuf_hash((os), (obj), (level), (blkid))

/* True iff dbuf matches the identity (os, obj, level, blkid). */
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))
00106 
/*
 * Look up a dbuf in the global hash table.  On success the dbuf is
 * returned with its db_mtx held, which prevents it from transitioning
 * to DB_EVICTING; dbufs already in DB_EVICTING state are skipped as
 * if absent.  Returns NULL if no live matching dbuf exists.
 */
dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				/* Return with db->db_mtx still held. */
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}
00131 
/*
 * Insert db into the hash table.  If an equal entry already exists,
 * return that entry with its db_mtx held and do not insert; otherwise
 * link db into its bucket and return NULL, leaving db->db_mtx held so
 * the caller can finish initializing it before it becomes visible.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				/* Collision with a live dbuf: hand it back. */
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}
00170 
/*
 * Remove db from the hash table.  The dbuf must already be marked
 * DB_EVICTING with no remaining holds.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx, to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	/* Unlink db from its bucket's singly-linked chain. */
	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}
00203 
/* ARC eviction callback for dbufs; defined later in this file. */
static arc_evict_func_t dbuf_do_evict;

/*
 * Fire the user's eviction callback (if any) for a level-0 dbuf, then
 * clear the user state so the callback can run at most once.
 * Called with db_mtx held.
 */
static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/* Only level-0 dbufs can carry user state. */
	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	/* Publish the final data pointer to the user before the callback. */
	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}
00221 
00222 boolean_t
00223 dbuf_is_metadata(dmu_buf_impl_t *db)
00224 {
00225         if (db->db_level > 0) {
00226                 return (B_TRUE);
00227         } else {
00228                 boolean_t is_metadata;
00229 
00230                 DB_DNODE_ENTER(db);
00231                 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
00232                 DB_DNODE_EXIT(db);
00233 
00234                 return (is_metadata);
00235         }
00236 }
00237 
/*
 * Evict a dbuf that holds no data and has no write pending: clear its
 * associations and destroy it.  Called with db_mtx held.
 */
void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}
00248 
00249 void
00250 dbuf_init(void)
00251 {
00252         uint64_t hsize = 1ULL << 16;
00253         dbuf_hash_table_t *h = &dbuf_hash_table;
00254         int i;
00255 
00256         /*
00257          * The hash table is big enough to fill all of physical memory
00258          * with an average 4K block size.  The table will take up
00259          * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
00260          */
00261         while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
00262                 hsize <<= 1;
00263 
00264 retry:
00265         h->hash_table_mask = hsize - 1;
00266         h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
00267         if (h->hash_table == NULL) {
00268                 /* XXX - we should really return an error instead of assert */
00269                 ASSERT(hsize > (1ULL << 10));
00270                 hsize >>= 1;
00271                 goto retry;
00272         }
00273 
00274         dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
00275             sizeof (dmu_buf_impl_t),
00276             0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
00277 
00278         for (i = 0; i < DBUF_MUTEXES; i++)
00279                 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
00280 }
00281 
00282 void
00283 dbuf_fini(void)
00284 {
00285         dbuf_hash_table_t *h = &dbuf_hash_table;
00286         int i;
00287 
00288         for (i = 0; i < DBUF_MUTEXES; i++)
00289                 mutex_destroy(&h->hash_mutexes[i]);
00290         kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
00291         kmem_cache_destroy(dbuf_cache);
00292 }
00293 
/*
 * Debug verification and dbuf data-management support routines.
 */
00297 
#ifdef ZFS_DEBUG
/*
 * Consistency checks on a dbuf's identity, parent linkage, dirty
 * records, and block pointer.  Debug builds only; gated at runtime by
 * the ZFS_DEBUG_DBUF_VERIFY flag.  Called with db_mtx held.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		/* With no dnode, the dbuf must be fully detached. */
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !list_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		/* Ordinary blocks sit at blkid * blocksize in the object. */
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	/* Every pending and dirty record must point back at this dbuf. */
	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif
00407 
00408 static void
00409 dbuf_update_data(dmu_buf_impl_t *db)
00410 {
00411         ASSERT(MUTEX_HELD(&db->db_mtx));
00412         if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
00413                 ASSERT(!refcount_is_zero(&db->db_holds));
00414                 *db->db_user_data_ptr_ptr = db->db.db_data;
00415         }
00416 }
00417 
/*
 * Attach an ARC buffer to the dbuf (making buf's data the dbuf's
 * visible data), or detach the current buffer when buf is NULL.  On
 * detach the user eviction callback fires and the dbuf drops to
 * DB_UNCACHED (unless it is DB_NOFILL).  Called with db_mtx held.
 */
static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		/* Register our eviction callback unless the buf is released. */
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		/* Fire the user callback before the data pointer goes away. */
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		if (db->db_state != DB_NOFILL)
			db->db_state = DB_UNCACHED;
	}
}
00441 
/*
 * Loan out the dbuf's data as an arc_buf the caller may own.  If the
 * buffer is shared (already released to ARC, or the dbuf has other
 * holders) the data is copied into a freshly loaned buffer; otherwise
 * the dbuf's own buffer is loaned in place and the dbuf is left with
 * no attached data.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa;

		/* Drop db_mtx before the allocation; copy afterwards. */
		mutex_exit(&db->db_mtx);
		DB_GET_SPA(&spa, db);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_set_data(db, NULL);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}
00469 
00470 uint64_t
00471 dbuf_whichblock(dnode_t *dn, uint64_t offset)
00472 {
00473         if (dn->dn_datablkshift) {
00474                 return (offset >> dn->dn_datablkshift);
00475         } else {
00476                 ASSERT3U(offset, <, dn->dn_datablksz);
00477                 return (0);
00478         }
00479 }
00480 
/*
 * ARC read completion callback.  Installs the buffer and moves the
 * dbuf to DB_CACHED on success; zero-fills it if the block was freed
 * while the read was in flight; returns to DB_UNCACHED on error.
 * Wakes any waiters and drops the hold taken in dbuf_read_impl().
 */
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		/* Successful read: attach the buffer. */
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		/* Read failed: discard the buffer and revert state. */
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}
00514 
/*
 * Start reading this dbuf's contents.  Bonus buffers are copied
 * directly from the dnode's physical bonus area; a NULL/hole block
 * pointer (or a block freed this txg) is satisfied with a zero-filled
 * buffer; otherwise an ARC read is issued (possibly as a child of
 * *zio*) completing in dbuf_read_done().  Entered with db_mtx held;
 * db_mtx is dropped on every path before returning.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	spa_t *spa;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;
	arc_buf_t *pbuf;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		/* Allocate the max size so the bonus area can later grow. */
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		/* Hole or freed block: present it as zeroes, no I/O needed. */
		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
		    db->db.db_size, db, type));
		DB_DNODE_EXIT(db);
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	spa = dn->dn_objset->os_spa;
	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	/* Hold the dbuf across the async read; released in dbuf_read_done. */
	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */

	if (db->db_parent)
		pbuf = db->db_parent->db_buf;
	else
		pbuf = db->db_objset->os_phys_buf;

	(void) dsl_read(zio, spa, db->db_blkptr, pbuf,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}
00598 
/*
 * Read a dbuf's contents, honoring the DB_RF_* flags (HAVESTRUCT,
 * NOPREFETCH, CANFAIL, NEVERWAIT).  If the dbuf is already cached
 * this is cheap; otherwise a read is started and, unless the caller
 * supplied a parent zio, waited for here.  Returns 0 or an errno;
 * with a parent zio, CANFAIL errors surface on that zio instead.
 */
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (EIO);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	/* Prefetch only cacheable level-0 data blocks, unless disabled. */
	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		/* Fast path: the data is already resident. */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* If we created the root zio above, wait for it here. */
		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}
00688 
/*
 * Prepare the dbuf to be overwritten without reading its current
 * contents.  Waits out any in-flight read or fill, then leaves the
 * dbuf in DB_FILL with a fresh (uninitialized) buffer, detached if
 * DB_NOFILL, or unchanged if already DB_CACHED.
 */
static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		DB_GET_SPA(&spa, db);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}
00713 
/*
 * Just-in-time copy of old data: if the most recent dirty record still
 * references the dbuf's current data buffer, give it a private copy
 * (or, with no other holders, simply detach the buffer) so the caller
 * can modify db_data without disturbing the earlier txg's contents.
 * Called with db_mtx held.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	/* Nothing to do unless the last dirty record shares our buffer. */
	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}
00767 
/*
 * Undo a block override on this dirty record (DR_IN_DMU_SYNC excluded
 * by assertion): free the already-written block, if any, and return
 * the record to DR_NOT_OVERRIDDEN so the dbuf's current contents will
 * be written out normally.  Called with db_mtx held.
 */
void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	/* Bonus buffers and non-overridden records need no work. */
	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp)) {
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		zio_free(spa, txg, bp);
	}
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}
00809 
/*
 * Free a range of blocks [start, end] of dnode dn's data.
 *
 * Walks every dbuf attached to the dnode and, for each level-0 dbuf
 * whose blkid falls in the range, either undirties it, evicts it (if
 * unreferenced), or zeroes its cached contents so future readers see
 * an empty block.  Level-1 dbufs covering the range that were last
 * dirtied in an older txg are re-dirtied in this txg.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
        dmu_buf_impl_t *db, *db_next;
        uint64_t txg = tx->tx_txg;
        int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
        /* Level-1 (indirect) block ids that bracket the level-0 range. */
        uint64_t first_l1 = start >> epbs;
        uint64_t last_l1 = end >> epbs;

        /* Clamp to the last allocated block; the spill blkid is special. */
        if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
                end = dn->dn_maxblkid;
                last_l1 = end >> epbs;
        }
        dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
        mutex_enter(&dn->dn_dbufs_mtx);
        for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
                /* Grab the next link first; db may be cleared below. */
                db_next = list_next(&dn->dn_dbufs, db);
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);

                /*
                 * A covering level-1 dbuf whose latest dirty record is
                 * from an earlier txg gets dirtied again in this txg.
                 */
                if (db->db_level == 1 &&
                    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
                        mutex_enter(&db->db_mtx);
                        if (db->db_last_dirty &&
                            db->db_last_dirty->dr_txg < txg) {
                                /* Hold it across the dirty call. */
                                dbuf_add_ref(db, FTAG);
                                mutex_exit(&db->db_mtx);
                                dbuf_will_dirty(db, tx);
                                dbuf_rele(db, FTAG);
                        } else {
                                mutex_exit(&db->db_mtx);
                        }
                }

                if (db->db_level != 0)
                        continue;
                dprintf_dbuf(db, "found buf %s\n", "");
                if (db->db_blkid < start || db->db_blkid > end)
                        continue;

                /* found a level 0 buffer in the range */
                /* nonzero return means dbuf_undirty() evicted the dbuf */
                if (dbuf_undirty(db, tx))
                        continue;

                mutex_enter(&db->db_mtx);
                if (db->db_state == DB_UNCACHED ||
                    db->db_state == DB_NOFILL ||
                    db->db_state == DB_EVICTING) {
                        /* No cached data to clear. */
                        ASSERT(db->db.db_data == NULL);
                        mutex_exit(&db->db_mtx);
                        continue;
                }
                if (db->db_state == DB_READ || db->db_state == DB_FILL) {
                        /* will be handled in dbuf_read_done or dbuf_rele */
                        db->db_freed_in_flight = TRUE;
                        mutex_exit(&db->db_mtx);
                        continue;
                }
                if (refcount_count(&db->db_holds) == 0) {
                        /* Unreferenced and cached: just evict it. */
                        ASSERT(db->db_buf);
                        dbuf_clear(db);
                        continue;
                }
                /* The dbuf is referenced */

                if (db->db_last_dirty != NULL) {
                        dbuf_dirty_record_t *dr = db->db_last_dirty;

                        if (dr->dr_txg == txg) {
                                /*
                                 * This buffer is "in-use", re-adjust the file
                                 * size to reflect that this buffer may
                                 * contain new data when we sync.
                                 */
                                if (db->db_blkid != DMU_SPILL_BLKID &&
                                    db->db_blkid > dn->dn_maxblkid)
                                        dn->dn_maxblkid = db->db_blkid;
                                dbuf_unoverride(dr);
                        } else {
                                /*
                                 * This dbuf is not dirty in the open context.
                                 * Either uncache it (if its not referenced in
                                 * the open context) or reset its contents to
                                 * empty.
                                 */
                                dbuf_fix_old_data(db, txg);
                        }
                }
                /* clear the contents if its cached */
                if (db->db_state == DB_CACHED) {
                        ASSERT(db->db.db_data != NULL);
                        arc_release(db->db_buf, db);
                        bzero(db->db.db_data, db->db.db_size);
                        arc_buf_freeze(db->db_buf);
                }

                mutex_exit(&db->db_mtx);
        }
        mutex_exit(&dn->dn_dbufs_mtx);
}
00916 
00917 static int
00918 dbuf_block_freeable(dmu_buf_impl_t *db)
00919 {
00920         dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
00921         uint64_t birth_txg = 0;
00922 
00923         /*
00924          * We don't need any locking to protect db_blkptr:
00925          * If it's syncing, then db_last_dirty will be set
00926          * so we'll ignore db_blkptr.
00927          */
00928         ASSERT(MUTEX_HELD(&db->db_mtx));
00929         if (db->db_last_dirty)
00930                 birth_txg = db->db_last_dirty->dr_txg;
00931         else if (db->db_blkptr)
00932                 birth_txg = db->db_blkptr->blk_birth;
00933 
00934         /*
00935          * If we don't exist or are in a snapshot, we can't be freed.
00936          * Don't pass the bp to dsl_dataset_block_freeable() since we
00937          * are holding the db_mtx lock and might deadlock if we are
00938          * prefetching a dedup-ed block.
00939          */
00940         if (birth_txg)
00941                 return (ds == NULL ||
00942                     dsl_dataset_block_freeable(ds, NULL, birth_txg));
00943         else
00944                 return (FALSE);
00945 }
00946 
/*
 * Change the size of this dbuf's user-visible data buffer to "size"
 * bytes in transaction tx.  Old contents are preserved up to
 * MIN(old, new) and any grown tail is zeroed.  The caller must hold
 * dn_struct_rwlock as writer; the dbuf must not be the bonus buffer.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
        arc_buf_t *buf, *obuf;
        int osize = db->db.db_size;
        arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
        dnode_t *dn;

        ASSERT(db->db_blkid != DMU_BONUS_BLKID);

        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);

        /* XXX does *this* func really need the lock? */
        ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

        /*
         * This call to dbuf_will_dirty() with the dn_struct_rwlock held
         * is OK, because there can be no other references to the db
         * when we are changing its size, so no concurrent DB_FILL can
         * be happening.
         */
        /*
         * XXX we should be doing a dbuf_read, checking the return
         * value and returning that up to our callers
         */
        dbuf_will_dirty(db, tx);

        /* create the data buffer for the new block */
        buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

        /* copy old block data to the new block */
        obuf = db->db_buf;
        bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
        /* zero the remainder */
        if (size > osize)
                bzero((uint8_t *)buf->b_data + osize, size - osize);

        mutex_enter(&db->db_mtx);
        dbuf_set_data(db, buf);
        /* Drop our hold on the old buffer. */
        VERIFY(arc_buf_remove_ref(obuf, db) == 1);
        db->db.db_size = size;

        if (db->db_level == 0) {
                /* Point this txg's dirty record at the new data buffer. */
                ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
                db->db_last_dirty->dt.dl.dr_data = buf;
        }
        mutex_exit(&db->db_mtx);

        /* Account for the change in space consumption this txg. */
        dnode_willuse_space(dn, size-osize, tx);
        DB_DNODE_EXIT(db);
}
00999 
01000 void
01001 dbuf_release_bp(dmu_buf_impl_t *db)
01002 {
01003         objset_t *os;
01004         zbookmark_t zb;
01005 
01006         DB_GET_OBJSET(&os, db);
01007         ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
01008         ASSERT(arc_released(os->os_phys_buf) ||
01009             list_link_active(&os->os_dsl_dataset->ds_synced_link));
01010         ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
01011 
01012         zb.zb_objset = os->os_dsl_dataset ?
01013             os->os_dsl_dataset->ds_object : 0;
01014         zb.zb_object = db->db.db_object;
01015         zb.zb_level = db->db_level;
01016         zb.zb_blkid = db->db_blkid;
01017         (void) arc_release_bp(db->db_buf, db,
01018             db->db_blkptr, os->os_spa, &zb);
01019 }
01020 
/*
 * Mark this dbuf dirty in transaction tx's txg.
 *
 * Creates a new dirty record (or returns the existing one if the dbuf
 * is already dirty in this txg), recursively dirties the parent
 * indirect dbuf (or attaches the record to the dnode's dirty list when
 * there is no parent dbuf), marks the dnode dirty, and returns the
 * dirty record for this txg.
 */
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
        dnode_t *dn;
        objset_t *os;
        dbuf_dirty_record_t **drp, *dr;
        int drop_struct_lock = FALSE;
        boolean_t do_free_accounting = B_FALSE;
        int txgoff = tx->tx_txg & TXG_MASK;

        /* The tx must be assigned and the caller must hold the dbuf. */
        ASSERT(tx->tx_txg != 0);
        ASSERT(!refcount_is_zero(&db->db_holds));
        DMU_TX_DIRTY_BUF(tx, db);

        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        /*
         * Shouldn't dirty a regular buffer in syncing context.  Private
         * objects may be dirtied in syncing context, but only if they
         * were already pre-dirtied in open context.
         */
        ASSERT(!dmu_tx_is_syncing(tx) ||
            BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
            DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
            dn->dn_objset->os_dsl_dataset == NULL);
        /*
         * We make this assert for private objects as well, but after we
         * check if we're already dirty.  They are allowed to re-dirty
         * in syncing context.
         */
        ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
            dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
            (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

        mutex_enter(&db->db_mtx);
        /*
         * XXX make this true for indirects too?  The problem is that
         * transactions created with dmu_tx_create_assigned() from
         * syncing context don't bother holding ahead.
         */
        ASSERT(db->db_level != 0 ||
            db->db_state == DB_CACHED || db->db_state == DB_FILL ||
            db->db_state == DB_NOFILL);

        mutex_enter(&dn->dn_mtx);
        /*
         * Don't set dirtyctx to SYNC if we're just modifying this as we
         * initialize the objset.
         */
        if (dn->dn_dirtyctx == DN_UNDIRTIED &&
            !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
                dn->dn_dirtyctx =
                    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
                ASSERT(dn->dn_dirtyctx_firstset == NULL);
                /*
                 * NOTE(review): this 1-byte allocation appears to serve
                 * only as a "dirtyctx was set" marker -- confirm.
                 */
                dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
        }
        mutex_exit(&dn->dn_mtx);

        if (db->db_blkid == DMU_SPILL_BLKID)
                dn->dn_have_spill = B_TRUE;

        /*
         * If this buffer is already dirty, we're done.
         */
        drp = &db->db_last_dirty;
        ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
            db->db.db_object == DMU_META_DNODE_OBJECT);
        /* Dirty records are kept newest-first; find the slot for this txg. */
        while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
                drp = &dr->dr_next;
        if (dr && dr->dr_txg == tx->tx_txg) {
                DB_DNODE_EXIT(db);

                if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
                        /*
                         * If this buffer has already been written out,
                         * we now need to reset its state.
                         */
                        dbuf_unoverride(dr);
                        if (db->db.db_object != DMU_META_DNODE_OBJECT &&
                            db->db_state != DB_NOFILL)
                                arc_buf_thaw(db->db_buf);
                }
                mutex_exit(&db->db_mtx);
                return (dr);
        }

        /*
         * Only valid if not already dirty.
         */
        ASSERT(dn->dn_object == 0 ||
            dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
            (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

        ASSERT3U(dn->dn_nlevels, >, db->db_level);
        ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
            dn->dn_phys->dn_nlevels > db->db_level ||
            dn->dn_next_nlevels[txgoff] > db->db_level ||
            dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
            dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

        /*
         * We should only be dirtying in syncing context if it's the
         * mos or we're initializing the os or it's a special object.
         * However, we are allowed to dirty in syncing context provided
         * we already dirtied it in open context.  Hence we must make
         * this assertion only if we're not already dirty.
         */
        os = dn->dn_objset;
        ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
            os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
        ASSERT(db->db.db_size != 0);

        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

        if (db->db_blkid != DMU_BONUS_BLKID) {
                /*
                 * Update the accounting.
                 * Note: we delay "free accounting" until after we drop
                 * the db_mtx.  This keeps us from grabbing other locks
                 * (and possibly deadlocking) in bp_get_dsize() while
                 * also holding the db_mtx.
                 */
                dnode_willuse_space(dn, db->db.db_size, tx);
                do_free_accounting = dbuf_block_freeable(db);
        }

        /*
         * If this buffer is dirty in an old transaction group we need
         * to make a copy of it so that the changes we make in this
         * transaction group won't leak out when we sync the older txg.
         */
        dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
        if (db->db_level == 0) {
                void *data_old = db->db_buf;

                if (db->db_state != DB_NOFILL) {
                        if (db->db_blkid == DMU_BONUS_BLKID) {
                                dbuf_fix_old_data(db, tx->tx_txg);
                                data_old = db->db.db_data;
                        } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
                                /*
                                 * Release the data buffer from the cache so
                                 * that we can modify it without impacting
                                 * possible other users of this cached data
                                 * block.  Note that indirect blocks and
                                 * private objects are not released until the
                                 * syncing state (since they are only modified
                                 * then).
                                 */
                                arc_release(db->db_buf, db);
                                dbuf_fix_old_data(db, tx->tx_txg);
                                data_old = db->db_buf;
                        }
                        ASSERT(data_old != NULL);
                }
                dr->dt.dl.dr_data = data_old;
        } else {
                /* Indirect blocks track their dirty children in a list. */
                mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
                list_create(&dr->dt.di.dr_children,
                    sizeof (dbuf_dirty_record_t),
                    offsetof(dbuf_dirty_record_t, dr_dirty_node));
        }
        dr->dr_dbuf = db;
        dr->dr_txg = tx->tx_txg;
        /* Link in at the position found above (list stays newest-first). */
        dr->dr_next = *drp;
        *drp = dr;

        /*
         * We could have been freed_in_flight between the dbuf_noread
         * and dbuf_dirty.  We win, as though the dbuf_noread() had
         * happened after the free.
         */
        if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
            db->db_blkid != DMU_SPILL_BLKID) {
                mutex_enter(&dn->dn_mtx);
                dnode_clear_range(dn, db->db_blkid, 1, tx);
                mutex_exit(&dn->dn_mtx);
                db->db_freed_in_flight = FALSE;
        }

        /*
         * This buffer is now part of this txg
         */
        dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
        db->db_dirtycnt += 1;
        /* A dbuf can be dirty in at most 3 txgs at once. */
        ASSERT3U(db->db_dirtycnt, <=, 3);

        mutex_exit(&db->db_mtx);

        if (db->db_blkid == DMU_BONUS_BLKID ||
            db->db_blkid == DMU_SPILL_BLKID) {
                /*
                 * Bonus and spill blocks have no parent dbuf; attach
                 * the dirty record directly to the dnode's dirty list.
                 */
                mutex_enter(&dn->dn_mtx);
                ASSERT(!list_link_active(&dr->dr_dirty_node));
                list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
                mutex_exit(&dn->dn_mtx);
                dnode_setdirty(dn, tx);
                DB_DNODE_EXIT(db);
                return (dr);
        } else if (do_free_accounting) {
                blkptr_t *bp = db->db_blkptr;
                int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
                    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
                /*
                 * This is only a guess -- if the dbuf is dirty
                 * in a previous txg, we don't know how much
                 * space it will use on disk yet.  We should
                 * really have the struct_rwlock to access
                 * db_blkptr, but since this is just a guess,
                 * it's OK if we get an odd answer.
                 */
                ddt_prefetch(os->os_spa, bp);
                dnode_willuse_space(dn, -willfree, tx);
        }

        if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
                rw_enter(&dn->dn_struct_rwlock, RW_READER);
                drop_struct_lock = TRUE;
        }

        if (db->db_level == 0) {
                dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
                ASSERT(dn->dn_maxblkid >= db->db_blkid);
        }

        if (db->db_level+1 < dn->dn_nlevels) {
                dmu_buf_impl_t *parent = db->db_parent;
                dbuf_dirty_record_t *di;
                int parent_held = FALSE;

                if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
                        int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

                        parent = dbuf_hold_level(dn, db->db_level+1,
                            db->db_blkid >> epbs, FTAG);
                        ASSERT(parent != NULL);
                        parent_held = TRUE;
                }
                if (drop_struct_lock)
                        rw_exit(&dn->dn_struct_rwlock);
                ASSERT3U(db->db_level+1, ==, parent->db_level);
                /* Recursively dirty the parent indirect block. */
                di = dbuf_dirty(parent, tx);
                if (parent_held)
                        dbuf_rele(parent, FTAG);

                mutex_enter(&db->db_mtx);
                /*  possible race with dbuf_undirty() */
                if (db->db_last_dirty == dr ||
                    dn->dn_object == DMU_META_DNODE_OBJECT) {
                        mutex_enter(&di->dt.di.dr_mtx);
                        ASSERT3U(di->dr_txg, ==, tx->tx_txg);
                        ASSERT(!list_link_active(&dr->dr_dirty_node));
                        list_insert_tail(&di->dt.di.dr_children, dr);
                        mutex_exit(&di->dt.di.dr_mtx);
                        dr->dr_parent = di;
                }
                mutex_exit(&db->db_mtx);
        } else {
                /* Top-level dbuf: its parent pointer lives in the dnode. */
                ASSERT(db->db_level+1 == dn->dn_nlevels);
                ASSERT(db->db_blkid < dn->dn_nblkptr);
                ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
                mutex_enter(&dn->dn_mtx);
                ASSERT(!list_link_active(&dr->dr_dirty_node));
                list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
                mutex_exit(&dn->dn_mtx);
                if (drop_struct_lock)
                        rw_exit(&dn->dn_struct_rwlock);
        }

        dnode_setdirty(dn, tx);
        DB_DNODE_EXIT(db);
        return (dr);
}
01297 
/*
 * Undirty the dbuf in the given transaction's txg, if it is dirty
 * there.  Returns nonzero if the dbuf was evicted as a side effect
 * (the caller must not touch it afterwards), zero otherwise.
 */
static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
        dnode_t *dn;
        uint64_t txg = tx->tx_txg;
        dbuf_dirty_record_t *dr, **drp;

        ASSERT(txg != 0);
        ASSERT(db->db_blkid != DMU_BONUS_BLKID);

        mutex_enter(&db->db_mtx);
        /*
         * If this buffer is not dirty, we're done.
         * (Records are newest-first; stop at the first txg <= ours.)
         */
        for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
                if (dr->dr_txg <= txg)
                        break;
        if (dr == NULL || dr->dr_txg < txg) {
                mutex_exit(&db->db_mtx);
                return (0);
        }
        ASSERT(dr->dr_txg == txg);
        ASSERT(dr->dr_dbuf == db);

        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);

        /*
         * If this buffer is currently held, we cannot undirty
         * it, since one of the current holders may be in the
         * middle of an update.  Note that users of dbuf_undirty()
         * should not place a hold on the dbuf before the call.
         * Also note: we can get here with a spill block, so
         * test for that similar to how dbuf_dirty does.
         */
        if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
                mutex_exit(&db->db_mtx);
                /* Make sure we don't toss this buffer at sync phase */
                if (db->db_blkid != DMU_SPILL_BLKID) {
                        mutex_enter(&dn->dn_mtx);
                        dnode_clear_range(dn, db->db_blkid, 1, tx);
                        mutex_exit(&dn->dn_mtx);
                }
                DB_DNODE_EXIT(db);
                return (0);
        }

        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

        ASSERT(db->db.db_size != 0);

        /* XXX would be nice to fix up dn_towrite_space[] */

        /* Unlink the dirty record from the dbuf's list. */
        *drp = dr->dr_next;

        /*
         * Note that there are three places in dbuf_dirty()
         * where this dirty record may be put on a list.
         * Make sure to do a list_remove corresponding to
         * every one of those list_insert calls.
         */
        if (dr->dr_parent) {
                mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
                list_remove(&dr->dr_parent->dt.di.dr_children, dr);
                mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
        } else if (db->db_blkid == DMU_SPILL_BLKID ||
            db->db_level+1 == dn->dn_nlevels) {
                ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
                mutex_enter(&dn->dn_mtx);
                list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
                mutex_exit(&dn->dn_mtx);
        }
        DB_DNODE_EXIT(db);

        if (db->db_level == 0) {
                if (db->db_state != DB_NOFILL) {
                        dbuf_unoverride(dr);

                        ASSERT(db->db_buf != NULL);
                        ASSERT(dr->dt.dl.dr_data != NULL);
                        /* Drop the record's private data copy, if any. */
                        if (dr->dt.dl.dr_data != db->db_buf)
                                VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
                                    db) == 1);
                }
        } else {
                ASSERT(db->db_buf != NULL);
                ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
                mutex_destroy(&dr->dt.di.dr_mtx);
                list_destroy(&dr->dt.di.dr_children);
        }
        kmem_free(dr, sizeof (dbuf_dirty_record_t));

        ASSERT(db->db_dirtycnt > 0);
        db->db_dirtycnt -= 1;

        /* Drop the hold dbuf_dirty() took; evict if it was the last one. */
        if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
                arc_buf_t *buf = db->db_buf;

                ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
                dbuf_set_data(db, NULL);
                VERIFY(arc_buf_remove_ref(buf, db) == 1);
                dbuf_evict(db);
                return (1);
        }

        mutex_exit(&db->db_mtx);
        return (0);
}
01409 
01410 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
01411 void
01412 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
01413 {
01414         int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
01415 
01416         ASSERT(tx->tx_txg != 0);
01417         ASSERT(!refcount_is_zero(&db->db_holds));
01418 
01419         DB_DNODE_ENTER(db);
01420         if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
01421                 rf |= DB_RF_HAVESTRUCT;
01422         DB_DNODE_EXIT(db);
01423         (void) dbuf_read(db, NULL, rf);
01424         (void) dbuf_dirty(db, tx);
01425 }
01426 
01427 void
01428 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
01429 {
01430         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
01431 
01432         db->db_state = DB_NOFILL;
01433 
01434         dmu_buf_will_fill(db_fake, tx);
01435 }
01436 
01437 void
01438 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
01439 {
01440         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
01441 
01442         ASSERT(db->db_blkid != DMU_BONUS_BLKID);
01443         ASSERT(tx->tx_txg != 0);
01444         ASSERT(db->db_level == 0);
01445         ASSERT(!refcount_is_zero(&db->db_holds));
01446 
01447         ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
01448             dmu_tx_private_ok(tx));
01449 
01450         dbuf_noread(db);
01451         (void) dbuf_dirty(db, tx);
01452 }
01453 
01454 #pragma weak dmu_buf_fill_done = dbuf_fill_done
01455 /* ARGSUSED */
01456 void
01457 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
01458 {
01459         mutex_enter(&db->db_mtx);
01460         DBUF_VERIFY(db);
01461 
01462         if (db->db_state == DB_FILL) {
01463                 if (db->db_level == 0 && db->db_freed_in_flight) {
01464                         ASSERT(db->db_blkid != DMU_BONUS_BLKID);
01465                         /* we were freed while filling */
01466                         /* XXX dbuf_undirty? */
01467                         bzero(db->db.db_data, db->db.db_size);
01468                         db->db_freed_in_flight = FALSE;
01469                 }
01470                 db->db_state = DB_CACHED;
01471                 cv_broadcast(&db->db_changed);
01472         }
01473         mutex_exit(&db->db_mtx);
01474 }
01475 
/*
 * Assign an existing ARC buffer as this dbuf's data, consuming the
 * caller's reference on "buf".  When other holders still reference the
 * cached data, the contents are copied into the dbuf's current buffer
 * instead and "buf" is released.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
        ASSERT(!refcount_is_zero(&db->db_holds));
        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
        ASSERT(db->db_level == 0);
        ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
        ASSERT(buf != NULL);
        /* The replacement buffer must exactly match the dbuf's size. */
        ASSERT(arc_buf_size(buf) == db->db.db_size);
        ASSERT(tx->tx_txg != 0);

        arc_return_buf(buf, db);
        ASSERT(arc_released(buf));

        mutex_enter(&db->db_mtx);

        /* Wait out any in-progress read or fill before replacing data. */
        while (db->db_state == DB_READ || db->db_state == DB_FILL)
                cv_wait(&db->db_changed, &db->db_mtx);

        ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

        /*
         * If the dbuf is cached and the number of holds exceeds the number
         * of dirty calls on it, then dirty it again and remove the buffer
         * reference, before copying the ARC buffer to the dbuf.
         */
        if (db->db_state == DB_CACHED &&
            refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
                mutex_exit(&db->db_mtx);
                (void) dbuf_dirty(db, tx);
                bcopy(buf->b_data, db->db.db_data, db->db.db_size);
                VERIFY(arc_buf_remove_ref(buf, db) == 1);
                xuio_stat_wbuf_copied();
                return;
        }

        xuio_stat_wbuf_nocopy();
        if (db->db_state == DB_CACHED) {
                dbuf_dirty_record_t *dr = db->db_last_dirty;

                /* Detach and release the dbuf's current buffer. */
                ASSERT(db->db_buf != NULL);
                if (dr != NULL && dr->dr_txg == tx->tx_txg) {
                        /* Already dirty this txg: swap the record's data. */
                        ASSERT(dr->dt.dl.dr_data == db->db_buf);
                        if (!arc_released(db->db_buf)) {
                                ASSERT(dr->dt.dl.dr_override_state ==
                                    DR_OVERRIDDEN);
                                arc_release(db->db_buf, db);
                        }
                        dr->dt.dl.dr_data = buf;
                        VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
                } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
                        arc_release(db->db_buf, db);
                        VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
                }
                db->db_buf = NULL;
        }
        ASSERT(db->db_buf == NULL);
        /* Set db->db_buf = buf */
        dbuf_set_data(db, buf);
        db->db_state = DB_FILL;
        mutex_exit(&db->db_mtx);
        (void) dbuf_dirty(db, tx);
        /* clear db->db.db_data and tell waiters it's changed ?? */
        dbuf_fill_done(db, tx);
}
01545 
/*
 * Disassociate an unreferenced dbuf from its dnode and evict any
 * cached data.  Caller must hold db_mtx with zero holds on the dbuf;
 * db_mtx is dropped before returning.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
        dnode_t *dn;
        dmu_buf_impl_t *parent = db->db_parent;
        dmu_buf_impl_t *dndb;
        int dbuf_gone = FALSE;

        ASSERT(MUTEX_HELD(&db->db_mtx));
        ASSERT(refcount_is_zero(&db->db_holds));

        /* Give any registered user a chance to detach first. */
        dbuf_evict_user(db);

        if (db->db_state == DB_CACHED) {
                ASSERT(db->db.db_data != NULL);
                if (db->db_blkid == DMU_BONUS_BLKID) {
                        /* Bonus buffers are plain zio buffers, not ARC bufs. */
                        zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
                        arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
                }
                db->db.db_data = NULL;
                db->db_state = DB_UNCACHED;
        }

        ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
        ASSERT(db->db_data_pending == NULL);

        db->db_state = DB_EVICTING;
        db->db_blkptr = NULL;

        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        dndb = dn->dn_dbuf;
        if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
                list_remove(&dn->dn_dbufs, db);
                (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
                membar_producer();
                DB_DNODE_EXIT(db);
                /*
                 * Decrementing the dbuf count means that the hold corresponding
                 * to the removed dbuf is no longer discounted in dnode_move(),
                 * so the dnode cannot be moved until after we release the hold.
                 * The membar_producer() ensures visibility of the decremented
                 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
                 * release any lock.
                 */
                dnode_rele(dn, db);
                db->db_dnode_handle = NULL;
        } else {
                DB_DNODE_EXIT(db);
        }

        if (db->db_buf)
                dbuf_gone = arc_buf_evict(db->db_buf);

        /*
         * NOTE(review): when arc_buf_evict() returns nonzero, db_mtx is
         * presumed to have been dropped by the eviction path -- confirm.
         */
        if (!dbuf_gone)
                mutex_exit(&db->db_mtx);

        /*
         * If this dbuf is referenced from an indirect dbuf,
         * decrement the ref count on the indirect dbuf.
         */
        if (parent && parent != dndb)
                dbuf_rele(parent, db);
}
01627 
/*
 * Locate the parent dbuf and the block pointer that reference block
 * 'blkid' at indirection 'level' of dnode 'dn'.
 *
 * On success, *bpp points at the blkptr_t inside the parent's data (or
 * inside the dnode phys), and *parentp holds a referenced parent dbuf
 * (NULL when the block is referenced directly from a dnode that has no
 * dbuf).  Returns ENOENT when the buffer has no parent yet (level is
 * beyond nlevels, or blkid is past dn_maxblkid at that level).
 * Caller must hold dn_struct_rwlock.
 */
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		/* The spill blkptr is only meaningful once the flag is set. */
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	/* nlevels == 0 presumably means not-yet-synced; treat it as 1. */
	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	/* Block-pointer entries per indirect block (as a shift). */
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (ENOENT);
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		/* Index into the parent's array of block pointers. */
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}
01694 
/*
 * Allocate and initialize a new dbuf for block 'blkid' at indirection
 * 'level' of dnode 'dn', insert it into the dbuf hash table and the
 * dnode's dn_dbufs list, and return it in DB_UNCACHED state.  If
 * another thread inserted an equivalent dbuf first, the new one is
 * freed and the existing one is returned instead.  Bonus dbufs are
 * special: they are neither hashed nor placed on dn_dbufs.
 * Caller must hold dn_struct_rwlock.
 */
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		/* Bonus space is what remains after the extra blkptrs. */
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		/* No blkptr yet means the spill block hasn't been written. */
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	/* Each dbuf holds its dnode; tracked for dnode_move(). */
	(void) refcount_add(&dn->dn_holds, db);
	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}
01776 
/*
 * Eviction callback for a dbuf's ARC buffer.  May be entered with or
 * without db_mtx held (dbuf_clear() calls arc_buf_evict() while
 * holding it); in either case db_mtx is dropped before returning —
 * explicitly on the DB_EVICTING path, and presumably by dbuf_evict()
 * on the other path (TODO: confirm against dbuf_evict()).
 */
static int
dbuf_do_evict(void *private)
{
	arc_buf_t *buf = private;
	dmu_buf_impl_t *db = buf->b_private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		/* dbuf_clear() already detached this dbuf; just free it. */
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}
01799 
/*
 * Free a dbuf that has no remaining holds.  Removes it from the
 * dnode's dbuf list and the dbuf hash table (bonus dbufs are in
 * neither), releases the corresponding dnode hold, and returns the
 * dmu_buf_impl_t to the kmem cache.
 */
static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			list_remove(&dn->dn_dbufs, db);
			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	/* The dbuf must already be fully detached. */
	ASSERT(!list_link_active(&db->db_link));
	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
01843 
01844 void
01845 dbuf_prefetch(dnode_t *dn, uint64_t blkid)
01846 {
01847         dmu_buf_impl_t *db = NULL;
01848         blkptr_t *bp = NULL;
01849 
01850         ASSERT(blkid != DMU_BONUS_BLKID);
01851         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
01852 
01853         if (dnode_block_freed(dn, blkid))
01854                 return;
01855 
01856         /* dbuf_find() returns with db_mtx held */
01857         if (db = dbuf_find(dn, 0, blkid)) {
01858                 /*
01859                  * This dbuf is already in the cache.  We assume that
01860                  * it is already CACHED, or else about to be either
01861                  * read or filled.
01862                  */
01863                 mutex_exit(&db->db_mtx);
01864                 return;
01865         }
01866 
01867         if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
01868                 if (bp && !BP_IS_HOLE(bp)) {
01869                         int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
01870                             ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
01871                         arc_buf_t *pbuf;
01872                         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
01873                         uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
01874                         zbookmark_t zb;
01875 
01876                         SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
01877                             dn->dn_object, 0, blkid);
01878 
01879                         if (db)
01880                                 pbuf = db->db_buf;
01881                         else
01882                                 pbuf = dn->dn_objset->os_phys_buf;
01883 
01884                         (void) dsl_read(NULL, dn->dn_objset->os_spa,
01885                             bp, pbuf, NULL, NULL, priority,
01886                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
01887                             &aflags, &zb);
01888                 }
01889                 if (db)
01890                         dbuf_rele(db, NULL);
01891         }
01892 }
01893 
/*
 * Find or create the dbuf for block 'blkid' at indirection 'level' of
 * dnode 'dn' and add a hold tagged 'tag'.  On success returns 0 with
 * *dbp set.  With fail_sparse, returns ENOENT for holes (unallocated
 * blocks).  Caller must hold dn_struct_rwlock.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			/* Treat a hole the same as a missing block. */
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		/* ENOENT just means "no parent yet"; create anyway. */
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			/*
			 * The ARC reclaimed the data while the dbuf had
			 * no holds; discard this dbuf and retry.
			 */
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}
01986 
01987 dmu_buf_impl_t *
01988 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
01989 {
01990         dmu_buf_impl_t *db;
01991         int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
01992         return (err ? NULL : db);
01993 }
01994 
01995 dmu_buf_impl_t *
01996 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
01997 {
01998         dmu_buf_impl_t *db;
01999         int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
02000         return (err ? NULL : db);
02001 }
02002 
/*
 * Create the bonus dbuf for 'dn'.  Caller must hold dn_struct_rwlock
 * as writer, and the dnode must not already have a bonus dbuf.
 */
void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}
02011 
/*
 * Change the block size of a spill dbuf in transaction 'tx'.  The
 * size is clamped to [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE] and rounded
 * up to a multiple of SPA_MINBLOCKSIZE (0 means SPA_MINBLOCKSIZE).
 * Returns ENOTSUP if 'db_fake' is not the spill block.
 */
int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	if (db->db_blkid != DMU_SPILL_BLKID)
		return (ENOTSUP);
	if (blksz == 0)
		blksz = SPA_MINBLOCKSIZE;
	if (blksz > SPA_MAXBLOCKSIZE)
		blksz = SPA_MAXBLOCKSIZE;
	else
		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);

	/* dbuf_new_size() requires dn_struct_rwlock as writer. */
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dbuf_new_size(db, blksz, tx);
	rw_exit(&dn->dn_struct_rwlock);
	DB_DNODE_EXIT(db);

	return (0);
}
02036 
/*
 * Remove the spill block of 'dn' by freeing the DMU_SPILL_BLKID range
 * in transaction 'tx'.
 */
void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}
02042 
#pragma weak dmu_buf_add_ref = dbuf_add_ref
/*
 * Add a hold to an already-held dbuf.  The caller must already
 * guarantee the dbuf cannot go away (holds must exceed 1 afterwards).
 */
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}
02050 
#pragma weak dmu_buf_rele = dbuf_rele
/*
 * Drop a hold on 'db' acquired with 'tag'.  Acquires db_mtx and
 * delegates the real work (including the mutex_exit) to
 * dbuf_rele_and_unlock().
 */
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}
02067 
/*
 * Drop a hold on 'db', entered with db_mtx held; db_mtx is always
 * released before returning.  When the last hold drops, the dbuf is
 * evicted, destroyed, or left cached depending on its state and
 * cacheability.
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	/* Only remaining holds are the dirty records: run user eviction. */
	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);

			/*
			 * If the dnode moves here, we cannot cross this barrier
			 * until the move completes.
			 */
			DB_DNODE_ENTER(db);
			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
			DB_DNODE_EXIT(db);
			/*
			 * The bonus buffer's dnode hold is no longer discounted
			 * in dnode_move(). The dnode cannot move until after
			 * the dnode_rele().
			 */
			dnode_rele(DB_DNODE(db), db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db) == 1);
			dbuf_evict(db);
		} else {
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);

			/*
			 * A dbuf will be eligible for eviction if either the
			 * 'primarycache' property is set or a duplicate
			 * copy of this buffer is already cached in the arc.
			 *
			 * In the case of the 'primarycache' a buffer
			 * is considered for eviction if it matches the
			 * criteria set in the property.
			 *
			 * To decide if our buffer is considered a
			 * duplicate, we must call into the arc to determine
			 * if multiple buffers are referencing the same
			 * block on-disk. If so, then we simply evict
			 * ourselves.
			 */
			if (!DBUF_IS_CACHEABLE(db) ||
			    arc_buf_eviction_needed(db->db_buf))
				dbuf_clear(db);
			else
				mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}
02160 
#pragma weak dmu_buf_refcount = dbuf_refcount
/*
 * Return the current number of holds on 'db'.
 */
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}
02167 
02168 void *
02169 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
02170     dmu_buf_evict_func_t *evict_func)
02171 {
02172         return (dmu_buf_update_user(db_fake, NULL, user_ptr,
02173             user_data_ptr_ptr, evict_func));
02174 }
02175 
02176 void *
02177 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
02178     dmu_buf_evict_func_t *evict_func)
02179 {
02180         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
02181 
02182         db->db_immediate_evict = TRUE;
02183         return (dmu_buf_update_user(db_fake, NULL, user_ptr,
02184             user_data_ptr_ptr, evict_func));
02185 }
02186 
02187 void *
02188 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
02189     void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
02190 {
02191         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
02192         ASSERT(db->db_level == 0);
02193 
02194         ASSERT((user_ptr == NULL) == (evict_func == NULL));
02195 
02196         mutex_enter(&db->db_mtx);
02197 
02198         if (db->db_user_ptr == old_user_ptr) {
02199                 db->db_user_ptr = user_ptr;
02200                 db->db_user_data_ptr_ptr = user_data_ptr_ptr;
02201                 db->db_evict_func = evict_func;
02202 
02203                 dbuf_update_data(db);
02204         } else {
02205                 old_user_ptr = db->db_user_ptr;
02206         }
02207 
02208         mutex_exit(&db->db_mtx);
02209         return (old_user_ptr);
02210 }
02211 
02212 void *
02213 dmu_buf_get_user(dmu_buf_t *db_fake)
02214 {
02215         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
02216         ASSERT(!refcount_is_zero(&db->db_holds));
02217 
02218         return (db->db_user_ptr);
02219 }
02220 
02221 boolean_t
02222 dmu_buf_freeable(dmu_buf_t *dbuf)
02223 {
02224         boolean_t res = B_FALSE;
02225         dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
02226 
02227         if (db->db_blkptr)
02228                 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
02229                     db->db_blkptr, db->db_blkptr->blk_birth);
02230 
02231         return (res);
02232 }
02233 
/*
 * Ensure db->db_blkptr points at the physical block pointer that will
 * be written for this dbuf, hooking up db_parent if necessary.  For a
 * mid-level dbuf with no parent, db_mtx is dropped and re-acquired
 * around the parent hold.  Called in syncing context with db_mtx held.
 */
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			/* Hold the parent indirect; must drop db_mtx first. */
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		/* Point at our slot inside the parent's blkptr array. */
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}
02278 
/*
 * Sync a dirty indirect dbuf: make sure its data is in memory and its
 * block pointer is hooked up, issue the write, then recursively sync
 * the dirty children recorded under this dirty record before letting
 * the write zio proceed.  Called in syncing context.
 */
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	/* Sync all dirty children; the list must drain completely. */
	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
02324 
02325 static void
02326 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
02327 {
02328         arc_buf_t **datap = &dr->dt.dl.dr_data;
02329         dmu_buf_impl_t *db = dr->dr_dbuf;
02330         dnode_t *dn;
02331         objset_t *os;
02332         uint64_t txg = tx->tx_txg;
02333 
02334         ASSERT(dmu_tx_is_syncing(tx));
02335 
02336         dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
02337 
02338         mutex_enter(&db->db_mtx);
02339         /*
02340          * To be synced, we must be dirtied.  But we
02341          * might have been freed after the dirty.
02342          */
02343         if (db->db_state == DB_UNCACHED) {
02344                 /* This buffer has been freed since it was dirtied */
02345                 ASSERT(db->db.db_data == NULL);
02346         } else if (db->db_state == DB_FILL) {
02347                 /* This buffer was freed and is now being re-filled */
02348                 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
02349         } else {
02350                 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
02351         }
02352         DBUF_VERIFY(db);
02353 
02354         DB_DNODE_ENTER(db);
02355         dn = DB_DNODE(db);
02356 
02357         if (db->db_blkid == DMU_SPILL_BLKID) {
02358                 mutex_enter(&dn->dn_mtx);
02359                 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
02360                 mutex_exit(&dn->dn_mtx);
02361         }
02362 
02363         /*
02364          * If this is a bonus buffer, simply copy the bonus data into the
02365          * dnode.  It will be written out when the dnode is synced (and it
02366          * will be synced, since it must have been dirty for dbuf_sync to
02367          * be called).
02368          */
02369         if (db->db_blkid == DMU_BONUS_BLKID) {
02370                 dbuf_dirty_record_t **drp;
02371 
02372                 ASSERT(*datap != NULL);
02373                 ASSERT0(db->db_level);
02374                 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
02375                 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
02376                 DB_DNODE_EXIT(db);
02377 
02378                 if (*datap != db->db.db_data) {
02379                         zio_buf_free(*datap, DN_MAX_BONUSLEN);
02380                         arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
02381                 }
02382                 db->db_data_pending = NULL;
02383                 drp = &db->db_last_dirty;
02384                 while (*drp != dr)
02385                         drp = &(*drp)->dr_next;
02386                 ASSERT(dr->dr_next == NULL);
02387                 ASSERT(dr->dr_dbuf == db);
02388                 *drp = dr->dr_next;
02389                 if (dr->dr_dbuf->db_level != 0) {
02390                         list_destroy(&dr->dt.di.dr_children);
02391                         mutex_destroy(&dr->dt.di.dr_mtx);
02392                 }
02393                 kmem_free(dr, sizeof (dbuf_dirty_record_t));
02394                 ASSERT(db->db_dirtycnt > 0);
02395                 db->db_dirtycnt -= 1;
02396                 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
02397                 return;
02398         }
02399 
02400         os = dn->dn_objset;
02401 
02402         /*
02403          * This function may have dropped the db_mtx lock allowing a dmu_sync
02404          * operation to sneak in. As a result, we need to ensure that we
02405          * don't check the dr_override_state until we have returned from
02406          * dbuf_check_blkptr.
02407          */
02408         dbuf_check_blkptr(dn, db);
02409 
02410         /*
02411          * If this buffer is in the middle of an immediate write,
02412          * wait for the synchronous IO to complete.
02413          */
02414         while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
02415                 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
02416                 cv_wait(&db->db_changed, &db->db_mtx);
02417                 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
02418         }
02419 
02420         if (db->db_state != DB_NOFILL &&
02421             dn->dn_object != DMU_META_DNODE_OBJECT &&
02422             refcount_count(&db->db_holds) > 1 &&
02423             dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
02424             *datap == db->db_buf) {
02425                 /*
02426                  * If this buffer is currently "in use" (i.e., there
02427                  * are active holds and db_data still references it),
02428                  * then make a copy before we start the write so that
02429                  * any modifications from the open txg will not leak
02430                  * into this write.
02431                  *
02432                  * NOTE: this copy does not need to be made for
02433                  * objects only modified in the syncing context (e.g.
02434                  * DNONE_DNODE blocks).
02435                  */
02436                 int blksz = arc_buf_size(*datap);
02437                 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
02438                 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
02439                 bcopy(db->db.db_data, (*datap)->b_data, blksz);
02440         }
02441         /* notify that the dirty record is about to write */
02442         db->db_data_pending = dr;
02443 
02444         mutex_exit(&db->db_mtx);
02445 
02446         dbuf_write(dr, *datap, tx);
02447 
02448         ASSERT(!list_link_active(&dr->dr_dirty_node));
02449         if (dn->dn_object == DMU_META_DNODE_OBJECT) {
02450                 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
02451                 DB_DNODE_EXIT(db);
02452         } else {
02453                 /*
02454                  * Although zio_nowait() does not "wait for an IO", it does
02455                  * initiate the IO. If this is an empty write it seems plausible
02456                  * that the IO could actually be completed before the nowait
02457                  * returns. We need to DB_DNODE_EXIT() first in case
02458                  * zio_nowait() invalidates the dbuf.
02459                  */
02460                 DB_DNODE_EXIT(db);
02461                 zio_nowait(dr->dr_zio);
02462         }
02463 }
02464 
02465 void
02466 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
02467 {
02468         dbuf_dirty_record_t *dr;
02469 
02470         while (dr = list_head(list)) {
02471                 if (dr->dr_zio != NULL) {
02472                         /*
02473                          * If we find an already initialized zio then we
02474                          * are processing the meta-dnode, and we have finished.
02475                          * The dbufs for all dnodes are put back on the list
02476                          * during processing, so that we can zio_wait()
02477                          * these IOs after initiating all child IOs.
02478                          */
02479                         ASSERT3U(dr->dr_dbuf->db.db_object, ==,
02480                             DMU_META_DNODE_OBJECT);
02481                         break;
02482                 }
02483                 list_remove(list, dr);
02484                 if (dr->dr_dbuf->db_level > 0)
02485                         dbuf_sync_indirect(dr, tx);
02486                 else
02487                         dbuf_sync_leaf(dr, tx);
02488         }
02489 }
02490 
/* ARGSUSED */
/*
 * The "ready" callback for a dbuf's write zio, invoked once the final
 * block pointer (zio->io_bp) is known.  Updates the dnode's space
 * accounting for the difference between old and new blocks, and fills
 * in bp->blk_fill (the count of non-hole blocks at or beneath this BP).
 * The "buf" argument is unused (ARGSUSED); "vdb" is the dmu_buf_impl_t.
 */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT(db->db_blkptr == bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Charge the dnode for the change in allocated size between the
	 * old and new blocks.  io_prev_space_delta remembers what was
	 * already charged, so if this callback runs again for the same
	 * zio only the difference is applied.
	 */
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	/* A hole carries no fill count; nothing more to maintain. */
	if (BP_IS_HOLE(bp)) {
		ASSERT(bp->blk_fill == 0);
		DB_DNODE_EXIT(db);
		return;
	}

	/* Spill blocks carry the bonus type; all others the dnode type. */
	ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
	    BP_GET_TYPE(bp) == dn->dn_type) ||
	    (db->db_blkid == DMU_SPILL_BLKID &&
	    BP_GET_TYPE(bp) == dn->dn_bonustype));
	ASSERT(BP_GET_LEVEL(bp) == db->db_level);

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		/* A write past the current end of object grows dn_maxblkid. */
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/*
			 * A block of dnodes: the fill count is the number
			 * of in-use dnode slots it contains.
			 */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			/* Any other non-hole data block counts as one. */
			fill = 1;
		}
	} else {
		/*
		 * An indirect block: the fill count is the sum of the
		 * fill counts of its non-hole children.
		 */
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += ibp->blk_fill;
		}
	}
	DB_DNODE_EXIT(db);

	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}
02566 
/* ARGSUSED */
/*
 * The "done" callback for a dbuf's write zio, invoked when the write
 * has completed.  Accounts the block death/birth against the dataset
 * (unless this was an in-place rewrite), removes the dirty record that
 * this write resolved, arranges ARC eviction callbacks for the dbuf
 * where appropriate, wakes any waiters on db_changed, and releases the
 * dirty hold on the dbuf for this txg.  "buf" is unused (ARGSUSED).
 */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		/* An in-place rewrite leaves the block pointer unchanged. */
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		objset_t *os;
		dsl_dataset_t *ds;
		dmu_tx_t *tx;

		DB_GET_OBJSET(&os, db);
		ds = os->os_dsl_dataset;
		tx = os->os_synctx;

		/* Kill the old block and record the birth of the new one. */
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/*
	 * Now that the write is completed, the dirty record it resolves is
	 * no longer needed, so remove it.
	 */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	/* Clean up the dirty record. */
	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			if (dr->dt.dl.dr_data != db->db_buf) {
				/*
				 * What we wrote is already out of date, so
				 * just free the ARC buffer.
				 */
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db) == 1);
			} else if (!arc_released(db->db_buf)) {
				/*
				 * Our dbuf has yet to be evicted, so
				 * register a callback to clean it up once
				 * its ARC buffer is released.
				 */
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
			}
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		/*
		 * The size of an indirect block must match what its
		 * associated dnode thinks it should be.
		 */
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		/*
		 * If the dbuf's block pointer is not a hole, evict it when
		 * its last ARC buffer hold has been released.
		 */
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	/* Wake threads waiting on db_changed (e.g. for DR_IN_DMU_SYNC). */
	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	/* Release this txg's dirty hold; drops db_mtx. */
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}
02682 
02683 static void
02684 dbuf_write_nofill_ready(zio_t *zio)
02685 {
02686         dbuf_write_ready(zio, NULL, zio->io_private);
02687 }
02688 
02689 static void
02690 dbuf_write_nofill_done(zio_t *zio)
02691 {
02692         dbuf_write_done(zio, NULL, zio->io_private);
02693 }
02694 
02695 static void
02696 dbuf_write_override_ready(zio_t *zio)
02697 {
02698         dbuf_dirty_record_t *dr = zio->io_private;
02699         dmu_buf_impl_t *db = dr->dr_dbuf;
02700 
02701         dbuf_write_ready(zio, NULL, db);
02702 }
02703 
02704 static void
02705 dbuf_write_override_done(zio_t *zio)
02706 {
02707         dbuf_dirty_record_t *dr = zio->io_private;
02708         dmu_buf_impl_t *db = dr->dr_dbuf;
02709         blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
02710 
02711         mutex_enter(&db->db_mtx);
02712         if (!BP_EQUAL(zio->io_bp, obp)) {
02713                 if (!BP_IS_HOLE(obp))
02714                         dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
02715                 arc_release(dr->dt.dl.dr_data, db);
02716         }
02717         mutex_exit(&db->db_mtx);
02718 
02719         dbuf_write_done(zio, NULL, db);
02720 }
02721 
/*
 * Create (but do not issue) the write zio for dirty record "dr",
 * storing it in dr->dr_zio.  The zio is chained as a child of the
 * parent's I/O: either the pending write zio of the parent indirect
 * dbuf, or the dnode's zio when this block hangs directly off the
 * dnode.  Depending on the dbuf's state the write is an override write
 * (dr_override_state == DR_OVERRIDDEN), a NOFILL write with no data,
 * or a normal arc_write().  The caller issues dr->dr_zio afterward.
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *pio; /* parent I/O */
	int wp_flag = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/* Nobody can find the old parent in the ARC. */
		ASSERT(arc_released(parent->db_buf));
		pio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		pio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(pio);

	/* Bookmark identifying this block for the zio pipeline. */
	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	/* Choose checksum/compression/copies policy for this write. */
	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * The block was already written out (e.g. by dmu_sync());
		 * set up an override so the existing copy recorded in
		 * dr_overridden_by is adopted instead of rewriting it.
		 * Clearing dr_override_state under db_mtx must happen
		 * after the zio is created and before it is issued.
		 */
		ASSERT(db->db_state != DB_NOFILL);
		dr->dr_zio = zio_write(pio, os->os_spa, txg,
		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
		    dbuf_write_override_ready, dbuf_write_override_done, dr,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		/* NOFILL: issue a data-less write. */
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
		dr->dr_zio = zio_write(pio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		/* The common case: write the ARC buffer. */
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(pio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
		    dbuf_write_ready, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines