FreeBSD ZFS
The Zettabyte File System
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>

static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))
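/*
 * Illustrative sketch (not part of dbuf.c): how the CRC-64 folding in
 * dbuf_hash() above turns an (objset, object, level, blkid) tuple into a
 * hash-table bucket. This is a hedged, standalone model: the sketch_*
 * names are hypothetical, and the table generation mirrors the reflected
 * ECMA-182 polynomial that ZFS_CRC64_POLY uses. The bucket index is the
 * hash masked by the power-of-two table size, exactly as dbuf_find() does
 * below with hash_table_mask.
 */
#include <stdint.h>

#define	SKETCH_CRC64_POLY 0xC96C5795D7870F42ULL /* assumed ZFS_CRC64_POLY */

static uint64_t sketch_crc64_table[256];

static void
sketch_crc64_init(void)
{
	for (int i = 0; i < 256; i++) {
		uint64_t ct = (uint64_t)i;
		for (int j = 8; j > 0; j--)
			ct = (ct >> 1) ^ (-(ct & 1) & SKETCH_CRC64_POLY);
		sketch_crc64_table[i] = ct;
	}
}

static uint64_t
sketch_bucket(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid,
    uint64_t table_mask)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	/* Fold the low-entropy bytes through the table, as dbuf_hash(). */
	crc = (crc >> 8) ^ sketch_crc64_table[(crc ^ lvl) & 0xFF];
	crc = (crc >> 8) ^ sketch_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ sketch_crc64_table[(crc ^ obj) & 0xFF];
	crc = (crc >> 8) ^ sketch_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ sketch_crc64_table[(crc ^ blkid) & 0xFF];
	crc = (crc >> 8) ^ sketch_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);

	/* table_mask plays the role of h->hash_table_mask (2^n - 1). */
	return (crc & table_mask);
}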
dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx, to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	if (db->db_level > 0) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}
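/*
 * Illustrative sketch (not part of dbuf.c): the table-sizing loop in
 * dbuf_init() above, pulled out so the arithmetic is easy to check.
 * Starting from 2^16 buckets, the table doubles until one bucket exists
 * per 4K of physical memory; with 8-byte pointers that is the "2MB/GB"
 * figure from the comment. For example, 4 GiB of physical memory yields
 * 2^20 buckets, i.e. an 8 MiB pointer array.
 */
#include <stdint.h>

static uint64_t
sketch_hash_table_size(uint64_t physmem_bytes)
{
	uint64_t hsize = 1ULL << 16;

	while (hsize * 4096 < physmem_bytes)
		hsize <<= 1;
	return (hsize);	/* 4 GiB -> 2^20 buckets -> 8 MiB of void * */
}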
/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !list_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		if (db->db_state != DB_NOFILL)
			db->db_state = DB_UNCACHED;
	}
}

arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa;

		mutex_exit(&db->db_mtx);
		DB_GET_SPA(&spa, db);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_set_data(db, NULL);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}
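/*
 * Illustrative sketch (not part of dbuf.c): dbuf_whichblock() above is a
 * single shift in the common power-of-two block-size case. With 128K
 * blocks (dn_datablkshift == 17), byte offset 0x120000 lands in block 9.
 */
#include <stdint.h>

static uint64_t
sketch_whichblock(uint64_t offset, int datablkshift)
{
	return (offset >> datablkshift);	/* 0x120000 >> 17 == 9 */
}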
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	spa_t *spa;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;
	arc_buf_t *pbuf;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
		    db->db.db_size, db, type));
		DB_DNODE_EXIT(db);
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	spa = dn->dn_objset->os_spa;
	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */

	if (db->db_parent)
		pbuf = db->db_parent->db_buf;
	else
		pbuf = db->db_objset->os_phys_buf;

	(void) dsl_read(zio, spa, db->db_blkptr, pbuf,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}
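/*
 * Illustrative sketch (not part of dbuf.c): the bonus-buffer branch of
 * dbuf_read_impl() above. The buffer is always DN_MAX_BONUSLEN bytes; any
 * tail beyond the dnode's current bonus length is zeroed before the valid
 * bytes are copied in, so shrinking and regrowing the bonus never exposes
 * stale data. SKETCH_MAX_BONUSLEN is a stand-in; 320 bytes assumes the
 * classic 512-byte dnode layout.
 */
#include <string.h>

#define	SKETCH_MAX_BONUSLEN 320	/* stands in for DN_MAX_BONUSLEN */

static void
sketch_fill_bonus(char *dst, const char *src, int bonuslen)
{
	if (bonuslen < SKETCH_MAX_BONUSLEN)
		memset(dst, 0, SKETCH_MAX_BONUSLEN);	/* bzero() in-kernel */
	if (bonuslen > 0)
		memcpy(dst, src, bonuslen);		/* bcopy() in-kernel */
}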
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (EIO);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		DB_GET_SPA(&spa, db);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}
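/*
 * Illustrative sketch (not part of dbuf.c): the db_changed/db_mtx wait
 * pattern used by dbuf_read() and dbuf_noread() above, modeled with
 * pthreads in userland. The loop re-checks the state after every wakeup,
 * and whoever changes the state broadcasts, mirroring
 * cv_broadcast(&db->db_changed) in dbuf_read_done() and dbuf_fill_done().
 * All sketch_* names are hypothetical; the struct members must be
 * initialized (e.g. PTHREAD_MUTEX_INITIALIZER) before use.
 */
#include <pthread.h>

enum sketch_state { S_UNCACHED, S_READ, S_FILL, S_CACHED };

struct sketch_db {
	pthread_mutex_t mtx;		/* plays the role of db_mtx */
	pthread_cond_t changed;		/* plays the role of db_changed */
	enum sketch_state state;	/* plays the role of db_state */
};

static void
sketch_wait_settled(struct sketch_db *db)
{
	pthread_mutex_lock(&db->mtx);
	while (db->state == S_READ || db->state == S_FILL)
		pthread_cond_wait(&db->changed, &db->mtx);
	pthread_mutex_unlock(&db->mtx);
}

static void
sketch_set_state(struct sketch_db *db, enum sketch_state s)
{
	pthread_mutex_lock(&db->mtx);
	db->state = s;
	pthread_cond_broadcast(&db->changed);
	pthread_mutex_unlock(&db->mtx);
}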
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp)) {
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		zio_free(spa, txg, bp);
	}
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t first_l1 = start >> epbs;
	uint64_t last_l1 = end >> epbs;

	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
		end = dn->dn_maxblkid;
		last_l1 = end >> epbs;
	}
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level == 1 &&
		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
			mutex_enter(&db->db_mtx);
			if (db->db_last_dirty &&
			    db->db_last_dirty->dr_txg < txg) {
				dbuf_add_ref(db, FTAG);
				mutex_exit(&db->db_mtx);
				dbuf_will_dirty(db, tx);
				dbuf_rele(db, FTAG);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}

		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	/*
	 * If we don't exist or are in a snapshot, we can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (FALSE);
}
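/*
 * Illustrative sketch (not part of dbuf.c): the level-1 range arithmetic
 * in dbuf_free_range() above. epbs is log2(block pointers per indirect
 * block); assuming the default 16K indirect block and 128-byte block
 * pointers (SPA_BLKPTRSHIFT == 7), epbs = 14 - 7 = 7, so L1 dbuf N covers
 * L0 blocks [N << 7, (N + 1) << 7).
 */
#include <stdint.h>

static void
sketch_l1_range(uint64_t start, uint64_t end, int indblkshift,
    uint64_t *first_l1, uint64_t *last_l1)
{
	int epbs = indblkshift - 7;	/* 7 assumes 128-byte blkptrs */

	*first_l1 = start >> epbs;
	*last_l1 = end >> epbs;
}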
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os;
	zbookmark_t zb;

	DB_GET_OBJSET(&os, db);
	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	zb.zb_objset = os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;
	(void) arc_release_bp(db->db_buf, db,
	    db->db_blkptr, os->os_spa, &zb);
}
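/*
 * Illustrative sketch (not part of dbuf.c): the grow/shrink copy in
 * dbuf_new_size() above. The overlapping prefix is copied and, on growth,
 * the new tail is zeroed so the block never carries stale bytes. The
 * sketch_* name is hypothetical.
 */
#include <string.h>

static void
sketch_resize_copy(char *nbuf, const char *obuf, int osize, int size)
{
	/* bcopy(obuf, nbuf, MIN(osize, size)) in the kernel */
	memcpy(nbuf, obuf, osize < size ? osize : size);
	if (size > osize)
		memset(nbuf + osize, 0, size - osize);	/* bzero() remainder */
}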
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	/* Ensure that this dbuf has no transaction groups or holds */
	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/* possible race with dbuf_undirty() */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
}
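/*
 * Illustrative sketch (not part of dbuf.c): the TXG ring indexing used by
 * dbuf_dirty() above. Per-txg state (dn_dirty_records, dn_next_nlevels)
 * lives in small arrays, so a txg maps to a slot with txg & TXG_MASK and
 * the two prior txgs are (txg - 1) and (txg - 2) masked the same way.
 * This sketch assumes TXG_SIZE is 4 (so TXG_MASK is 3), as in this era of
 * the code.
 */
#include <stdint.h>

#define	SKETCH_TXG_SIZE 4
#define	SKETCH_TXG_MASK (SKETCH_TXG_SIZE - 1)

static int
sketch_txg_slot(uint64_t txg)
{
	return ((int)(txg & SKETCH_TXG_MASK));	/* txg 10 -> 2, txg 9 -> 1 */
}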
static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	mutex_enter(&db->db_mtx);
	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg) {
		mutex_exit(&db->db_mtx);
		return (0);
	}
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/*
	 * If this buffer is currently held, we cannot undirty
	 * it, since one of the current holders may be in the
	 * middle of an update.  Note that users of dbuf_undirty()
	 * should not place a hold on the dbuf before the call.
	 * Also note: we can get here with a spill block, so
	 * test for that similar to how dbuf_dirty does.
	 */
	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		/* Make sure we don't toss this buffer at sync phase */
		if (db->db_blkid != DMU_SPILL_BLKID) {
			mutex_enter(&dn->dn_mtx);
			dnode_clear_range(dn, db->db_blkid, 1, tx);
			mutex_exit(&dn->dn_mtx);
		}
		DB_DNODE_EXIT(db);
		return (0);
	}

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* XXX would be nice to fix up dn_towrite_space[] */

	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_level == 0) {
		if (db->db_state != DB_NOFILL) {
			dbuf_unoverride(dr);

			ASSERT(db->db_buf != NULL);
			ASSERT(dr->dt.dl.dr_data != NULL);
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db) == 1);
		}
	} else {
		ASSERT(db->db_buf != NULL);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		dbuf_evict(db);
		return (1);
	}

	mutex_exit(&db->db_mtx);
	return (0);
}

#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}
#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	/*
	 * If the dbuf is cached and the number of holds exceeds the number
	 * of dirty calls on it, then dirty it again and remove the buffer
	 * reference, before copying the ARC buffer to the dbuf.
	 */
	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	/* Set db->db_buf = buf */
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	/* clear db->db.db_data and tell waiters it's changed ?? */
	dbuf_fill_done(db, tx);
}
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_buf_evict(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (ENOENT);
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}
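/*
 * Illustrative sketch (not part of dbuf.c): the parent/child arithmetic in
 * dbuf_findbp() above. A level-N dbuf's parent at level N+1 holds 2^epbs
 * block pointers, so the parent's blkid is blkid >> epbs and the child's
 * slot within the parent is blkid & (2^epbs - 1). With epbs = 7, L0 blkid
 * 1000 lives in L1 dbuf 7, slot 104.
 */
#include <stdint.h>

static void
sketch_parent_slot(uint64_t blkid, int epbs,
    uint64_t *parent_blkid, uint64_t *slot)
{
	*parent_blkid = blkid >> epbs;		/* 1000 >> 7 == 7 */
	*slot = blkid & ((1ULL << epbs) - 1);	/* 1000 & 127 == 104 */
}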
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it's added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_do_evict(void *private)
{
	arc_buf_t *buf = private;
	dmu_buf_impl_t *db = buf->b_private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			list_remove(&dn->dn_dbufs, db);
			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(!list_link_active(&db->db_link));
	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if (db = dbuf_find(dn, 0, blkid)) {
		/*
		 * This dbuf is already in the cache.  We assume that
		 * it is already CACHED, or else about to be either
		 * read or filled.
		 */
		mutex_exit(&db->db_mtx);
		return;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
			    ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
			arc_buf_t *pbuf;
			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;

			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
			    dn->dn_object, 0, blkid);

			if (db)
				pbuf = db->db_buf;
			else
				pbuf = dn->dn_objset->os_phys_buf;

			(void) dsl_read(NULL, dn->dn_objset->os_spa,
			    bp, pbuf, NULL, NULL, priority,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}
/*
 * Return a dbuf for the given object/level/blkid, creating it if
 * necessary.  On success the dbuf is returned in *dbp with a hold
 * for "tag" and with its mutex dropped.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
    dmu_buf_impl_t *db, *parent = NULL;

    ASSERT(blkid != DMU_BONUS_BLKID);
    ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
    ASSERT3U(dn->dn_nlevels, >, level);

    *dbp = NULL;
top:
    /* dbuf_find() returns with db_mtx held */
    db = dbuf_find(dn, level, blkid);

    if (db == NULL) {
        blkptr_t *bp = NULL;
        int err;

        ASSERT3P(parent, ==, NULL);
        err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
        if (fail_sparse) {
            if (err == 0 && bp && BP_IS_HOLE(bp))
                err = ENOENT;
            if (err) {
                if (parent)
                    dbuf_rele(parent, NULL);
                return (err);
            }
        }
        if (err && err != ENOENT)
            return (err);
        db = dbuf_create(dn, level, blkid, parent, bp);
    }

    if (db->db_buf && refcount_is_zero(&db->db_holds)) {
        arc_buf_add_ref(db->db_buf, db);
        if (db->db_buf->b_data == NULL) {
            dbuf_clear(db);
            if (parent) {
                dbuf_rele(parent, NULL);
                parent = NULL;
            }
            goto top;
        }
        ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
    }

    ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

    /*
     * If this buffer is currently syncing out, and we are
     * still referencing it from db_data, we need to make a copy
     * of it in case we decide we want to dirty it again in this txg.
     */
    if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
        dn->dn_object != DMU_META_DNODE_OBJECT &&
        db->db_state == DB_CACHED && db->db_data_pending) {
        dbuf_dirty_record_t *dr = db->db_data_pending;

        if (dr->dt.dl.dr_data == db->db_buf) {
            arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

            dbuf_set_data(db,
                arc_buf_alloc(dn->dn_objset->os_spa,
                db->db.db_size, db, type));
            bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
                db->db.db_size);
        }
    }

    (void) refcount_add(&db->db_holds, tag);
    dbuf_update_data(db);
    DBUF_VERIFY(db);
    mutex_exit(&db->db_mtx);

    /* NOTE: we can't rele the parent until after we drop the db_mtx */
    if (parent)
        dbuf_rele(parent, NULL);

    ASSERT3P(DB_DNODE(db), ==, dn);
    ASSERT3U(db->db_blkid, ==, blkid);
    ASSERT3U(db->db_level, ==, level);
    *dbp = db;

    return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
    dmu_buf_impl_t *db;
    int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
    return (err ? NULL : db);
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
    dmu_buf_impl_t *db;
    int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
    return (err ? NULL : db);
}
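dbuf_hold() and dbuf_rele() are paired by tag. A hypothetical sketch of the calling convention (peek_block and its EIO error handling are illustrative, not part of this file):

/* sketch only: hold a level-0 dbuf, read it, release with the same tag */
static int
peek_block(dnode_t *dn, uint64_t blkid)
{
    dmu_buf_impl_t *db;

    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    db = dbuf_hold(dn, blkid, FTAG);
    rw_exit(&dn->dn_struct_rwlock);
    if (db == NULL)
        return (EIO);
    (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
    /* ... inspect db->db.db_data here ... */
    dbuf_rele(db, FTAG);
    return (0);
}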
void
dbuf_create_bonus(dnode_t *dn)
{
    ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

    ASSERT(dn->dn_bonus == NULL);
    dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}

int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    dnode_t *dn;

    if (db->db_blkid != DMU_SPILL_BLKID)
        return (ENOTSUP);
    if (blksz == 0)
        blksz = SPA_MINBLOCKSIZE;
    if (blksz > SPA_MAXBLOCKSIZE)
        blksz = SPA_MAXBLOCKSIZE;
    else
        blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
    dbuf_new_size(db, blksz, tx);
    rw_exit(&dn->dn_struct_rwlock);
    DB_DNODE_EXIT(db);

    return (0);
}

void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
    dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}
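The clamping in dbuf_spill_set_blksz() rounds up to a multiple of SPA_MINBLOCKSIZE with P2ROUNDUP(). A standalone demo of the arithmetic, assuming the sys/sysmacros.h definition of P2ROUNDUP and this era's 512-byte minimum and 128K maximum block sizes:

#include <stdio.h>
#include <stdint.h>

/* from sys/sysmacros.h */
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

int
main(void)
{
    uint64_t minbs = 512, maxbs = 131072;
    uint64_t in[] = { 0, 1000, 4096, 1 << 20 };

    for (int i = 0; i < 4; i++) {
        uint64_t b = in[i];
        if (b == 0)
            b = minbs;          /* 0 means "use the minimum" */
        if (b > maxbs)
            b = maxbs;          /* clamp, don't round */
        else
            b = P2ROUNDUP(b, minbs);
        printf("%llu -> %llu\n", (unsigned long long)in[i],
            (unsigned long long)b);
    }
    return (0);
}

P2ROUNDUP(1000, 512) yields 1024; requests above the maximum are clamped to 131072 rather than rounded.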
#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
    int64_t holds = refcount_add(&db->db_holds, tag);
    ASSERT(holds > 1);
}

/*
 * Release a hold on the given dbuf, taken with the given tag.
 */
#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
    mutex_enter(&db->db_mtx);
    dbuf_rele_and_unlock(db, tag);
}

/*
 * Like dbuf_rele(), but the caller already holds db_mtx; the mutex is
 * always dropped before returning.
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
    int64_t holds;

    ASSERT(MUTEX_HELD(&db->db_mtx));
    DBUF_VERIFY(db);

    /*
     * Remove the reference to the dbuf before removing its hold on the
     * dnode so we can guarantee in dnode_move() that a referenced bonus
     * buffer has a corresponding dnode hold.
     */
    holds = refcount_remove(&db->db_holds, tag);
    ASSERT(holds >= 0);

    /*
     * We can't freeze indirects if there is a possibility that they
     * may be modified in the current syncing context.
     */
    if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
        arc_buf_freeze(db->db_buf);

    if (holds == db->db_dirtycnt &&
        db->db_level == 0 && db->db_immediate_evict)
        dbuf_evict_user(db);

    if (holds == 0) {
        if (db->db_blkid == DMU_BONUS_BLKID) {
            mutex_exit(&db->db_mtx);

            /*
             * If the dnode moves here, we cannot cross this
             * barrier until the move completes.
             */
            DB_DNODE_ENTER(db);
            (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
            DB_DNODE_EXIT(db);
            /*
             * The bonus buffer's dnode hold is no longer
             * discounted in dnode_move().  The dnode cannot move
             * until after the dnode_rele().
             */
            dnode_rele(DB_DNODE(db), db);
        } else if (db->db_buf == NULL) {
            /*
             * This is a special case: we never associated this
             * dbuf with any data allocated from the ARC.
             */
            ASSERT(db->db_state == DB_UNCACHED ||
                db->db_state == DB_NOFILL);
            dbuf_evict(db);
        } else if (arc_released(db->db_buf)) {
            arc_buf_t *buf = db->db_buf;
            /*
             * This dbuf has anonymous data associated with it.
             */
            dbuf_set_data(db, NULL);
            VERIFY(arc_buf_remove_ref(buf, db) == 1);
            dbuf_evict(db);
        } else {
            VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);

            /*
             * A dbuf will be eligible for eviction if either the
             * 'primarycache' property is set or a duplicate
             * copy of this buffer is already cached in the ARC.
             *
             * In the case of the 'primarycache' property, a
             * buffer is considered for eviction if it matches
             * the criteria set in the property.
             *
             * To decide if our buffer is considered a
             * duplicate, we must call into the ARC to determine
             * if multiple buffers are referencing the same
             * block on-disk.  If so, then we simply evict
             * ourselves.
             */
            if (!DBUF_IS_CACHEABLE(db) ||
                arc_buf_eviction_needed(db->db_buf))
                dbuf_clear(db);
            else
                mutex_exit(&db->db_mtx);
        }
    } else {
        mutex_exit(&db->db_mtx);
    }
}

#pragma weak dmu_buf_refcount = dbuf_refcount
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
    return (refcount_count(&db->db_holds));
}
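Holds are tagged so that debug builds with reference tracking can match each release against the corresponding acquire. A minimal sketch of the discipline (borrow_dbuf is a hypothetical helper):

/* sketch only: the tag given to dbuf_rele() must match the acquire */
static void
borrow_dbuf(dmu_buf_impl_t *db, void *tag)
{
    dbuf_add_ref(db, tag);  /* asserts at least one hold already exists */
    /* ... use db while the extra hold keeps it from being evicted ... */
    dbuf_rele(db, tag);     /* tracking builds catch mismatched tags */
}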
void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
    return (dmu_buf_update_user(db_fake, NULL, user_ptr,
        user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

    db->db_immediate_evict = TRUE;
    return (dmu_buf_update_user(db_fake, NULL, user_ptr,
        user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    ASSERT(db->db_level == 0);

    ASSERT((user_ptr == NULL) == (evict_func == NULL));

    mutex_enter(&db->db_mtx);

    if (db->db_user_ptr == old_user_ptr) {
        db->db_user_ptr = user_ptr;
        db->db_user_data_ptr_ptr = user_data_ptr_ptr;
        db->db_evict_func = evict_func;

        dbuf_update_data(db);
    } else {
        old_user_ptr = db->db_user_ptr;
    }

    mutex_exit(&db->db_mtx);
    return (old_user_ptr);
}

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    ASSERT(!refcount_is_zero(&db->db_holds));

    return (db->db_user_ptr);
}

boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
    boolean_t res = B_FALSE;
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;

    if (db->db_blkptr)
        res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
            db->db_blkptr, db->db_blkptr->blk_birth);

    return (res);
}
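A hypothetical consumer of the user-data hooks above; my_node_t, my_evict, and attach_state are illustrative names, not part of the DMU. The race handling mirrors dmu_buf_update_user()'s return convention: NULL means our pointer was installed, non-NULL is the already-installed pointer.

/* sketch only: per-buffer state torn down when the dbuf is evicted */
typedef struct my_node {
    dmu_buf_t   *mn_db;
    void        *mn_state;
} my_node_t;

static void
my_evict(dmu_buf_t *db, void *user_ptr)
{
    kmem_free(user_ptr, sizeof (my_node_t));
}

static void
attach_state(dmu_buf_t *db)
{
    my_node_t *mn = kmem_zalloc(sizeof (my_node_t), KM_SLEEP);
    my_node_t *existing;

    mn->mn_db = db;
    /* NULL third arg: no user_data_ptr_ptr to keep in sync */
    existing = dmu_buf_set_user(db, mn, NULL, my_evict);
    if (existing != NULL) {
        /* someone else attached state first; use theirs */
        kmem_free(mn, sizeof (my_node_t));
    }
}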
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
    /* ASSERT(dmu_tx_is_syncing(tx)) */
    ASSERT(MUTEX_HELD(&db->db_mtx));

    if (db->db_blkptr != NULL)
        return;

    if (db->db_blkid == DMU_SPILL_BLKID) {
        db->db_blkptr = &dn->dn_phys->dn_spill;
        BP_ZERO(db->db_blkptr);
        return;
    }
    if (db->db_level == dn->dn_phys->dn_nlevels-1) {
        /*
         * This buffer was allocated at a time when there were
         * no blkptrs available in the dnode, or it was
         * inappropriate to hook it in (i.e., nlevels mismatch).
         */
        ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
        ASSERT(db->db_parent == NULL);
        db->db_parent = dn->dn_dbuf;
        db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
        DBUF_VERIFY(db);
    } else {
        dmu_buf_impl_t *parent = db->db_parent;
        int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

        ASSERT(dn->dn_phys->dn_nlevels > 1);
        if (parent == NULL) {
            mutex_exit(&db->db_mtx);
            rw_enter(&dn->dn_struct_rwlock, RW_READER);
            (void) dbuf_hold_impl(dn, db->db_level+1,
                db->db_blkid >> epbs, FALSE, db, &parent);
            rw_exit(&dn->dn_struct_rwlock);
            mutex_enter(&db->db_mtx);
            db->db_parent = parent;
        }
        db->db_blkptr = (blkptr_t *)parent->db.db_data +
            (db->db_blkid & ((1ULL << epbs) - 1));
        DBUF_VERIFY(db);
    }
}

static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
    dmu_buf_impl_t *db = dr->dr_dbuf;
    dnode_t *dn;
    zio_t *zio;

    ASSERT(dmu_tx_is_syncing(tx));

    dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

    mutex_enter(&db->db_mtx);

    ASSERT(db->db_level > 0);
    DBUF_VERIFY(db);

    /* Read the block if it hasn't been read yet. */
    if (db->db_buf == NULL) {
        mutex_exit(&db->db_mtx);
        (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
        mutex_enter(&db->db_mtx);
    }
    ASSERT3U(db->db_state, ==, DB_CACHED);
    ASSERT(db->db_buf != NULL);

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    /* Indirect block size must match what the dnode thinks it is. */
    ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
    dbuf_check_blkptr(dn, db);
    DB_DNODE_EXIT(db);

    /* Provide the pending dirty record to child dbufs */
    db->db_data_pending = dr;

    mutex_exit(&db->db_mtx);
    dbuf_write(dr, db->db_buf, tx);

    zio = dr->dr_zio;
    mutex_enter(&dr->dt.di.dr_mtx);
    dbuf_sync_list(&dr->dt.di.dr_children, tx);
    ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
    mutex_exit(&dr->dt.di.dr_mtx);
    zio_nowait(zio);
}
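The blkid >> epbs and blkid & ((1 << epbs) - 1) arithmetic in dbuf_check_blkptr() maps a child block to its parent indirect block and to a slot within it. A standalone demo, assuming a 16K indirect block size (the on-disk block pointer is 128 bytes, hence SPA_BLKPTRSHIFT of 7):

#include <stdio.h>
#include <stdint.h>

#define SPA_BLKPTRSHIFT 7       /* sizeof (blkptr_t) == 128 */

int
main(void)
{
    int indblkshift = 14;                       /* assumed 16K indirect */
    int epbs = indblkshift - SPA_BLKPTRSHIFT;   /* 7: 128 bps per block */
    uint64_t blkid = 1000;

    printf("parent blkid:   %llu\n",
        (unsigned long long)(blkid >> epbs));           /* 7 */
    printf("slot in parent: %llu\n",
        (unsigned long long)(blkid & ((1ULL << epbs) - 1)));    /* 104 */
    return (0);
}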
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
    arc_buf_t **datap = &dr->dt.dl.dr_data;
    dmu_buf_impl_t *db = dr->dr_dbuf;
    dnode_t *dn;
    objset_t *os;
    uint64_t txg = tx->tx_txg;

    ASSERT(dmu_tx_is_syncing(tx));

    dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

    mutex_enter(&db->db_mtx);
    /*
     * To be synced, we must be dirtied.  But we
     * might have been freed after the dirty.
     */
    if (db->db_state == DB_UNCACHED) {
        /* This buffer has been freed since it was dirtied */
        ASSERT(db->db.db_data == NULL);
    } else if (db->db_state == DB_FILL) {
        /* This buffer was freed and is now being re-filled */
        ASSERT(db->db.db_data != dr->dt.dl.dr_data);
    } else {
        ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
    }
    DBUF_VERIFY(db);

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);

    if (db->db_blkid == DMU_SPILL_BLKID) {
        mutex_enter(&dn->dn_mtx);
        dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
        mutex_exit(&dn->dn_mtx);
    }

    /*
     * If this is a bonus buffer, simply copy the bonus data into the
     * dnode.  It will be written out when the dnode is synced (and it
     * will be synced, since it must have been dirty for dbuf_sync to
     * be called).
     */
    if (db->db_blkid == DMU_BONUS_BLKID) {
        dbuf_dirty_record_t **drp;

        ASSERT(*datap != NULL);
        ASSERT0(db->db_level);
        ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
        bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
        DB_DNODE_EXIT(db);

        if (*datap != db->db.db_data) {
            zio_buf_free(*datap, DN_MAX_BONUSLEN);
            arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
        }
        db->db_data_pending = NULL;
        drp = &db->db_last_dirty;
        while (*drp != dr)
            drp = &(*drp)->dr_next;
        ASSERT(dr->dr_next == NULL);
        ASSERT(dr->dr_dbuf == db);
        *drp = dr->dr_next;
        if (dr->dr_dbuf->db_level != 0) {
            list_destroy(&dr->dt.di.dr_children);
            mutex_destroy(&dr->dt.di.dr_mtx);
        }
        kmem_free(dr, sizeof (dbuf_dirty_record_t));
        ASSERT(db->db_dirtycnt > 0);
        db->db_dirtycnt -= 1;
        dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
        return;
    }

    os = dn->dn_objset;

    /*
     * This function may have dropped the db_mtx lock allowing a dmu_sync
     * operation to sneak in.  As a result, we need to ensure that we
     * don't check the dr_override_state until we have returned from
     * dbuf_check_blkptr.
     */
    dbuf_check_blkptr(dn, db);

    /*
     * If this buffer is in the middle of an immediate write,
     * wait for the synchronous IO to complete.
     */
    while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
        cv_wait(&db->db_changed, &db->db_mtx);
        ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
    }

    if (db->db_state != DB_NOFILL &&
        dn->dn_object != DMU_META_DNODE_OBJECT &&
        refcount_count(&db->db_holds) > 1 &&
        dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
        *datap == db->db_buf) {
        /*
         * If this buffer is currently "in use" (i.e., there
         * are active holds and db_data still references it),
         * then make a copy before we start the write so that
         * any modifications from the open txg will not leak
         * into this write.
         *
         * NOTE: this copy does not need to be made for
         * objects only modified in the syncing context (e.g.
         * DMU_OT_DNODE blocks).
         */
        int blksz = arc_buf_size(*datap);
        arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
        *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
        bcopy(db->db.db_data, (*datap)->b_data, blksz);
    }
    /* notify that the dirty record is about to write */
    db->db_data_pending = dr;

    mutex_exit(&db->db_mtx);

    dbuf_write(dr, *datap, tx);

    ASSERT(!list_link_active(&dr->dr_dirty_node));
    if (dn->dn_object == DMU_META_DNODE_OBJECT) {
        list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
        DB_DNODE_EXIT(db);
    } else {
        /*
         * Although zio_nowait() does not "wait for an IO", it does
         * initiate the IO.  If this is an empty write it seems
         * plausible that the IO could actually be completed before
         * the nowait returns.  We need to DB_DNODE_EXIT() first in
         * case zio_nowait() invalidates the dbuf.
         */
        DB_DNODE_EXIT(db);
        zio_nowait(dr->dr_zio);
    }
}

void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
    dbuf_dirty_record_t *dr;

    while ((dr = list_head(list)) != NULL) {
        if (dr->dr_zio != NULL) {
            /*
             * If we find an already initialized zio then we
             * are processing the meta-dnode, and we have finished.
             * The dbufs for all dnodes are put back on the list
             * during processing, so that we can zio_wait()
             * these IOs after initiating all child IOs.
             */
            ASSERT3U(dr->dr_dbuf->db.db_object, ==,
                DMU_META_DNODE_OBJECT);
            break;
        }
        list_remove(list, dr);
        if (dr->dr_dbuf->db_level > 0)
            dbuf_sync_indirect(dr, tx);
        else
            dbuf_sync_leaf(dr, tx);
    }
}
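dbuf_sync_leaf() unlinks the dirty record with the classic pointer-to-pointer idiom, which splices out the head of the list without a special case. A standalone demo of the same idiom (rec_t and unlink_rec are illustrative):

#include <stdio.h>

typedef struct rec {
    struct rec  *r_next;
    int         r_txg;
} rec_t;

static void
unlink_rec(rec_t **headp, rec_t *dr)
{
    rec_t **drp = headp;

    while (*drp != dr)          /* walk the link slots; no prev pointer */
        drp = &(*drp)->r_next;
    *drp = dr->r_next;          /* splice out; head case needs no test */
}

int
main(void)
{
    rec_t a = { NULL, 1 }, b = { &a, 2 }, c = { &b, 3 };
    rec_t *head = &c;

    unlink_rec(&head, &b);
    for (rec_t *r = head; r != NULL; r = r->r_next)
        printf("txg %d\n", r->r_txg);   /* prints 3 then 1 */
    return (0);
}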
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
    dmu_buf_impl_t *db = vdb;
    dnode_t *dn;
    blkptr_t *bp = zio->io_bp;
    blkptr_t *bp_orig = &zio->io_bp_orig;
    spa_t *spa = zio->io_spa;
    int64_t delta;
    uint64_t fill = 0;
    int i;

    ASSERT(db->db_blkptr == bp);

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
    dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
    zio->io_prev_space_delta = delta;

    if (BP_IS_HOLE(bp)) {
        ASSERT(bp->blk_fill == 0);
        DB_DNODE_EXIT(db);
        return;
    }

    ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
        BP_GET_TYPE(bp) == dn->dn_type) ||
        (db->db_blkid == DMU_SPILL_BLKID &&
        BP_GET_TYPE(bp) == dn->dn_bonustype));
    ASSERT(BP_GET_LEVEL(bp) == db->db_level);

    mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
    if (db->db_blkid == DMU_SPILL_BLKID) {
        ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
        ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
            db->db_blkptr == &dn->dn_phys->dn_spill);
    }
#endif

    if (db->db_level == 0) {
        mutex_enter(&dn->dn_mtx);
        if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
            db->db_blkid != DMU_SPILL_BLKID)
            dn->dn_phys->dn_maxblkid = db->db_blkid;
        mutex_exit(&dn->dn_mtx);

        if (dn->dn_type == DMU_OT_DNODE) {
            dnode_phys_t *dnp = db->db.db_data;
            for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
                i--, dnp++) {
                if (dnp->dn_type != DMU_OT_NONE)
                    fill++;
            }
        } else {
            fill = 1;
        }
    } else {
        blkptr_t *ibp = db->db.db_data;
        ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
        for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
            if (BP_IS_HOLE(ibp))
                continue;
            fill += ibp->blk_fill;
        }
    }
    DB_DNODE_EXIT(db);

    bp->blk_fill = fill;

    mutex_exit(&db->db_mtx);
}
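A toy model of the fill accounting in dbuf_write_ready(): an indirect block's blk_fill is the sum of the fills of its non-hole children. Here a fill of 0 stands in for BP_IS_HOLE():

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    uint64_t child_fill[] = { 3, 0, 5, 0, 1 };  /* 0 models a hole */
    uint64_t fill = 0;

    for (int i = 0; i < 5; i++) {
        if (child_fill[i] == 0)                 /* BP_IS_HOLE() analogue */
            continue;
        fill += child_fill[i];
    }
    printf("blk_fill = %llu\n", (unsigned long long)fill);      /* 9 */
    return (0);
}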
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
    dmu_buf_impl_t *db = vdb;
    blkptr_t *bp = zio->io_bp;
    blkptr_t *bp_orig = &zio->io_bp_orig;
    uint64_t txg = zio->io_txg;
    dbuf_dirty_record_t **drp, *dr;

    ASSERT0(zio->io_error);
    ASSERT(db->db_blkptr == bp);

    if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
        ASSERT(BP_EQUAL(bp, bp_orig));
    } else {
        objset_t *os;
        dsl_dataset_t *ds;
        dmu_tx_t *tx;

        DB_GET_OBJSET(&os, db);
        ds = os->os_dsl_dataset;
        tx = os->os_synctx;

        (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
        dsl_dataset_block_born(ds, bp, tx);
    }

    mutex_enter(&db->db_mtx);

    DBUF_VERIFY(db);

    /*
     * Now that the write is completed, the dirty record it resolves is
     * no longer needed, so remove it.
     */
    drp = &db->db_last_dirty;
    while ((dr = *drp) != db->db_data_pending)
        drp = &dr->dr_next;
    ASSERT(!list_link_active(&dr->dr_dirty_node));
    ASSERT(dr->dr_txg == txg);
    ASSERT(dr->dr_dbuf == db);
    ASSERT(dr->dr_next == NULL);
    *drp = dr->dr_next;

#ifdef ZFS_DEBUG
    if (db->db_blkid == DMU_SPILL_BLKID) {
        dnode_t *dn;

        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
        ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
            db->db_blkptr == &dn->dn_phys->dn_spill);
        DB_DNODE_EXIT(db);
    }
#endif

    /* Clean up the dirty record. */
    if (db->db_level == 0) {
        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
        ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
        if (db->db_state != DB_NOFILL) {
            if (dr->dt.dl.dr_data != db->db_buf) {
                /*
                 * What we wrote is already out of date, so
                 * just free the ARC buffer.
                 */
                VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
                    db) == 1);
            } else if (!arc_released(db->db_buf)) {
                /*
                 * Our dbuf has yet to be evicted, so
                 * register a callback to clean it up once
                 * its ARC buffer is released.
                 */
                arc_set_callback(db->db_buf, dbuf_do_evict, db);
            }
        }
    } else {
        dnode_t *dn;

        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
        /*
         * The size of an indirect block must match what its
         * associated dnode thinks it should be.
         */
        ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
        /*
         * If the dbuf's block pointer is not a hole, evict it when
         * its last ARC buffer hold has been released.
         */
        if (!BP_IS_HOLE(db->db_blkptr)) {
            int epbs =
                dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
            ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
                db->db.db_size);
            ASSERT3U(dn->dn_phys->dn_maxblkid
                >> (db->db_level * epbs), >=, db->db_blkid);
            arc_set_callback(db->db_buf, dbuf_do_evict, db);
        }
        DB_DNODE_EXIT(db);
        mutex_destroy(&dr->dt.di.dr_mtx);
        list_destroy(&dr->dt.di.dr_children);
    }
    kmem_free(dr, sizeof (dbuf_dirty_record_t));

    cv_broadcast(&db->db_changed);
    ASSERT(db->db_dirtycnt > 0);
    db->db_dirtycnt -= 1;
    db->db_data_pending = NULL;
    dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}

static void
dbuf_write_nofill_ready(zio_t *zio)
{
    dbuf_write_ready(zio, NULL, zio->io_private);
}

static void
dbuf_write_nofill_done(zio_t *zio)
{
    dbuf_write_done(zio, NULL, zio->io_private);
}

static void
dbuf_write_override_ready(zio_t *zio)
{
    dbuf_dirty_record_t *dr = zio->io_private;
    dmu_buf_impl_t *db = dr->dr_dbuf;

    dbuf_write_ready(zio, NULL, db);
}

static void
dbuf_write_override_done(zio_t *zio)
{
    dbuf_dirty_record_t *dr = zio->io_private;
    dmu_buf_impl_t *db = dr->dr_dbuf;
    blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

    mutex_enter(&db->db_mtx);
    if (!BP_EQUAL(zio->io_bp, obp)) {
        if (!BP_IS_HOLE(obp))
            dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
        arc_release(dr->dt.dl.dr_data, db);
    }
    mutex_exit(&db->db_mtx);

    dbuf_write_done(zio, NULL, db);
}
/*
 * Issue the I/O for the given dirty record's data, as a child of the
 * parent dbuf's I/O (or of the dnode's zio if the parent is the dnode).
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
    dmu_buf_impl_t *db = dr->dr_dbuf;
    dnode_t *dn;
    objset_t *os;
    dmu_buf_impl_t *parent = db->db_parent;
    uint64_t txg = tx->tx_txg;
    zbookmark_t zb;
    zio_prop_t zp;
    zio_t *pio; /* parent I/O */
    int wp_flag = 0;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    os = dn->dn_objset;

    if (db->db_state != DB_NOFILL) {
        if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
            /*
             * Private object buffers are released here rather
             * than in dbuf_dirty() since they are only modified
             * in the syncing context and we don't want the
             * overhead of making multiple copies of the data.
             */
            if (BP_IS_HOLE(db->db_blkptr)) {
                arc_buf_thaw(data);
            } else {
                dbuf_release_bp(db);
            }
        }
    }

    if (parent != dn->dn_dbuf) {
        /* Our parent is an indirect block. */
        /* We have a dirty parent that has been scheduled for write. */
        ASSERT(parent && parent->db_data_pending);
        /* Our parent's buffer is one level closer to the dnode. */
        ASSERT(db->db_level == parent->db_level-1);
        /* Nobody can find the old parent in the ARC. */
        ASSERT(arc_released(parent->db_buf));
        pio = parent->db_data_pending->dr_zio;
    } else {
        /* Our parent is the dnode itself. */
        ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
            db->db_blkid != DMU_SPILL_BLKID) ||
            (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
        if (db->db_blkid != DMU_SPILL_BLKID)
            ASSERT3P(db->db_blkptr, ==,
                &dn->dn_phys->dn_blkptr[db->db_blkid]);
        pio = dn->dn_zio;
    }

    ASSERT(db->db_level == 0 || data == db->db_buf);
    ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
    ASSERT(pio);

    SET_BOOKMARK(&zb, os->os_dsl_dataset ?
        os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
        db->db.db_object, db->db_level, db->db_blkid);

    if (db->db_blkid == DMU_SPILL_BLKID)
        wp_flag = WP_SPILL;
    wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

    dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
    DB_DNODE_EXIT(db);

    if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
        ASSERT(db->db_state != DB_NOFILL);
        dr->dr_zio = zio_write(pio, os->os_spa, txg,
            db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
            dbuf_write_override_ready, dbuf_write_override_done, dr,
            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
        mutex_enter(&db->db_mtx);
        dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
        zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
            dr->dt.dl.dr_copies);
        mutex_exit(&db->db_mtx);
    } else if (db->db_state == DB_NOFILL) {
        ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
        dr->dr_zio = zio_write(pio, os->os_spa, txg,
            db->db_blkptr, NULL, db->db.db_size, &zp,
            dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
            ZIO_PRIORITY_ASYNC_WRITE,
            ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
    } else {
        ASSERT(arc_released(data));
        dr->dr_zio = arc_write(pio, os->os_spa, txg,
            db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
            dbuf_write_ready, dbuf_write_done, db,
            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
    }
}
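To summarize dbuf_write()'s dispatch, a standalone restatement of the three paths (the strings are descriptive only; the real function builds the corresponding zio):

#include <stdio.h>

static const char *
write_path(int level0, int overridden, int nofill)
{
    if (level0 && overridden)
        return ("zio_write + zio_write_override (dmu_sync case)");
    if (nofill)
        return ("zio_write, NULL data, ZIO_FLAG_NODATA");
    return ("arc_write (normal case: data owned by the ARC)");
}

int
main(void)
{
    printf("%s\n", write_path(1, 1, 0));
    printf("%s\n", write_path(0, 0, 1));
    printf("%s\n", write_path(1, 0, 0));
    return (0);
}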