FreeBSD ZFS
The Zettabyte File System
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_prop.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/sa.h>
#ifdef _KERNEL
#include <sys/zfs_znode.h>
#endif

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
    { DMU_BSWAP_UINT8, TRUE, "unallocated" },
    { DMU_BSWAP_ZAP, TRUE, "object directory" },
    { DMU_BSWAP_UINT64, TRUE, "object array" },
    { DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
    { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
    { DMU_BSWAP_UINT64, TRUE, "bpobj" },
    { DMU_BSWAP_UINT64, TRUE, "bpobj header" },
    { DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
    { DMU_BSWAP_UINT64, TRUE, "SPA space map" },
    { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
    { DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
    { DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
    { DMU_BSWAP_UINT64, TRUE, "DSL directory" },
    { DMU_BSWAP_ZAP, TRUE, "DSL directory child map" },
    { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
    { DMU_BSWAP_ZAP, TRUE, "DSL props" },
    { DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
    { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
    { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
    { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
    { DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
    { DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
    { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
    { DMU_BSWAP_UINT8, FALSE, "zvol object" },
    { DMU_BSWAP_ZAP, TRUE, "zvol prop" },
    { DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
    { DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
    { DMU_BSWAP_ZAP, TRUE, "other ZAP" },
    { DMU_BSWAP_ZAP, TRUE, "persistent error log" },
    { DMU_BSWAP_UINT8, TRUE, "SPA history" },
    { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
    { DMU_BSWAP_ZAP, TRUE, "Pool properties" },
    { DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
    { DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
    { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
    { DMU_BSWAP_UINT8, TRUE, "FUID table" },
    { DMU_BSWAP_UINT64, TRUE, "FUID table size" },
    { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones" },
    { DMU_BSWAP_ZAP, TRUE, "scan work queue" },
    { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
    { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
    { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags" },
    { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
    { DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
    { DMU_BSWAP_UINT8, TRUE, "System attributes" },
    { DMU_BSWAP_ZAP, TRUE, "SA master node" },
    { DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
    { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
    { DMU_BSWAP_ZAP, TRUE, "scan translations" },
    { DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
    { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
    { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
    { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
    { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
};

const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
    { byteswap_uint8_array, "uint8" },
    { byteswap_uint16_array, "uint16" },
    { byteswap_uint32_array, "uint32" },
    { byteswap_uint64_array, "uint64" },
    { zap_byteswap, "zap" },
    { dnode_buf_byteswap, "dnode" },
    { dmu_objset_byteswap, "objset" },
    { zfs_znode_byteswap, "znode" },
    { zfs_oldacl_byteswap, "oldacl" },
    { zfs_acl_byteswap, "acl" }
};

int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp, int flags)
{
    dnode_t *dn;
    uint64_t blkid;
    dmu_buf_impl_t *db;
    int err;
    int db_flags = DB_RF_CANFAIL;

    if (flags & DMU_READ_NO_PREFETCH)
        db_flags |= DB_RF_NOPREFETCH;

    err = dnode_hold(os, object, FTAG, &dn);
    if (err)
        return (err);
    blkid = dbuf_whichblock(dn, offset);
    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    db = dbuf_hold(dn, blkid, tag);
    rw_exit(&dn->dn_struct_rwlock);
    if (db == NULL) {
        err = EIO;
    } else {
        err = dbuf_read(db, NULL, db_flags);
        if (err) {
            dbuf_rele(db, tag);
            db = NULL;
        }
    }

    dnode_rele(dn, FTAG);
    *dbp = &db->db;	/* NULL db plus first field offset is NULL */
    return (err);
}

int
dmu_bonus_max(void)
{
    return (DN_MAX_BONUSLEN);
}

int
dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    dnode_t *dn;
    int error;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);

    if (dn->dn_bonus != db) {
        error = EINVAL;
    } else if (newsize < 0 || newsize > db_fake->db_size) {
        error = EINVAL;
    } else {
        dnode_setbonuslen(dn, newsize, tx);
        error = 0;
    }

    DB_DNODE_EXIT(db);
    return (error);
}

int
dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    dnode_t *dn;
    int error;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);

    if (!DMU_OT_IS_VALID(type)) {
        error = EINVAL;
    } else if (dn->dn_bonus != db) {
        error = EINVAL;
    } else {
        dnode_setbonus_type(dn, type, tx);
        error = 0;
    }

    DB_DNODE_EXIT(db);
    return (error);
}
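/*
 * Editor's note: the sketch below is not part of the original file.  It is
 * a minimal illustration of the dmu_buf_hold()/dmu_buf_rele() pattern,
 * assuming the caller already has an object set "os" and a valid object
 * number.  The guard macro DMU_USAGE_EXAMPLES is hypothetical and keeps
 * the sketch out of any real build.
 */
#ifdef DMU_USAGE_EXAMPLES
static int
example_peek_block(objset_t *os, uint64_t object, uint64_t offset)
{
    dmu_buf_t *db;
    int err;

    /* Hold (and read in) the block covering "offset". */
    err = dmu_buf_hold(os, object, offset, FTAG, &db, DMU_READ_PREFETCH);
    if (err != 0)
        return (err);

    /* db->db_data now points at db->db_size bytes of file data. */

    dmu_buf_rele(db, FTAG);
    return (0);
}
#endif	/* DMU_USAGE_EXAMPLES */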
dmu_object_type_t
dmu_get_bonustype(dmu_buf_t *db_fake)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    dnode_t *dn;
    dmu_object_type_t type;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    type = dn->dn_bonustype;
    DB_DNODE_EXIT(db);

    return (type);
}

int
dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
    dnode_t *dn;
    int error;

    error = dnode_hold(os, object, FTAG, &dn);
    if (error != 0)
        return (error);
    dbuf_rm_spill(dn, tx);
    rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
    dnode_rm_spill(dn, tx);
    rw_exit(&dn->dn_struct_rwlock);
    dnode_rele(dn, FTAG);
    return (error);
}

int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
    dnode_t *dn;
    dmu_buf_impl_t *db;
    int error;

    error = dnode_hold(os, object, FTAG, &dn);
    if (error)
        return (error);

    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    if (dn->dn_bonus == NULL) {
        rw_exit(&dn->dn_struct_rwlock);
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
        if (dn->dn_bonus == NULL)
            dbuf_create_bonus(dn);
    }
    db = dn->dn_bonus;

    /* as long as the bonus buf is held, the dnode will be held */
    if (refcount_add(&db->db_holds, tag) == 1) {
        VERIFY(dnode_add_ref(dn, db));
        (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
    }

    /*
     * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
     * hold and incrementing the dbuf count to ensure that dnode_move() sees
     * a dnode hold for every dbuf.
     */
    rw_exit(&dn->dn_struct_rwlock);

    dnode_rele(dn, FTAG);

    VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));

    *dbp = &db->db;
    return (0);
}

int
dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
{
    dmu_buf_impl_t *db = NULL;
    int err;

    if ((flags & DB_RF_HAVESTRUCT) == 0)
        rw_enter(&dn->dn_struct_rwlock, RW_READER);

    db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);

    if ((flags & DB_RF_HAVESTRUCT) == 0)
        rw_exit(&dn->dn_struct_rwlock);

    ASSERT(db != NULL);
    err = dbuf_read(db, NULL, flags);
    if (err == 0)
        *dbp = &db->db;
    else
        dbuf_rele(db, tag);
    return (err);
}

int
dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
    dnode_t *dn;
    int err;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);

    if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
        err = EINVAL;
    } else {
        rw_enter(&dn->dn_struct_rwlock, RW_READER);

        if (!dn->dn_have_spill) {
            err = ENOENT;
        } else {
            err = dmu_spill_hold_by_dnode(dn,
                DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
        }

        rw_exit(&dn->dn_struct_rwlock);
    }

    DB_DNODE_EXIT(db);
    return (err);
}

int
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
    dnode_t *dn;
    int err;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
    DB_DNODE_EXIT(db);

    return (err);
}
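/*
 * Editor's note: another illustrative sketch (not in the original source).
 * It shows the intended pairing of dmu_bonus_hold() with
 * dmu_spill_hold_existing() for objects that may have pushed attributes
 * into a spill block; the guard macro is again hypothetical.
 */
#ifdef DMU_USAGE_EXAMPLES
static int
example_peek_spill(objset_t *os, uint64_t object)
{
    dmu_buf_t *bonus, *spill;
    int err;

    err = dmu_bonus_hold(os, object, FTAG, &bonus);
    if (err != 0)
        return (err);

    err = dmu_spill_hold_existing(bonus, FTAG, &spill);
    if (err == 0) {
        /* spill->db_data holds the overflow attributes. */
        dmu_buf_rele(spill, FTAG);
    }
    /* ENOENT simply means the object has no spill block. */

    dmu_buf_rele(bonus, FTAG);
    return (err == ENOENT ? 0 : err);
}
#endif	/* DMU_USAGE_EXAMPLES */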
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
    dsl_pool_t *dp = NULL;
    dmu_buf_t **dbp;
    uint64_t blkid, nblks, i;
    uint32_t dbuf_flags;
    int err;
    zio_t *zio;
    hrtime_t start;

    ASSERT(length <= DMU_MAX_ACCESS);

    dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
    if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
        dbuf_flags |= DB_RF_NOPREFETCH;

    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    if (dn->dn_datablkshift) {
        int blkshift = dn->dn_datablkshift;
        nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
            P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
    } else {
        if (offset + length > dn->dn_datablksz) {
            zfs_panic_recover("zfs: accessing past end of object "
                "%llx/%llx (size=%u access=%llu+%llu)",
                (longlong_t)dn->dn_objset->
                os_dsl_dataset->ds_object,
                (longlong_t)dn->dn_object, dn->dn_datablksz,
                (longlong_t)offset, (longlong_t)length);
            rw_exit(&dn->dn_struct_rwlock);
            return (EIO);
        }
        nblks = 1;
    }
    dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

    if (dn->dn_objset->os_dsl_dataset)
        dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
    if (dp && dsl_pool_sync_context(dp))
        start = gethrtime();
    zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
    blkid = dbuf_whichblock(dn, offset);
    for (i = 0; i < nblks; i++) {
        dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
        if (db == NULL) {
            rw_exit(&dn->dn_struct_rwlock);
            dmu_buf_rele_array(dbp, nblks, tag);
            zio_nowait(zio);
            return (EIO);
        }
        /* initiate async i/o */
        if (read)
            (void) dbuf_read(db, zio, dbuf_flags);
#ifdef _KERNEL
        else
            curthread->td_ru.ru_oublock++;
#endif
        dbp[i] = &db->db;
    }
    rw_exit(&dn->dn_struct_rwlock);

    /* wait for async i/o */
    err = zio_wait(zio);
    /* track read overhead when we are in sync context */
    if (dp && dsl_pool_sync_context(dp))
        dp->dp_read_overhead += gethrtime() - start;
    if (err) {
        dmu_buf_rele_array(dbp, nblks, tag);
        return (err);
    }

    /* wait for other io to complete */
    if (read) {
        for (i = 0; i < nblks; i++) {
            dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
            mutex_enter(&db->db_mtx);
            while (db->db_state == DB_READ ||
                db->db_state == DB_FILL)
                cv_wait(&db->db_changed, &db->db_mtx);
            if (db->db_state == DB_UNCACHED)
                err = EIO;
            mutex_exit(&db->db_mtx);
            if (err) {
                dmu_buf_rele_array(dbp, nblks, tag);
                return (err);
            }
        }
    }

    *numbufsp = nblks;
    *dbpp = dbp;
    return (0);
}

static int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
    dnode_t *dn;
    int err;

    err = dnode_hold(os, object, FTAG, &dn);
    if (err)
        return (err);

    err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
        numbufsp, dbpp, DMU_READ_PREFETCH);

    dnode_rele(dn, FTAG);

    return (err);
}
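/*
 * Editor's note: illustrative sketch only (not original code).  It walks
 * a range of an object through the array-hold interface declared in
 * sys/dmu.h (dmu_buf_hold_array_by_bonus(), defined just below) and
 * releases every buffer in one call; the guard macro is hypothetical.
 */
#ifdef DMU_USAGE_EXAMPLES
static int
example_walk_range(dmu_buf_t *bonus, uint64_t offset, uint64_t length)
{
    dmu_buf_t **dbp;
    int numbufs, i, err;

    err = dmu_buf_hold_array_by_bonus(bonus, offset, length,
        TRUE, FTAG, &numbufs, &dbp);
    if (err != 0)
        return (err);

    for (i = 0; i < numbufs; i++) {
        /* dbp[i]->db_data covers dbp[i]->db_size bytes. */
    }

    dmu_buf_rele_array(dbp, numbufs, FTAG);
    return (0);
}
#endif	/* DMU_USAGE_EXAMPLES */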
int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    dnode_t *dn;
    int err;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
        numbufsp, dbpp, DMU_READ_PREFETCH);
    DB_DNODE_EXIT(db);

    return (err);
}

void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
    int i;
    dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

    if (numbufs == 0)
        return;

    for (i = 0; i < numbufs; i++) {
        if (dbp[i])
            dbuf_rele(dbp[i], tag);
    }

    kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}

void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
    dnode_t *dn;
    uint64_t blkid;
    int nblks, i, err;

    if (zfs_prefetch_disable)
        return;

    if (len == 0) {		/* they're interested in the bonus buffer */
        dn = DMU_META_DNODE(os);

        if (object == 0 || object >= DN_MAX_OBJECT)
            return;

        rw_enter(&dn->dn_struct_rwlock, RW_READER);
        blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
        dbuf_prefetch(dn, blkid);
        rw_exit(&dn->dn_struct_rwlock);
        return;
    }

    /*
     * XXX - Note, if the dnode for the requested object is not
     * already cached, we will do a *synchronous* read in the
     * dnode_hold() call.  The same is true for any indirects.
     */
    err = dnode_hold(os, object, FTAG, &dn);
    if (err != 0)
        return;

    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    if (dn->dn_datablkshift) {
        int blkshift = dn->dn_datablkshift;
        nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
            P2ALIGN(offset, 1<<blkshift)) >> blkshift;
    } else {
        nblks = (offset < dn->dn_datablksz);
    }

    if (nblks != 0) {
        blkid = dbuf_whichblock(dn, offset);
        for (i = 0; i < nblks; i++)
            dbuf_prefetch(dn, blkid+i);
    }

    rw_exit(&dn->dn_struct_rwlock);

    dnode_rele(dn, FTAG);
}

static int
get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
{
    uint64_t len = *start - limit;
    uint64_t blkcnt = 0;
    uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
    uint64_t iblkrange =
        dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);

    ASSERT(limit <= *start);

    if (len <= iblkrange * maxblks) {
        *start = limit;
        return (0);
    }
    ASSERT(ISP2(iblkrange));

    while (*start > limit && blkcnt < maxblks) {
        int err;

        /* find next allocated L1 indirect */
        err = dnode_next_offset(dn,
            DNODE_FIND_BACKWARDS, start, 2, 1, 0);

        /* if there are no more, then we are done */
        if (err == ESRCH) {
            *start = limit;
            return (0);
        } else if (err) {
            return (err);
        }
        blkcnt += 1;

        /* reset offset to end of "next" block back */
        *start = P2ALIGN(*start, iblkrange);
        if (*start <= limit)
            *start = limit;
        else
            *start -= 1;
    }
    return (0);
}
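/*
 * Editor's note: illustrative sketch (not original code), assuming a
 * scan-style workload that knows it will read "len" bytes of "object"
 * shortly.  The guard macro is hypothetical.
 */
#ifdef DMU_USAGE_EXAMPLES
static void
example_warm_cache(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t len)
{
    /*
     * Fire-and-forget: the block reads are issued asynchronously and
     * there is no completion callback, so nothing to release or wait
     * for (though, per the XXX above, dnode_hold() itself may read
     * synchronously if the dnode is not cached).
     */
    dmu_prefetch(os, object, offset, len);

    /* len == 0 means "prefetch the dnode/bonus area" instead. */
    dmu_prefetch(os, object, 0, 0);
}
#endif	/* DMU_USAGE_EXAMPLES */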
static int
dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
    uint64_t length, boolean_t free_dnode)
{
    dmu_tx_t *tx;
    uint64_t object_size, start, end, len;
    boolean_t trunc = (length == DMU_OBJECT_END);
    int align, err;

    align = 1 << dn->dn_datablkshift;
    ASSERT(align > 0);
    object_size = align == 1 ? dn->dn_datablksz :
        (dn->dn_maxblkid + 1) << dn->dn_datablkshift;

    end = offset + length;
    if (trunc || end > object_size)
        end = object_size;
    if (end <= offset)
        return (0);
    length = end - offset;

    while (length) {
        start = end;
        /* assert(offset <= start) */
        err = get_next_chunk(dn, &start, offset);
        if (err)
            return (err);
        len = trunc ? DMU_OBJECT_END : end - start;

        tx = dmu_tx_create(os);
        dmu_tx_hold_free(tx, dn->dn_object, start, len);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err) {
            dmu_tx_abort(tx);
            return (err);
        }

        dnode_free_range(dn, start, trunc ? -1 : len, tx);

        if (start == 0 && free_dnode) {
            ASSERT(trunc);
            dnode_free(dn, tx);
        }

        length -= end - start;

        dmu_tx_commit(tx);
        end = start;
    }
    return (0);
}

int
dmu_free_long_range(objset_t *os, uint64_t object,
    uint64_t offset, uint64_t length)
{
    dnode_t *dn;
    int err;

    err = dnode_hold(os, object, FTAG, &dn);
    if (err != 0)
        return (err);
    err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
    dnode_rele(dn, FTAG);
    return (err);
}

int
dmu_free_object(objset_t *os, uint64_t object)
{
    dnode_t *dn;
    dmu_tx_t *tx;
    int err;

    err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
        FTAG, &dn);
    if (err != 0)
        return (err);
    if (dn->dn_nlevels == 1) {
        tx = dmu_tx_create(os);
        dmu_tx_hold_bonus(tx, object);
        dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err == 0) {
            dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
            dnode_free(dn, tx);
            dmu_tx_commit(tx);
        } else {
            dmu_tx_abort(tx);
        }
    } else {
        err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
    }
    dnode_rele(dn, FTAG);
    return (err);
}

int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
    dnode_t *dn;
    int err = dnode_hold(os, object, FTAG, &dn);
    if (err)
        return (err);
    ASSERT(offset < UINT64_MAX);
    ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
    dnode_free_range(dn, offset, size, tx);
    dnode_rele(dn, FTAG);
    return (0);
}
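/*
 * Editor's note: illustrative sketch (not original code) of the
 * truncate-to-zero pattern built on dmu_free_long_range(), which chunks
 * the free into multiple transactions internally; the name and guard
 * are hypothetical.
 */
#ifdef DMU_USAGE_EXAMPLES
static int
example_truncate(objset_t *os, uint64_t object)
{
    /*
     * DMU_OBJECT_END as the length frees everything from "offset"
     * to the end of the object; no caller-supplied tx is needed.
     */
    return (dmu_free_long_range(os, object, 0, DMU_OBJECT_END));
}
#endif	/* DMU_USAGE_EXAMPLES */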
int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf, uint32_t flags)
{
    dnode_t *dn;
    dmu_buf_t **dbp;
    int numbufs, err;

    err = dnode_hold(os, object, FTAG, &dn);
    if (err)
        return (err);

    /*
     * Deal with odd block sizes, where there can't be data past the first
     * block.  If we ever do the tail block optimization, we will need to
     * handle that here as well.
     */
    if (dn->dn_maxblkid == 0) {
        int newsz = offset > dn->dn_datablksz ? 0 :
            MIN(size, dn->dn_datablksz - offset);
        bzero((char *)buf + newsz, size - newsz);
        size = newsz;
    }

    while (size > 0) {
        uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
        int i;

        /*
         * NB: we could do this block-at-a-time, but it's nice
         * to be reading in parallel.
         */
        err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
            TRUE, FTAG, &numbufs, &dbp, flags);
        if (err)
            break;

        for (i = 0; i < numbufs; i++) {
            int tocpy;
            int bufoff;
            dmu_buf_t *db = dbp[i];

            ASSERT(size > 0);

            bufoff = offset - db->db_offset;
            tocpy = (int)MIN(db->db_size - bufoff, size);

            bcopy((char *)db->db_data + bufoff, buf, tocpy);

            offset += tocpy;
            size -= tocpy;
            buf = (char *)buf + tocpy;
        }
        dmu_buf_rele_array(dbp, numbufs, FTAG);
    }
    dnode_rele(dn, FTAG);
    return (err);
}

void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
    dmu_buf_t **dbp;
    int numbufs, i;

    if (size == 0)
        return;

    VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
        FALSE, FTAG, &numbufs, &dbp));

    for (i = 0; i < numbufs; i++) {
        int tocpy;
        int bufoff;
        dmu_buf_t *db = dbp[i];

        ASSERT(size > 0);

        bufoff = offset - db->db_offset;
        tocpy = (int)MIN(db->db_size - bufoff, size);

        ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

        if (tocpy == db->db_size)
            dmu_buf_will_fill(db, tx);
        else
            dmu_buf_will_dirty(db, tx);

        bcopy(buf, (char *)db->db_data + bufoff, tocpy);

        if (tocpy == db->db_size)
            dmu_buf_fill_done(db, tx);

        offset += tocpy;
        size -= tocpy;
        buf = (char *)buf + tocpy;
    }
    dmu_buf_rele_array(dbp, numbufs, FTAG);
}

void
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    dmu_tx_t *tx)
{
    dmu_buf_t **dbp;
    int numbufs, i;

    if (size == 0)
        return;

    VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
        FALSE, FTAG, &numbufs, &dbp));

    for (i = 0; i < numbufs; i++) {
        dmu_buf_t *db = dbp[i];

        dmu_buf_will_not_fill(db, tx);
    }
    dmu_buf_rele_array(dbp, numbufs, FTAG);
}

kstat_t *xuio_ksp = NULL;

int
dmu_xuio_init(xuio_t *xuio, int nblk)
{
    dmu_xuio_t *priv;
    uio_t *uio = &xuio->xu_uio;

    uio->uio_iovcnt = nblk;
    uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);

    priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
    priv->cnt = nblk;
    priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
    priv->iovp = uio->uio_iov;
    XUIO_XUZC_PRIV(xuio) = priv;

    if (XUIO_XUZC_RW(xuio) == UIO_READ)
        XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
    else
        XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);

    return (0);
}

void
dmu_xuio_fini(xuio_t *xuio)
{
    dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
    int nblk = priv->cnt;

    kmem_free(priv->iovp, nblk * sizeof (iovec_t));
    kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
    kmem_free(priv, sizeof (dmu_xuio_t));

    if (XUIO_XUZC_RW(xuio) == UIO_READ)
        XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
    else
        XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
}
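/*
 * Editor's note: illustrative sketch (not original code) of the XUIO
 * loaned-buffer lifecycle: dmu_xuio_init() sizes the iovec array,
 * dmu_xuio_add() (defined below) attaches a loaned ARC buffer, and
 * dmu_xuio_fini() tears the bookkeeping down.  Real callers consume or
 * return the loaned buffers before fini; that step is elided here and
 * the guard macro is hypothetical.
 */
#ifdef DMU_USAGE_EXAMPLES
static int
example_xuio_lifecycle(xuio_t *xuio, arc_buf_t *abuf)
{
    int err;

    err = dmu_xuio_init(xuio, 1);
    if (err != 0)
        return (err);

    /* Hand the whole loaned buffer to slot 0 of the xuio. */
    err = dmu_xuio_add(xuio, abuf, 0, arc_buf_size(abuf));

    dmu_xuio_fini(xuio);
    return (err);
}
#endif	/* DMU_USAGE_EXAMPLES */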
int
dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
{
    struct iovec *iov;
    uio_t *uio = &xuio->xu_uio;
    dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
    int i = priv->next++;

    ASSERT(i < priv->cnt);
    ASSERT(off + n <= arc_buf_size(abuf));
    iov = uio->uio_iov + i;
    iov->iov_base = (char *)abuf->b_data + off;
    iov->iov_len = n;
    priv->bufs[i] = abuf;
    return (0);
}

int
dmu_xuio_cnt(xuio_t *xuio)
{
    dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
    return (priv->cnt);
}

arc_buf_t *
dmu_xuio_arcbuf(xuio_t *xuio, int i)
{
    dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

    ASSERT(i < priv->cnt);
    return (priv->bufs[i]);
}

void
dmu_xuio_clear(xuio_t *xuio, int i)
{
    dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

    ASSERT(i < priv->cnt);
    priv->bufs[i] = NULL;
}

static void
xuio_stat_init(void)
{
    xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
        KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
        KSTAT_FLAG_VIRTUAL);
    if (xuio_ksp != NULL) {
        xuio_ksp->ks_data = &xuio_stats;
        kstat_install(xuio_ksp);
    }
}

static void
xuio_stat_fini(void)
{
    if (xuio_ksp != NULL) {
        kstat_delete(xuio_ksp);
        xuio_ksp = NULL;
    }
}

void
xuio_stat_wbuf_copied()
{
    XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}

void
xuio_stat_wbuf_nocopy()
{
    XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
}

#ifdef _KERNEL
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
    dmu_buf_t **dbp;
    int numbufs, i, err;
    xuio_t *xuio = NULL;

    /*
     * NB: we could do this block-at-a-time, but it's nice
     * to be reading in parallel.
     */
    err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
        &numbufs, &dbp);
    if (err)
        return (err);

#ifdef UIO_XUIO
    if (uio->uio_extflg == UIO_XUIO)
        xuio = (xuio_t *)uio;
#endif

    for (i = 0; i < numbufs; i++) {
        int tocpy;
        int bufoff;
        dmu_buf_t *db = dbp[i];

        ASSERT(size > 0);

        bufoff = uio->uio_loffset - db->db_offset;
        tocpy = (int)MIN(db->db_size - bufoff, size);

        if (xuio) {
            dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
            arc_buf_t *dbuf_abuf = dbi->db_buf;
            arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
            err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
            if (!err) {
                uio->uio_resid -= tocpy;
                uio->uio_loffset += tocpy;
            }

            if (abuf == dbuf_abuf)
                XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
            else
                XUIOSTAT_BUMP(xuiostat_rbuf_copied);
        } else {
            err = uiomove((char *)db->db_data + bufoff, tocpy,
                UIO_READ, uio);
        }
        if (err)
            break;

        size -= tocpy;
    }
    dmu_buf_rele_array(dbp, numbufs, FTAG);

    return (err);
}

static int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
    dmu_buf_t **dbp;
    int numbufs;
    int err = 0;
    int i;

    err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
        FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
    if (err)
        return (err);

    for (i = 0; i < numbufs; i++) {
        int tocpy;
        int bufoff;
        dmu_buf_t *db = dbp[i];

        ASSERT(size > 0);

        bufoff = uio->uio_loffset - db->db_offset;
        tocpy = (int)MIN(db->db_size - bufoff, size);
        ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

        if (tocpy == db->db_size)
            dmu_buf_will_fill(db, tx);
        else
            dmu_buf_will_dirty(db, tx);

        /*
         * XXX uiomove could block forever (eg. nfs-backed
         * pages).  There needs to be a uiolockdown() function
         * to lock the pages in memory, so that uiomove won't
         * block.
         */
        err = uiomove((char *)db->db_data + bufoff, tocpy,
            UIO_WRITE, uio);

        if (tocpy == db->db_size)
            dmu_buf_fill_done(db, tx);

        if (err)
            break;

        size -= tocpy;
    }

    dmu_buf_rele_array(dbp, numbufs, FTAG);
    return (err);
}

int
dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
    dnode_t *dn;
    int err;

    if (size == 0)
        return (0);

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    err = dmu_write_uio_dnode(dn, uio, size, tx);
    DB_DNODE_EXIT(db);

    return (err);
}

int
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
    dnode_t *dn;
    int err;

    if (size == 0)
        return (0);

    err = dnode_hold(os, object, FTAG, &dn);
    if (err)
        return (err);

    err = dmu_write_uio_dnode(dn, uio, size, tx);

    dnode_rele(dn, FTAG);

    return (err);
}

#ifdef sun
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    page_t *pp, dmu_tx_t *tx)
{
    dmu_buf_t **dbp;
    int numbufs, i;
    int err;

    if (size == 0)
        return (0);

    err = dmu_buf_hold_array(os, object, offset, size,
        FALSE, FTAG, &numbufs, &dbp);
    if (err)
        return (err);

    for (i = 0; i < numbufs; i++) {
        int tocpy, copied, thiscpy;
        int bufoff;
        dmu_buf_t *db = dbp[i];
        caddr_t va;

        ASSERT(size > 0);
        ASSERT3U(db->db_size, >=, PAGESIZE);

        bufoff = offset - db->db_offset;
        tocpy = (int)MIN(db->db_size - bufoff, size);

        ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

        if (tocpy == db->db_size)
            dmu_buf_will_fill(db, tx);
        else
            dmu_buf_will_dirty(db, tx);

        for (copied = 0; copied < tocpy; copied += PAGESIZE) {
            ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
            thiscpy = MIN(PAGESIZE, tocpy - copied);
            va = zfs_map_page(pp, S_READ);
            bcopy(va, (char *)db->db_data + bufoff, thiscpy);
            zfs_unmap_page(pp, va);
            pp = pp->p_next;
            bufoff += PAGESIZE;
        }

        if (tocpy == db->db_size)
            dmu_buf_fill_done(db, tx);

        offset += tocpy;
        size -= tocpy;
    }
    dmu_buf_rele_array(dbp, numbufs, FTAG);
    return (err);
}
#endif	/* sun */
#endif	/* _KERNEL */

arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
    spa_t *spa;

    DB_GET_SPA(&spa, db);
    return (arc_loan_buf(spa, size));
}

void
dmu_return_arcbuf(arc_buf_t *buf)
{
    arc_return_buf(buf, FTAG);
    VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
}
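/*
 * Editor's note: illustrative sketch (not original code) of the loaned
 * ARC buffer write path: borrow a buffer with dmu_request_arcbuf(), fill
 * it, then donate it via dmu_assign_arcbuf() (defined below) inside an
 * already-assigned transaction.  "fill_data" is a hypothetical
 * placeholder, as is the guard macro.
 */
#ifdef DMU_USAGE_EXAMPLES
static void
example_arcbuf_write(dmu_buf_t *handle, uint64_t offset, int blksz,
    dmu_tx_t *tx, const void *fill_data)
{
    arc_buf_t *abuf = dmu_request_arcbuf(handle, blksz);

    bcopy(fill_data, abuf->b_data, blksz);

    /*
     * On a block-aligned, full-block write the buffer is adopted
     * directly; otherwise dmu_assign_arcbuf() falls back to dmu_write()
     * and returns the loaned buffer itself.
     */
    dmu_assign_arcbuf(handle, offset, abuf, tx);
}
#endif	/* DMU_USAGE_EXAMPLES */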
void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
    dmu_tx_t *tx)
{
    dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
    dnode_t *dn;
    dmu_buf_impl_t *db;
    uint32_t blksz = (uint32_t)arc_buf_size(buf);
    uint64_t blkid;

    DB_DNODE_ENTER(dbuf);
    dn = DB_DNODE(dbuf);
    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    blkid = dbuf_whichblock(dn, offset);
    VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
    rw_exit(&dn->dn_struct_rwlock);
    DB_DNODE_EXIT(dbuf);

    if (offset == db->db.db_offset && blksz == db->db.db_size) {
        dbuf_assign_arcbuf(db, buf, tx);
        dbuf_rele(db, FTAG);
    } else {
        objset_t *os;
        uint64_t object;

        DB_DNODE_ENTER(dbuf);
        dn = DB_DNODE(dbuf);
        os = dn->dn_objset;
        object = dn->dn_object;
        DB_DNODE_EXIT(dbuf);

        dbuf_rele(db, FTAG);
        dmu_write(os, object, offset, blksz, buf->b_data, tx);
        dmu_return_arcbuf(buf);
        XUIOSTAT_BUMP(xuiostat_wbuf_copied);
    }
}

typedef struct {
    dbuf_dirty_record_t *dsa_dr;
    dmu_sync_cb_t *dsa_done;
    zgd_t *dsa_zgd;
    dmu_tx_t *dsa_tx;
} dmu_sync_arg_t;

/* ARGSUSED */
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
    dmu_sync_arg_t *dsa = varg;
    dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
    blkptr_t *bp = zio->io_bp;

    if (zio->io_error == 0) {
        if (BP_IS_HOLE(bp)) {
            /*
             * A block of zeros may compress to a hole, but the
             * block size still needs to be known for replay.
             */
            BP_SET_LSIZE(bp, db->db_size);
        } else {
            ASSERT(BP_GET_LEVEL(bp) == 0);
            bp->blk_fill = 1;
        }
    }
}

static void
dmu_sync_late_arrival_ready(zio_t *zio)
{
    dmu_sync_ready(zio, NULL, zio->io_private);
}

/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
    dmu_sync_arg_t *dsa = varg;
    dbuf_dirty_record_t *dr = dsa->dsa_dr;
    dmu_buf_impl_t *db = dr->dr_dbuf;

    mutex_enter(&db->db_mtx);
    ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
    if (zio->io_error == 0) {
        dr->dt.dl.dr_overridden_by = *zio->io_bp;
        dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
        dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
        if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
            BP_ZERO(&dr->dt.dl.dr_overridden_by);
    } else {
        dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
    }
    cv_broadcast(&db->db_changed);
    mutex_exit(&db->db_mtx);

    dsa->dsa_done(dsa->dsa_zgd, zio->io_error);

    kmem_free(dsa, sizeof (*dsa));
}

static void
dmu_sync_late_arrival_done(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;
    dmu_sync_arg_t *dsa = zio->io_private;

    if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
        ASSERT(zio->io_bp->blk_birth == zio->io_txg);
        ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
        zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
    }

    dmu_tx_commit(dsa->dsa_tx);

    dsa->dsa_done(dsa->dsa_zgd, zio->io_error);

    kmem_free(dsa, sizeof (*dsa));
}
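/*
 * Editor's note: illustrative sketch (not original code) of how a ZIL
 * get-data style caller typically interprets dmu_sync() (defined below);
 * the surrounding callback machinery is elided and the names and guard
 * are hypothetical.
 */
#ifdef DMU_USAGE_EXAMPLES
static int
example_handle_dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done,
    zgd_t *zgd)
{
    int err = dmu_sync(pio, txg, done, zgd);

    switch (err) {
    case 0:		/* write issued; "done" will fire later */
    case EEXIST:	/* txg already synced; data already on disk */
    case ENOENT:	/* block was freed; nothing to log */
    case EALREADY:	/* an earlier dmu_sync() already covered it */
        break;
    default:		/* e.g. EIO: caller falls back to txg_wait_synced() */
        break;
    }
    return (err);
}
#endif	/* DMU_USAGE_EXAMPLES */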
static int
dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
    zio_prop_t *zp, zbookmark_t *zb)
{
    dmu_sync_arg_t *dsa;
    dmu_tx_t *tx;

    tx = dmu_tx_create(os);
    dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
    if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
        dmu_tx_abort(tx);
        return (EIO);	/* Make zl_get_data do txg_wait_synced() */
    }

    dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
    dsa->dsa_dr = NULL;
    dsa->dsa_done = done;
    dsa->dsa_zgd = zgd;
    dsa->dsa_tx = tx;

    zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
        zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
        dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
        ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));

    return (0);
}

int
dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
    blkptr_t *bp = zgd->zgd_bp;
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
    objset_t *os = db->db_objset;
    dsl_dataset_t *ds = os->os_dsl_dataset;
    dbuf_dirty_record_t *dr;
    dmu_sync_arg_t *dsa;
    zbookmark_t zb;
    zio_prop_t zp;
    dnode_t *dn;

    ASSERT(pio != NULL);
    ASSERT(BP_IS_HOLE(bp));
    ASSERT(txg != 0);

    SET_BOOKMARK(&zb, ds->ds_object,
        db->db.db_object, db->db_level, db->db_blkid);

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);
    dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
    DB_DNODE_EXIT(db);

    /*
     * If we're frozen (running ziltest), we always need to generate a bp.
     */
    if (txg > spa_freeze_txg(os->os_spa))
        return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));

    /*
     * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
     * and us.  If we determine that this txg is not yet syncing,
     * but it begins to sync a moment later, that's OK because the
     * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
     */
    mutex_enter(&db->db_mtx);

    if (txg <= spa_last_synced_txg(os->os_spa)) {
        /*
         * This txg has already synced.  There's nothing to do.
         */
        mutex_exit(&db->db_mtx);
        return (EEXIST);
    }

    if (txg <= spa_syncing_txg(os->os_spa)) {
        /*
         * This txg is currently syncing, so we can't mess with
         * the dirty record anymore; just write a new log block.
         */
        mutex_exit(&db->db_mtx);
        return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
    }

    dr = db->db_last_dirty;
    while (dr && dr->dr_txg != txg)
        dr = dr->dr_next;

    if (dr == NULL) {
        /*
         * There's no dr for this dbuf, so it must have been freed.
         * There's no need to log writes to freed blocks, so we're done.
         */
        mutex_exit(&db->db_mtx);
        return (ENOENT);
    }

    ASSERT(dr->dr_txg == txg);
    if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
        dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
        /*
         * We have already issued a sync write for this buffer,
         * or this buffer has already been synced.  It could not
         * have been dirtied since, or we would have cleared the state.
         */
        mutex_exit(&db->db_mtx);
        return (EALREADY);
    }

    ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
    dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
    mutex_exit(&db->db_mtx);

    dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
    dsa->dsa_dr = dr;
    dsa->dsa_done = done;
    dsa->dsa_zgd = zgd;
    dsa->dsa_tx = NULL;

    zio_nowait(arc_write(pio, os->os_spa, txg,
        bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
        dmu_sync_ready, dmu_sync_done, dsa,
        ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));

    return (0);
}

int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
    dmu_tx_t *tx)
{
    dnode_t *dn;
    int err;

    err = dnode_hold(os, object, FTAG, &dn);
    if (err)
        return (err);
    err = dnode_set_blksz(dn, size, ibs, tx);
    dnode_rele(dn, FTAG);
    return (err);
}

void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
    dmu_tx_t *tx)
{
    dnode_t *dn;

    /* XXX assumes dnode_hold will not get an i/o error */
    (void) dnode_hold(os, object, FTAG, &dn);
    ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
    dn->dn_checksum = checksum;
    dnode_setdirty(dn, tx);
    dnode_rele(dn, FTAG);
}

void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx)
{
    dnode_t *dn;

    /* XXX assumes dnode_hold will not get an i/o error */
    (void) dnode_hold(os, object, FTAG, &dn);
    ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
    dn->dn_compress = compress;
    dnode_setdirty(dn, tx);
    dnode_rele(dn, FTAG);
}

int zfs_mdcomp_disable = 0;
TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW,
    &zfs_mdcomp_disable, 0, "Disable metadata compression");

void
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
{
    dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
    boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
        (wp & WP_SPILL));
    enum zio_checksum checksum = os->os_checksum;
    enum zio_compress compress = os->os_compress;
    enum zio_checksum dedup_checksum = os->os_dedup_checksum;
    boolean_t dedup;
    boolean_t dedup_verify = os->os_dedup_verify;
    int copies = os->os_copies;

    /*
     * Determine checksum setting.
     */
    if (ismd) {
        /*
         * Metadata always gets checksummed.  If the data
         * checksum is multi-bit correctable, and it's not a
         * ZBT-style checksum, then it's suitable for metadata
         * as well.  Otherwise, the metadata checksum defaults
         * to fletcher4.
         */
        if (zio_checksum_table[checksum].ci_correctable < 1 ||
            zio_checksum_table[checksum].ci_eck)
            checksum = ZIO_CHECKSUM_FLETCHER_4;
    } else {
        checksum = zio_checksum_select(dn->dn_checksum, checksum);
    }

    /*
     * Determine compression setting.
     */
    if (ismd) {
        /*
         * XXX -- we should design a compression algorithm
         * that specializes in arrays of bps.
         */
        compress = zfs_mdcomp_disable ?
            ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB;
    } else {
        compress = zio_compress_select(dn->dn_compress, compress);
    }

    /*
     * Determine dedup setting.  If we are in dmu_sync(), we won't
     * actually dedup now because that's all done in syncing context;
     * but we do want to use the dedup checksum.  If the checksum is not
     * strong enough to ensure unique signatures, force dedup_verify.
     */
    dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
    if (dedup) {
        checksum = dedup_checksum;
        if (!zio_checksum_table[checksum].ci_dedup)
            dedup_verify = 1;
    }

    if (wp & WP_DMU_SYNC)
        dedup = 0;

    if (wp & WP_NOFILL) {
        ASSERT(!ismd && level == 0);
        checksum = ZIO_CHECKSUM_OFF;
        compress = ZIO_COMPRESS_OFF;
        dedup = B_FALSE;
    }

    zp->zp_checksum = checksum;
    zp->zp_compress = compress;
    zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
    zp->zp_level = level;
    zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
    zp->zp_dedup = dedup;
    zp->zp_dedup_verify = dedup && dedup_verify;
}

int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
    dnode_t *dn;
    int i, err;

    err = dnode_hold(os, object, FTAG, &dn);
    if (err)
        return (err);
    /*
     * Sync any current changes before
     * we go trundling through the block pointers.
     */
    for (i = 0; i < TXG_SIZE; i++) {
        if (list_link_active(&dn->dn_dirty_link[i]))
            break;
    }
    if (i != TXG_SIZE) {
        dnode_rele(dn, FTAG);
        txg_wait_synced(dmu_objset_pool(os), 0);
        err = dnode_hold(os, object, FTAG, &dn);
        if (err)
            return (err);
    }

    err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
    dnode_rele(dn, FTAG);

    return (err);
}

void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
    dnode_phys_t *dnp;

    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    mutex_enter(&dn->dn_mtx);

    dnp = dn->dn_phys;

    doi->doi_data_block_size = dn->dn_datablksz;
    doi->doi_metadata_block_size = dn->dn_indblkshift ?
        1ULL << dn->dn_indblkshift : 0;
    doi->doi_type = dn->dn_type;
    doi->doi_bonus_type = dn->dn_bonustype;
    doi->doi_bonus_size = dn->dn_bonuslen;
    doi->doi_indirection = dn->dn_nlevels;
    doi->doi_checksum = dn->dn_checksum;
    doi->doi_compress = dn->dn_compress;
    doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
    doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
    doi->doi_fill_count = 0;
    for (int i = 0; i < dnp->dn_nblkptr; i++)
        doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;

    mutex_exit(&dn->dn_mtx);
    rw_exit(&dn->dn_struct_rwlock);
}

int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
    dnode_t *dn;
    int err = dnode_hold(os, object, FTAG, &dn);

    if (err)
        return (err);

    if (doi != NULL)
        dmu_object_info_from_dnode(dn, doi);

    dnode_rele(dn, FTAG);
    return (0);
}

void
dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

    DB_DNODE_ENTER(db);
    dmu_object_info_from_dnode(DB_DNODE(db), doi);
    DB_DNODE_EXIT(db);
}

void
dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
    u_longlong_t *nblk512)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    dnode_t *dn;

    DB_DNODE_ENTER(db);
    dn = DB_DNODE(db);

    *blksize = dn->dn_datablksz;
    /* add 1 for dnode space */
    *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
        SPA_MINBLOCKSHIFT) + 1;
    DB_DNODE_EXIT(db);
}

void
byteswap_uint64_array(void *vbuf, size_t size)
{
    uint64_t *buf = vbuf;
    size_t count = size >> 3;
    int i;

    ASSERT((size & 7) == 0);

    for (i = 0; i < count; i++)
        buf[i] = BSWAP_64(buf[i]);
}

void
byteswap_uint32_array(void *vbuf, size_t size)
{
    uint32_t *buf = vbuf;
    size_t count = size >> 2;
    int i;

    ASSERT((size & 3) == 0);

    for (i = 0; i < count; i++)
        buf[i] = BSWAP_32(buf[i]);
}

void
byteswap_uint16_array(void *vbuf, size_t size)
{
    uint16_t *buf = vbuf;
    size_t count = size >> 1;
    int i;

    ASSERT((size & 1) == 0);

    for (i = 0; i < count; i++)
        buf[i] = BSWAP_16(buf[i]);
}

/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
}

void
dmu_init(void)
{
    zfs_dbgmsg_init();
    sa_cache_init();
    xuio_stat_init();
    dmu_objset_init();
    dnode_init();
    dbuf_init();
    zfetch_init();
    l2arc_init();
    arc_init();
}

void
dmu_fini(void)
{
    arc_fini();
    l2arc_fini();
    zfetch_fini();
    dbuf_fini();
    dnode_fini();
    dmu_objset_fini();
    xuio_stat_fini();
    sa_cache_fini();
    zfs_dbgmsg_fini();
}
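/*
 * Editor's note: a final illustrative sketch (not original code) tying the
 * pieces together.  The canonical DMU write sequence creates a transaction,
 * declares the write, assigns the tx to a txg, performs the write, and
 * commits; the function name and guard macro are hypothetical.
 */
#ifdef DMU_USAGE_EXAMPLES
static int
example_tx_write(objset_t *os, uint64_t object, uint64_t offset,
    int size, const void *buf)
{
    dmu_tx_t *tx = dmu_tx_create(os);
    int err;

    dmu_tx_hold_write(tx, object, offset, size);
    err = dmu_tx_assign(tx, TXG_WAIT);
    if (err != 0) {
        dmu_tx_abort(tx);
        return (err);
    }

    dmu_write(os, object, offset, size, buf, tx);
    dmu_tx_commit(tx);
    return (0);
}
#endif	/* DMU_USAGE_EXAMPLES */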