FreeBSD ZFS
The Zettabyte File System
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/cred.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zvol.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/sa.h>
#include <sys/zfs_onexit.h>

/*
 * Needed to close a window in dnode_move() that allows the objset to be
 * freed before it can be safely accessed.
 */
krwlock_t os_lock;

void
dmu_objset_init(void)
{
        rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
}

void
dmu_objset_fini(void)
{
        rw_destroy(&os_lock);
}

spa_t *
dmu_objset_spa(objset_t *os)
{
        return (os->os_spa);
}

zilog_t *
dmu_objset_zil(objset_t *os)
{
        return (os->os_zil);
}

dsl_pool_t *
dmu_objset_pool(objset_t *os)
{
        dsl_dataset_t *ds;

        if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
                return (ds->ds_dir->dd_pool);
        else
                return (spa_get_dsl(os->os_spa));
}

dsl_dataset_t *
dmu_objset_ds(objset_t *os)
{
        return (os->os_dsl_dataset);
}

dmu_objset_type_t
dmu_objset_type(objset_t *os)
{
        return (os->os_phys->os_type);
}

void
dmu_objset_name(objset_t *os, char *buf)
{
        dsl_dataset_name(os->os_dsl_dataset, buf);
}

uint64_t
dmu_objset_id(objset_t *os)
{
        dsl_dataset_t *ds = os->os_dsl_dataset;

        return (ds ? ds->ds_object : 0);
}

uint64_t
dmu_objset_syncprop(objset_t *os)
{
        return (os->os_sync);
}

uint64_t
dmu_objset_logbias(objset_t *os)
{
        return (os->os_logbias);
}

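/*
 * For illustration (hypothetical caller, not in the original source): the
 * accessors above are how consumers inspect a held objset without reaching
 * into objset_t directly.  Assuming "os" is an objset the caller already
 * holds:
 *
 *      spa_t *spa = dmu_objset_spa(os);
 *      if (dmu_objset_type(os) == DMU_OST_ZVOL)
 *              dprintf("zvol in pool %s, logbias=%llu\n",
 *                  spa_name(spa), (u_longlong_t)dmu_objset_logbias(os));
 */
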
static void
checksum_changed_cb(void *arg, uint64_t newval)
{
        objset_t *os = arg;

        /*
         * Inheritance should have been done by now.
         */
        ASSERT(newval != ZIO_CHECKSUM_INHERIT);

        os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}

static void
compression_changed_cb(void *arg, uint64_t newval)
{
        objset_t *os = arg;

        /*
         * Inheritance and range checking should have been done by now.
         */
        ASSERT(newval != ZIO_COMPRESS_INHERIT);

        os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
}

static void
copies_changed_cb(void *arg, uint64_t newval)
{
        objset_t *os = arg;

        /*
         * Inheritance and range checking should have been done by now.
         */
        ASSERT(newval > 0);
        ASSERT(newval <= spa_max_replication(os->os_spa));

        os->os_copies = newval;
}

static void
dedup_changed_cb(void *arg, uint64_t newval)
{
        objset_t *os = arg;
        spa_t *spa = os->os_spa;
        enum zio_checksum checksum;

        /*
         * Inheritance should have been done by now.
         */
        ASSERT(newval != ZIO_CHECKSUM_INHERIT);

        checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);

        os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
        os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
}

static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
        objset_t *os = arg;

        /*
         * Inheritance and range checking should have been done by now.
         */
        ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
            newval == ZFS_CACHE_METADATA);

        os->os_primary_cache = newval;
}

static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
        objset_t *os = arg;

        /*
         * Inheritance and range checking should have been done by now.
         */
        ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
            newval == ZFS_CACHE_METADATA);

        os->os_secondary_cache = newval;
}

static void
sync_changed_cb(void *arg, uint64_t newval)
{
        objset_t *os = arg;

        /*
         * Inheritance and range checking should have been done by now.
         */
        ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
            newval == ZFS_SYNC_DISABLED);

        os->os_sync = newval;
        if (os->os_zil)
                zil_set_sync(os->os_zil, newval);
}

static void
logbias_changed_cb(void *arg, uint64_t newval)
{
        objset_t *os = arg;

        ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
            newval == ZFS_LOGBIAS_THROUGHPUT);
        os->os_logbias = newval;
        if (os->os_zil)
                zil_set_logbias(os->os_zil, newval);
}

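/*
 * For illustration (not in the original source): each *_changed_cb above is
 * wired to its dataset property via dsl_prop_register() in
 * dmu_objset_open_impl() below, e.g.
 *
 *      err = dsl_prop_register(ds, "checksum", checksum_changed_cb, os);
 *
 * The callback fires once during registration (seeding the cached os_*
 * value) and again whenever the property, or an inherited ancestor value,
 * changes, so the hot I/O paths can read os_checksum, os_compress, etc.
 * without doing a property lookup.
 */
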
void
dmu_objset_byteswap(void *buf, size_t size)
{
        objset_phys_t *osp = buf;

        ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
        dnode_byteswap(&osp->os_meta_dnode);
        byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
        osp->os_type = BSWAP_64(osp->os_type);
        osp->os_flags = BSWAP_64(osp->os_flags);
        if (size == sizeof (objset_phys_t)) {
                dnode_byteswap(&osp->os_userused_dnode);
                dnode_byteswap(&osp->os_groupused_dnode);
        }
}

int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    objset_t **osp)
{
        objset_t *os;
        int i, err;

        ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));

        os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
        os->os_dsl_dataset = ds;
        os->os_spa = spa;
        os->os_rootbp = bp;
        if (!BP_IS_HOLE(os->os_rootbp)) {
                uint32_t aflags = ARC_WAIT;
                zbookmark_t zb;
                SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
                    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

                if (DMU_OS_IS_L2CACHEABLE(os))
                        aflags |= ARC_L2CACHE;

                dprintf_bp(os->os_rootbp, "reading %s", "");
                /*
                 * XXX when bprewrite scrub can change the bp,
                 * and this is called from dmu_objset_open_ds_os, the bp
                 * could change, and we'll need a lock.
                 */
                err = dsl_read_nolock(NULL, spa, os->os_rootbp,
                    arc_getbuf_func, &os->os_phys_buf,
                    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
                if (err) {
                        kmem_free(os, sizeof (objset_t));
                        /* convert checksum errors into IO errors */
                        if (err == ECKSUM)
                                err = EIO;
                        return (err);
                }

                /* Increase the blocksize if we are permitted. */
                if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
                    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
                        arc_buf_t *buf = arc_buf_alloc(spa,
                            sizeof (objset_phys_t), &os->os_phys_buf,
                            ARC_BUFC_METADATA);
                        bzero(buf->b_data, sizeof (objset_phys_t));
                        bcopy(os->os_phys_buf->b_data, buf->b_data,
                            arc_buf_size(os->os_phys_buf));
                        (void) arc_buf_remove_ref(os->os_phys_buf,
                            &os->os_phys_buf);
                        os->os_phys_buf = buf;
                }

                os->os_phys = os->os_phys_buf->b_data;
                os->os_flags = os->os_phys->os_flags;
        } else {
                int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
                    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
                os->os_phys_buf = arc_buf_alloc(spa, size,
                    &os->os_phys_buf, ARC_BUFC_METADATA);
                os->os_phys = os->os_phys_buf->b_data;
                bzero(os->os_phys, size);
        }

        /*
         * Note: the changed_cb will be called once before the register
         * func returns, thus changing the checksum/compression from the
         * default (fletcher2/off).  Snapshots don't need to know about
         * checksum/compression/copies.
         */
        if (ds) {
                err = dsl_prop_register(ds, "primarycache",
                    primary_cache_changed_cb, os);
                if (err == 0)
                        err = dsl_prop_register(ds, "secondarycache",
                            secondary_cache_changed_cb, os);
                if (!dsl_dataset_is_snapshot(ds)) {
                        if (err == 0)
                                err = dsl_prop_register(ds, "checksum",
                                    checksum_changed_cb, os);
                        if (err == 0)
                                err = dsl_prop_register(ds, "compression",
                                    compression_changed_cb, os);
                        if (err == 0)
                                err = dsl_prop_register(ds, "copies",
                                    copies_changed_cb, os);
                        if (err == 0)
                                err = dsl_prop_register(ds, "dedup",
                                    dedup_changed_cb, os);
                        if (err == 0)
                                err = dsl_prop_register(ds, "logbias",
                                    logbias_changed_cb, os);
                        if (err == 0)
                                err = dsl_prop_register(ds, "sync",
                                    sync_changed_cb, os);
                }
                if (err) {
                        VERIFY(arc_buf_remove_ref(os->os_phys_buf,
                            &os->os_phys_buf) == 1);
                        kmem_free(os, sizeof (objset_t));
                        return (err);
                }
        } else if (ds == NULL) {
                /* It's the meta-objset. */
                os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
                os->os_compress = ZIO_COMPRESS_LZJB;
                os->os_copies = spa_max_replication(spa);
                os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
                os->os_dedup_verify = 0;
                os->os_logbias = 0;
                os->os_sync = 0;
                os->os_primary_cache = ZFS_CACHE_ALL;
                os->os_secondary_cache = ZFS_CACHE_ALL;
        }

        if (ds == NULL || !dsl_dataset_is_snapshot(ds))
                os->os_zil_header = os->os_phys->os_zil_header;
        os->os_zil = zil_alloc(os, &os->os_zil_header);

        for (i = 0; i < TXG_SIZE; i++) {
                list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
                    offsetof(dnode_t, dn_dirty_link[i]));
                list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
                    offsetof(dnode_t, dn_dirty_link[i]));
        }
        list_create(&os->os_dnodes, sizeof (dnode_t),
            offsetof(dnode_t, dn_link));
        list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));

        mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);

        DMU_META_DNODE(os) = dnode_special_open(os,
            &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
            &os->os_meta_dnode);
        if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
                DMU_USERUSED_DNODE(os) = dnode_special_open(os,
                    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
                    &os->os_userused_dnode);
                DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
                    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
                    &os->os_groupused_dnode);
        }

        /*
         * We should be the only thread trying to do this because we
         * have ds_opening_lock
         */
        if (ds) {
                mutex_enter(&ds->ds_lock);
                ASSERT(ds->ds_objset == NULL);
                ds->ds_objset = os;
                mutex_exit(&ds->ds_lock);
        }

        *osp = os;
        return (0);
}

int
dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
        int err = 0;

        mutex_enter(&ds->ds_opening_lock);
        *osp = ds->ds_objset;
        if (*osp == NULL) {
                err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
                    ds, dsl_dataset_get_blkptr(ds), osp);
        }
        mutex_exit(&ds->ds_opening_lock);
        return (err);
}

/* called from zpl */
int
dmu_objset_hold(const char *name, void *tag, objset_t **osp)
{
        dsl_dataset_t *ds;
        int err;

        err = dsl_dataset_hold(name, tag, &ds);
        if (err)
                return (err);

        err = dmu_objset_from_ds(ds, osp);
        if (err)
                dsl_dataset_rele(ds, tag);

        return (err);
}

/* called from zpl */
int
dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, void *tag, objset_t **osp)
{
        dsl_dataset_t *ds;
        int err;

        err = dsl_dataset_own(name, B_FALSE, tag, &ds);
        if (err)
                return (err);

        err = dmu_objset_from_ds(ds, osp);
        if (err) {
                dsl_dataset_disown(ds, tag);
        } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
                dmu_objset_disown(*osp, tag);
                return (EINVAL);
        } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
                dmu_objset_disown(*osp, tag);
                return (EROFS);
        }
        return (err);
}

void
dmu_objset_rele(objset_t *os, void *tag)
{
        dsl_dataset_rele(os->os_dsl_dataset, tag);
}

void
dmu_objset_disown(objset_t *os, void *tag)
{
        dsl_dataset_disown(os->os_dsl_dataset, tag);
}

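/*
 * For illustration (hypothetical caller, not in the original source): the
 * two acquisition styles above must be paired with the matching release.
 * dmu_objset_hold() takes a short-term reference; dmu_objset_own() takes a
 * long-lived, typed reference, e.g. for the life of a mount:
 *
 *      objset_t *os;
 *
 *      if (dmu_objset_hold("tank/fs", FTAG, &os) == 0) {
 *              (inspect os briefly)
 *              dmu_objset_rele(os, FTAG);
 *      }
 *
 *      if (dmu_objset_own("tank/fs", DMU_OST_ZFS, B_FALSE, FTAG, &os) == 0) {
 *              (use os until unmount)
 *              dmu_objset_disown(os, FTAG);
 *      }
 */
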
int
dmu_objset_evict_dbufs(objset_t *os)
{
        dnode_t *dn;

        mutex_enter(&os->os_lock);

        /* process the mdn last, since the other dnodes have holds on it */
        list_remove(&os->os_dnodes, DMU_META_DNODE(os));
        list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));

        /*
         * Find the first dnode with holds.  We have to do this dance
         * because dnode_add_ref() only works if you already have a
         * hold.  If there are no holds then it has no dbufs so OK to
         * skip.
         */
        for (dn = list_head(&os->os_dnodes);
            dn && !dnode_add_ref(dn, FTAG);
            dn = list_next(&os->os_dnodes, dn))
                continue;

        while (dn) {
                dnode_t *next_dn = dn;

                do {
                        next_dn = list_next(&os->os_dnodes, next_dn);
                } while (next_dn && !dnode_add_ref(next_dn, FTAG));

                mutex_exit(&os->os_lock);
                dnode_evict_dbufs(dn);
                dnode_rele(dn, FTAG);
                mutex_enter(&os->os_lock);
                dn = next_dn;
        }
        dn = list_head(&os->os_dnodes);
        mutex_exit(&os->os_lock);
        return (dn != DMU_META_DNODE(os));
}

void
dmu_objset_evict(objset_t *os)
{
        dsl_dataset_t *ds = os->os_dsl_dataset;

        for (int t = 0; t < TXG_SIZE; t++)
                ASSERT(!dmu_objset_is_dirty(os, t));

        if (ds) {
                if (!dsl_dataset_is_snapshot(ds)) {
                        VERIFY(0 == dsl_prop_unregister(ds, "checksum",
                            checksum_changed_cb, os));
                        VERIFY(0 == dsl_prop_unregister(ds, "compression",
                            compression_changed_cb, os));
                        VERIFY(0 == dsl_prop_unregister(ds, "copies",
                            copies_changed_cb, os));
                        VERIFY(0 == dsl_prop_unregister(ds, "dedup",
                            dedup_changed_cb, os));
                        VERIFY(0 == dsl_prop_unregister(ds, "logbias",
                            logbias_changed_cb, os));
                        VERIFY(0 == dsl_prop_unregister(ds, "sync",
                            sync_changed_cb, os));
                }
                VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
                    primary_cache_changed_cb, os));
                VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
                    secondary_cache_changed_cb, os));
        }

        if (os->os_sa)
                sa_tear_down(os);

        /*
         * We should need only a single pass over the dnode list, since
         * nothing can be added to the list at this point.
         */
        (void) dmu_objset_evict_dbufs(os);

        dnode_special_close(&os->os_meta_dnode);
        if (DMU_USERUSED_DNODE(os)) {
                dnode_special_close(&os->os_userused_dnode);
                dnode_special_close(&os->os_groupused_dnode);
        }
        zil_free(os->os_zil);

        ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

        VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);

        /*
         * This is a barrier to prevent the objset from going away in
         * dnode_move() until we can safely ensure that the objset is still in
         * use.  We consider the objset valid before the barrier and invalid
         * after the barrier.
         */
        rw_enter(&os_lock, RW_READER);
        rw_exit(&os_lock);

        mutex_destroy(&os->os_lock);
        mutex_destroy(&os->os_obj_lock);
        mutex_destroy(&os->os_user_ptr_lock);
        kmem_free(os, sizeof (objset_t));
}

timestruc_t
dmu_objset_snap_cmtime(objset_t *os)
{
        return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}

/* called from dsl for meta-objset */
objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, dmu_tx_t *tx)
{
        objset_t *os;
        dnode_t *mdn;

        ASSERT(dmu_tx_is_syncing(tx));
        if (ds != NULL)
                VERIFY(0 == dmu_objset_from_ds(ds, &os));
        else
                VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os));

        mdn = DMU_META_DNODE(os);

        dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
            DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);

        /*
         * We don't want to have to increase the meta-dnode's nlevels
         * later, because then we could do it in quiescing context while
         * we are also accessing it in open context.
         *
         * This precaution is not necessary for the MOS (ds == NULL),
         * because the MOS is only updated in syncing context.
         * This is most fortunate: the MOS is the only objset that
         * needs to be synced multiple times as spa_sync() iterates
         * to convergence, so minimizing its dn_nlevels matters.
         */
        if (ds != NULL) {
                int levels = 1;

                /*
                 * Determine the number of levels necessary for the meta-dnode
                 * to contain DN_MAX_OBJECT dnodes.
                 */
                while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
                    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
                    DN_MAX_OBJECT * sizeof (dnode_phys_t))
                        levels++;

                mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
                    mdn->dn_nlevels = levels;
        }

        ASSERT(type != DMU_OST_NONE);
        ASSERT(type != DMU_OST_ANY);
        ASSERT(type < DMU_OST_NUMTYPES);
        os->os_phys->os_type = type;
        if (dmu_objset_userused_enabled(os)) {
                os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
                os->os_flags = os->os_phys->os_flags;
        }

        dsl_dataset_dirty(ds, tx);

        return (os);
}

struct oscarg {
        void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
        void *userarg;
        dsl_dataset_t *clone_origin;
        const char *lastname;
        dmu_objset_type_t type;
        uint64_t flags;
        cred_t *cr;
};

/*ARGSUSED*/
static int
dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dir_t *dd = arg1;
        struct oscarg *oa = arg2;
        objset_t *mos = dd->dd_pool->dp_meta_objset;
        int err;
        uint64_t ddobj;

        err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
            oa->lastname, sizeof (uint64_t), 1, &ddobj);
        if (err != ENOENT)
                return (err ? err : EEXIST);

        if (oa->clone_origin != NULL) {
                /* You can't clone across pools. */
                if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool)
                        return (EXDEV);

                /* You can only clone snapshots, not the head datasets. */
                if (!dsl_dataset_is_snapshot(oa->clone_origin))
                        return (EINVAL);
        }

        return (0);
}

static void
dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dir_t *dd = arg1;
        spa_t *spa = dd->dd_pool->dp_spa;
        struct oscarg *oa = arg2;
        uint64_t obj;

        ASSERT(dmu_tx_is_syncing(tx));

        obj = dsl_dataset_create_sync(dd, oa->lastname,
            oa->clone_origin, oa->flags, oa->cr, tx);

        if (oa->clone_origin == NULL) {
                dsl_pool_t *dp = dd->dd_pool;
                dsl_dataset_t *ds;
                blkptr_t *bp;
                objset_t *os;

                VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
                bp = dsl_dataset_get_blkptr(ds);
                ASSERT(BP_IS_HOLE(bp));

                os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx);

                if (oa->userfunc)
                        oa->userfunc(os, oa->userarg, oa->cr, tx);
                dsl_dataset_rele(ds, FTAG);
        }

        spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj);
}

int
dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
{
        dsl_dir_t *pdd;
        const char *tail;
        int err = 0;
        struct oscarg oa = { 0 };

        ASSERT(strchr(name, '@') == NULL);
        err = dsl_dir_open(name, FTAG, &pdd, &tail);
        if (err)
                return (err);
        if (tail == NULL) {
                dsl_dir_close(pdd, FTAG);
                return (EEXIST);
        }

        oa.userfunc = func;
        oa.userarg = arg;
        oa.lastname = tail;
        oa.type = type;
        oa.flags = flags;
        oa.cr = CRED();

        err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
            dmu_objset_create_sync, pdd, &oa, 5);
        dsl_dir_close(pdd, FTAG);
        return (err);
}

int
dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
{
        dsl_dir_t *pdd;
        const char *tail;
        int err = 0;
        struct oscarg oa = { 0 };

        ASSERT(strchr(name, '@') == NULL);
        err = dsl_dir_open(name, FTAG, &pdd, &tail);
        if (err)
                return (err);
        if (tail == NULL) {
                dsl_dir_close(pdd, FTAG);
                return (EEXIST);
        }

        oa.lastname = tail;
        oa.clone_origin = clone_origin;
        oa.flags = flags;
        oa.cr = CRED();

        err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
            dmu_objset_create_sync, pdd, &oa, 5);
        dsl_dir_close(pdd, FTAG);
        return (err);
}

int
dmu_objset_destroy(const char *name, boolean_t defer)
{
        dsl_dataset_t *ds;
        int error;

        error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
        if (error == 0) {
                error = dsl_dataset_destroy(ds, FTAG, defer);
                /* dsl_dataset_destroy() closes the ds. */
        }

        return (error);
}

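/*
 * For illustration (hypothetical caller, not in the original source):
 * dmu_objset_create() above runs the check/sync pair as a DSL sync task;
 * the optional callback lets the caller initialize the new objset in the
 * same transaction that creates it, e.g.:
 *
 *      static void
 *      my_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 *      {
 *              (allocate the objset's root objects under tx)
 *      }
 *
 *      err = dmu_objset_create("tank/newfs", DMU_OST_ZFS, 0,
 *          my_create_cb, NULL);
 */
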
*/ 00797 } 00798 00799 return (error); 00800 } 00801 00802 struct snaparg { 00803 dsl_sync_task_group_t *dstg; 00804 char *snapname; 00805 char *htag; 00806 char failed[MAXPATHLEN]; 00807 boolean_t recursive; 00808 boolean_t needsuspend; 00809 boolean_t temporary; 00810 nvlist_t *props; 00811 struct dsl_ds_holdarg *ha; /* only needed in the temporary case */ 00812 dsl_dataset_t *newds; 00813 }; 00814 00815 static int 00816 snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) 00817 { 00818 objset_t *os = arg1; 00819 struct snaparg *sn = arg2; 00820 int error; 00821 00822 /* The props have already been checked by zfs_check_userprops(). */ 00823 00824 error = dsl_dataset_snapshot_check(os->os_dsl_dataset, 00825 sn->snapname, tx); 00826 if (error) 00827 return (error); 00828 00829 if (sn->temporary) { 00830 /* 00831 * Ideally we would just call 00832 * dsl_dataset_user_hold_check() and 00833 * dsl_dataset_destroy_check() here. However the 00834 * dataset we want to hold and destroy is the snapshot 00835 * that we just confirmed we can create, but it won't 00836 * exist until after these checks are run. Do any 00837 * checks we can here and if more checks are added to 00838 * those routines in the future, similar checks may be 00839 * necessary here. 00840 */ 00841 if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) 00842 return (ENOTSUP); 00843 /* 00844 * Not checking number of tags because the tag will be 00845 * unique, as it will be the only tag. 00846 */ 00847 if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) 00848 return (E2BIG); 00849 00850 sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 00851 sn->ha->temphold = B_TRUE; 00852 sn->ha->htag = sn->htag; 00853 } 00854 return (error); 00855 } 00856 00857 static void 00858 snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) 00859 { 00860 objset_t *os = arg1; 00861 dsl_dataset_t *ds = os->os_dsl_dataset; 00862 struct snaparg *sn = arg2; 00863 00864 dsl_dataset_snapshot_sync(ds, sn->snapname, tx); 00865 00866 if (sn->props) { 00867 dsl_props_arg_t pa; 00868 pa.pa_props = sn->props; 00869 pa.pa_source = ZPROP_SRC_LOCAL; 00870 dsl_props_set_sync(ds->ds_prev, &pa, tx); 00871 } 00872 00873 if (sn->temporary) { 00874 struct dsl_ds_destroyarg da; 00875 00876 dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx); 00877 kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg)); 00878 sn->ha = NULL; 00879 sn->newds = ds->ds_prev; 00880 00881 da.ds = ds->ds_prev; 00882 da.defer = B_TRUE; 00883 dsl_dataset_destroy_sync(&da, FTAG, tx); 00884 } 00885 } 00886 00887 static int 00888 dmu_objset_snapshot_one(const char *name, void *arg) 00889 { 00890 struct snaparg *sn = arg; 00891 objset_t *os; 00892 int err; 00893 char *cp; 00894 00895 /* 00896 * If the objset starts with a '%', then ignore it unless it was 00897 * explicitly named (ie, not recursive). These hidden datasets 00898 * are always inconsistent, and by not opening them here, we can 00899 * avoid a race with dsl_dir_destroy_check(). 00900 */ 00901 cp = strrchr(name, '/'); 00902 if (cp && cp[1] == '%' && sn->recursive) 00903 return (0); 00904 00905 (void) strcpy(sn->failed, name); 00906 00907 /* 00908 * Check permissions if we are doing a recursive snapshot. 
static int
dmu_objset_snapshot_one(const char *name, void *arg)
{
        struct snaparg *sn = arg;
        objset_t *os;
        int err;
        char *cp;

        /*
         * If the objset starts with a '%', then ignore it unless it was
         * explicitly named (ie, not recursive).  These hidden datasets
         * are always inconsistent, and by not opening them here, we can
         * avoid a race with dsl_dir_destroy_check().
         */
        cp = strrchr(name, '/');
        if (cp && cp[1] == '%' && sn->recursive)
                return (0);

        (void) strcpy(sn->failed, name);

        /*
         * Check permissions if we are doing a recursive snapshot.  The
         * permission checks for the starting dataset have already been
         * performed in zfs_secpolicy_snapshot().
         */
        if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED())))
                return (err);

        err = dmu_objset_hold(name, sn, &os);
        if (err != 0)
                return (err);

        /*
         * If the objset is in an inconsistent state (eg, in the process
         * of being destroyed), don't snapshot it.  As with %hidden
         * datasets, we return EBUSY if this name was explicitly
         * requested (ie, not recursive), and otherwise ignore it.
         */
        if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
                dmu_objset_rele(os, sn);
                return (sn->recursive ? 0 : EBUSY);
        }

        if (sn->needsuspend) {
                err = zil_suspend(dmu_objset_zil(os));
                if (err) {
                        dmu_objset_rele(os, sn);
                        return (err);
                }
        }
        dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync,
            os, sn, 3);

        return (0);
}

int
dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
    nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd)
{
        dsl_sync_task_t *dst;
        struct snaparg sn;
        spa_t *spa;
        minor_t minor;
        int err;

        (void) strcpy(sn.failed, fsname);

        err = spa_open(fsname, &spa, FTAG);
        if (err)
                return (err);

        if (temporary) {
                if (cleanup_fd < 0) {
                        spa_close(spa, FTAG);
                        return (EINVAL);
                }
                if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) {
                        spa_close(spa, FTAG);
                        return (err);
                }
        }

        sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
        sn.snapname = snapname;
        sn.htag = tag;
        sn.props = props;
        sn.recursive = recursive;
        sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
        sn.temporary = temporary;
        sn.ha = NULL;
        sn.newds = NULL;

        if (recursive) {
                err = dmu_objset_find(fsname,
                    dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
        } else {
                err = dmu_objset_snapshot_one(fsname, &sn);
        }

        if (err == 0)
                err = dsl_sync_task_group_wait(sn.dstg);

        for (dst = list_head(&sn.dstg->dstg_tasks); dst;
            dst = list_next(&sn.dstg->dstg_tasks, dst)) {
                objset_t *os = dst->dst_arg1;
                dsl_dataset_t *ds = os->os_dsl_dataset;
                if (dst->dst_err) {
                        dsl_dataset_name(ds, sn.failed);
                } else if (temporary) {
                        dsl_register_onexit_hold_cleanup(sn.newds, tag, minor);
                }
                if (sn.needsuspend)
                        zil_resume(dmu_objset_zil(os));
#ifdef __FreeBSD__
#ifdef _KERNEL
                if (dst->dst_err == 0 && dmu_objset_type(os) == DMU_OST_ZVOL) {
                        char name[MAXNAMELEN];

                        dmu_objset_name(os, name);
                        strlcat(name, "@", sizeof(name));
                        strlcat(name, snapname, sizeof(name));
                        zvol_create_minors(name);
                }
#endif
#endif
                dmu_objset_rele(os, &sn);
        }

        if (err)
                (void) strcpy(fsname, sn.failed);
        if (temporary)
                zfs_onexit_fd_rele(cleanup_fd);
        dsl_sync_task_group_destroy(sn.dstg);
        spa_close(spa, FTAG);
        return (err);
}

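/*
 * For illustration (hypothetical caller, not in the original source): a
 * recursive snapshot of a filesystem and all of its children:
 *
 *      err = dmu_objset_snapshot(fsname, snapname, NULL, NULL,
 *          B_TRUE, B_FALSE, -1);
 *
 * where fsname is e.g. "tank/fs" and snapname is e.g. "backup".  Every
 * per-dataset snapshot is queued into the single dsl_sync_task_group, so
 * all of them are processed in the same syncing txg; on failure the name
 * of the offending dataset is copied back to the caller via sn.failed.
 */
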
static void
dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
{
        dnode_t *dn;

        while (dn = list_head(list)) {
                ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
                ASSERT(dn->dn_dbuf->db_data_pending);
                /*
                 * Initialize dn_zio outside dnode_sync() because the
                 * meta-dnode needs to set it outside dnode_sync().
                 */
                dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
                ASSERT(dn->dn_zio);

                ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
                list_remove(list, dn);

                if (newlist) {
                        (void) dnode_add_ref(dn, newlist);
                        list_insert_tail(newlist, dn);
                }

                dnode_sync(dn, tx);
        }
}

/* ARGSUSED */
static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
        blkptr_t *bp = zio->io_bp;
        objset_t *os = arg;
        dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;

        ASSERT(bp == os->os_rootbp);
        ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
        ASSERT(BP_GET_LEVEL(bp) == 0);

        /*
         * Update rootbp fill count: it should be the number of objects
         * allocated in the object set (not counting the "special"
         * objects that are stored in the objset_phys_t -- the meta
         * dnode and user/group accounting objects).
         */
        bp->blk_fill = 0;
        for (int i = 0; i < dnp->dn_nblkptr; i++)
                bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
}

/* ARGSUSED */
static void
dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
{
        blkptr_t *bp = zio->io_bp;
        blkptr_t *bp_orig = &zio->io_bp_orig;
        objset_t *os = arg;

        if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
                ASSERT(BP_EQUAL(bp, bp_orig));
        } else {
                dsl_dataset_t *ds = os->os_dsl_dataset;
                dmu_tx_t *tx = os->os_synctx;

                (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
                dsl_dataset_block_born(ds, bp, tx);
        }
}

/* called from dsl */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
        int txgoff;
        zbookmark_t zb;
        zio_prop_t zp;
        zio_t *zio;
        list_t *list;
        list_t *newlist = NULL;
        dbuf_dirty_record_t *dr;

        dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);

        ASSERT(dmu_tx_is_syncing(tx));
        /* XXX the write_done callback should really give us the tx... */
        os->os_synctx = tx;

        if (os->os_dsl_dataset == NULL) {
                /*
                 * This is the MOS.  If we have upgraded,
                 * spa_max_replication() could change, so reset
                 * os_copies here.
                 */
                os->os_copies = spa_max_replication(os->os_spa);
        }

        /*
         * Create the root block IO
         */
        SET_BOOKMARK(&zb, os->os_dsl_dataset ?
            os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
            ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
        VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf,
            os->os_rootbp, os->os_spa, &zb));

        dmu_write_policy(os, NULL, 0, 0, &zp);

        zio = arc_write(pio, os->os_spa, tx->tx_txg,
            os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
            dmu_objset_write_ready, dmu_objset_write_done, os,
            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

        /*
         * Sync special dnodes - the parent IO for the sync is the root block
         */
        DMU_META_DNODE(os)->dn_zio = zio;
        dnode_sync(DMU_META_DNODE(os), tx);

        os->os_phys->os_flags = os->os_flags;

        if (DMU_USERUSED_DNODE(os) &&
            DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
                DMU_USERUSED_DNODE(os)->dn_zio = zio;
                dnode_sync(DMU_USERUSED_DNODE(os), tx);
                DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
                dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
        }

        txgoff = tx->tx_txg & TXG_MASK;

        if (dmu_objset_userused_enabled(os)) {
                newlist = &os->os_synced_dnodes;
                /*
                 * We must create the list here because it uses the
                 * dn_dirty_link[] of this txg.
                 */
                list_create(newlist, sizeof (dnode_t),
                    offsetof(dnode_t, dn_dirty_link[txgoff]));
        }

        dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
        dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);

        list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
        while (dr = list_head(list)) {
                ASSERT(dr->dr_dbuf->db_level == 0);
                list_remove(list, dr);
                if (dr->dr_zio)
                        zio_nowait(dr->dr_zio);
        }
        /*
         * Free intent log blocks up to this tx.
         */
        zil_sync(os->os_zil, tx);
        os->os_phys->os_zil_header = os->os_zil_header;
        zio_nowait(zio);
}

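/*
 * Note, for illustration (not in the original source): dmu_objset_sync()
 * above uses the usual arc_write() callback split.
 * dmu_objset_write_ready() runs once the root block pointer is filled in
 * (to fix up blk_fill), and dmu_objset_write_done() runs at I/O completion
 * (to do the block-born/block-kill accounting against the dataset).  The
 * dnode writes issued by dnode_sync() hang off the same zio tree through
 * dn_zio, so "zio" completes only once the whole objset has been written.
 */
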
boolean_t
dmu_objset_is_dirty(objset_t *os, uint64_t txg)
{
        return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
            !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
}

static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];

void
dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
{
        used_cbs[ost] = cb;
}

boolean_t
dmu_objset_userused_enabled(objset_t *os)
{
        return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
            used_cbs[os->os_phys->os_type] != NULL &&
            DMU_USERUSED_DNODE(os) != NULL);
}

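/*
 * For illustration (hypothetical callback, not in the original source): an
 * objset type opts in to user/group accounting by registering a callback
 * that derives the owning uid/gid from a dnode's bonus (or spill) data:
 *
 *      static int
 *      my_used_cb(dmu_object_type_t bonustype, void *data,
 *          uint64_t *userp, uint64_t *groupp)
 *      {
 *              (decode *userp and *groupp from data)
 *              return (0);
 *      }
 *
 *      dmu_objset_register_type(DMU_OST_ZFS, my_used_cb);
 *
 * dmu_objset_userquota_get_ids() below invokes the registered callback
 * with exactly this argument list.
 */
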
static void
do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
    uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
{
        if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
                int64_t delta = DNODE_SIZE + used;
                if (subtract)
                        delta = -delta;
                VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
                    user, delta, tx));
                VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
                    group, delta, tx));
        }
}

void
dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
{
        dnode_t *dn;
        list_t *list = &os->os_synced_dnodes;

        ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));

        while (dn = list_head(list)) {
                int flags;
                ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
                ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
                    dn->dn_phys->dn_flags &
                    DNODE_FLAG_USERUSED_ACCOUNTED);

                /* Allocate the user/groupused objects if necessary. */
                if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
                        VERIFY(0 == zap_create_claim(os,
                            DMU_USERUSED_OBJECT,
                            DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
                        VERIFY(0 == zap_create_claim(os,
                            DMU_GROUPUSED_OBJECT,
                            DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
                }

                /*
                 * We intentionally modify the zap object even if the
                 * net delta is zero.  Otherwise
                 * the block of the zap obj could be shared between
                 * datasets but need to be different between them after
                 * a bprewrite.
                 */

                flags = dn->dn_id_flags;
                ASSERT(flags);
                if (flags & DN_ID_OLD_EXIST) {
                        do_userquota_update(os, dn->dn_oldused,
                            dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid,
                            B_TRUE, tx);
                }
                if (flags & DN_ID_NEW_EXIST) {
                        do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
                            dn->dn_phys->dn_flags, dn->dn_newuid,
                            dn->dn_newgid, B_FALSE, tx);
                }

                mutex_enter(&dn->dn_mtx);
                dn->dn_oldused = 0;
                dn->dn_oldflags = 0;
                if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
                        dn->dn_olduid = dn->dn_newuid;
                        dn->dn_oldgid = dn->dn_newgid;
                        dn->dn_id_flags |= DN_ID_OLD_EXIST;
                        if (dn->dn_bonuslen == 0)
                                dn->dn_id_flags |= DN_ID_CHKED_SPILL;
                        else
                                dn->dn_id_flags |= DN_ID_CHKED_BONUS;
                }
                dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
                mutex_exit(&dn->dn_mtx);

                list_remove(list, dn);
                dnode_rele(dn, list);
        }
}

/*
 * Returns a pointer to the data from which to derive the uid/gid.  If no
 * dirty record exists for the syncing txg, NULL is returned and the
 * uid/gid are assumed to be unchanged.
 */
static void *
dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
        dbuf_dirty_record_t *dr, **drp;
        void *data;

        if (db->db_dirtycnt == 0)
                return (db->db.db_data);  /* Nothing is changing */

        for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
                if (dr->dr_txg == tx->tx_txg)
                        break;

        if (dr == NULL) {
                data = NULL;
        } else {
                dnode_t *dn;

                DB_DNODE_ENTER(dr->dr_dbuf);
                dn = DB_DNODE(dr->dr_dbuf);

                if (dn->dn_bonuslen == 0 &&
                    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
                        data = dr->dt.dl.dr_data->b_data;
                else
                        data = dr->dt.dl.dr_data;

                DB_DNODE_EXIT(dr->dr_dbuf);
        }

        return (data);
}

void
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
{
        objset_t *os = dn->dn_objset;
        void *data = NULL;
        dmu_buf_impl_t *db = NULL;
        uint64_t *user, *group;
        int flags = dn->dn_id_flags;
        int error;
        boolean_t have_spill = B_FALSE;

        if (!dmu_objset_userused_enabled(dn->dn_objset))
                return;

        if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
            DN_ID_CHKED_SPILL)))
                return;

        if (before && dn->dn_bonuslen != 0)
                data = DN_BONUS(dn->dn_phys);
        else if (!before && dn->dn_bonuslen != 0) {
                if (dn->dn_bonus) {
                        db = dn->dn_bonus;
                        mutex_enter(&db->db_mtx);
                        data = dmu_objset_userquota_find_data(db, tx);
                } else {
                        data = DN_BONUS(dn->dn_phys);
                }
        } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
                int rf = 0;

                if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
                        rf |= DB_RF_HAVESTRUCT;
                error = dmu_spill_hold_by_dnode(dn,
                    rf | DB_RF_MUST_SUCCEED,
                    FTAG, (dmu_buf_t **)&db);
                ASSERT(error == 0);
                mutex_enter(&db->db_mtx);
                data = (before) ? db->db.db_data :
                    dmu_objset_userquota_find_data(db, tx);
                have_spill = B_TRUE;
        } else {
                mutex_enter(&dn->dn_mtx);
                dn->dn_id_flags |= DN_ID_CHKED_BONUS;
                mutex_exit(&dn->dn_mtx);
                return;
        }

        if (before) {
                ASSERT(data);
                user = &dn->dn_olduid;
                group = &dn->dn_oldgid;
        } else if (data) {
                user = &dn->dn_newuid;
                group = &dn->dn_newgid;
        }

        /*
         * Must always call the callback in case the object
         * type has changed and that type isn't an object type to track
         */
        error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
            user, group);

        /*
         * Preserve existing uid/gid when the callback can't determine
         * what the new uid/gid are and the callback returned EEXIST.
         * The EEXIST error tells us to just use the existing uid/gid.
         * If we don't know what the old values are then just assign
         * them to 0, since that is a new file being created.
         */
        if (!before && data == NULL && error == EEXIST) {
                if (flags & DN_ID_OLD_EXIST) {
                        dn->dn_newuid = dn->dn_olduid;
                        dn->dn_newgid = dn->dn_oldgid;
                } else {
                        dn->dn_newuid = 0;
                        dn->dn_newgid = 0;
                }
                error = 0;
        }

        if (db)
                mutex_exit(&db->db_mtx);

        mutex_enter(&dn->dn_mtx);
        if (error == 0 && before)
                dn->dn_id_flags |= DN_ID_OLD_EXIST;
        if (error == 0 && !before)
                dn->dn_id_flags |= DN_ID_NEW_EXIST;

        if (have_spill) {
                dn->dn_id_flags |= DN_ID_CHKED_SPILL;
        } else {
                dn->dn_id_flags |= DN_ID_CHKED_BONUS;
        }
        mutex_exit(&dn->dn_mtx);
        if (have_spill)
                dmu_buf_rele((dmu_buf_t *)db, FTAG);
}

boolean_t
dmu_objset_userspace_present(objset_t *os)
{
        return (os->os_phys->os_flags &
            OBJSET_FLAG_USERACCOUNTING_COMPLETE);
}

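/*
 * For illustration (not in the original source): consumers of the
 * accounting data typically gate on the flag above and trigger the
 * one-time upgrade below when it is missing, e.g.
 *
 *      if (!dmu_objset_userspace_present(os))
 *              err = dmu_objset_userspace_upgrade(os);
 */
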
int
dmu_objset_userspace_upgrade(objset_t *os)
{
        uint64_t obj;
        int err = 0;

        if (dmu_objset_userspace_present(os))
                return (0);
        if (!dmu_objset_userused_enabled(os))
                return (ENOTSUP);
        if (dmu_objset_is_snapshot(os))
                return (EINVAL);

        /*
         * We simply need to mark every object dirty, so that it will be
         * synced out and now accounted.  If this is called
         * concurrently, or if we already did some work before crashing,
         * that's fine, since we track each object's accounted state
         * independently.
         */

        for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
                dmu_tx_t *tx;
                dmu_buf_t *db;
                int objerr;

                if (issig(JUSTLOOKING) && issig(FORREAL))
                        return (EINTR);

                objerr = dmu_bonus_hold(os, obj, FTAG, &db);
                if (objerr)
                        continue;
                tx = dmu_tx_create(os);
                dmu_tx_hold_bonus(tx, obj);
                objerr = dmu_tx_assign(tx, TXG_WAIT);
                if (objerr) {
                        dmu_tx_abort(tx);
                        continue;
                }
                dmu_buf_will_dirty(db, tx);
                dmu_buf_rele(db, FTAG);
                dmu_tx_commit(tx);
        }

        os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
        txg_wait_synced(dmu_objset_pool(os), 0);
        return (0);
}

void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
        dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
            usedobjsp, availobjsp);
}

uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
        return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}

void
dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
{
        stat->dds_type = os->os_phys->os_type;
        if (os->os_dsl_dataset)
                dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
}

void
dmu_objset_stats(objset_t *os, nvlist_t *nv)
{
        ASSERT(os->os_dsl_dataset ||
            os->os_phys->os_type == DMU_OST_META);

        if (os->os_dsl_dataset != NULL)
                dsl_dataset_stats(os->os_dsl_dataset, nv);

        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
            os->os_phys->os_type);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
            dmu_objset_userspace_present(os));
}

int
dmu_objset_is_snapshot(objset_t *os)
{
        if (os->os_dsl_dataset != NULL)
                return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
        else
                return (B_FALSE);
}

int
dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
    boolean_t *conflict)
{
        dsl_dataset_t *ds = os->os_dsl_dataset;
        uint64_t ignored;

        if (ds->ds_phys->ds_snapnames_zapobj == 0)
                return (ENOENT);

        return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
            ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST,
            real, maxlen, conflict));
}

int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
        dsl_dataset_t *ds = os->os_dsl_dataset;
        zap_cursor_t cursor;
        zap_attribute_t attr;

        if (ds->ds_phys->ds_snapnames_zapobj == 0)
                return (ENOENT);

        zap_cursor_init_serialized(&cursor,
            ds->ds_dir->dd_pool->dp_meta_objset,
            ds->ds_phys->ds_snapnames_zapobj, *offp);

        if (zap_cursor_retrieve(&cursor, &attr) != 0) {
                zap_cursor_fini(&cursor);
                return (ENOENT);
        }

        if (strlen(attr.za_name) + 1 > namelen) {
                zap_cursor_fini(&cursor);
                return (ENAMETOOLONG);
        }

        (void) strcpy(name, attr.za_name);
        if (idp)
                *idp = attr.za_first_integer;
        if (case_conflict)
                *case_conflict = attr.za_normalization_conflict;
        zap_cursor_advance(&cursor);
        *offp = zap_cursor_serialize(&cursor);
        zap_cursor_fini(&cursor);

        return (0);
}

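/*
 * For illustration (hypothetical caller, not in the original source):
 * dmu_snapshot_list_next() above and dmu_dir_list_next() below keep their
 * position as a serialized ZAP cursor in *offp, so a caller walks the
 * snapshot list by looping until ENOENT:
 *
 *      uint64_t off = 0;
 *      char snap[MAXNAMELEN];
 *
 *      while (dmu_snapshot_list_next(os, sizeof (snap), snap,
 *          NULL, &off, NULL) == 0)
 *              dprintf("snapshot: %s\n", snap);
 */
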
int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp)
{
        dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
        zap_cursor_t cursor;
        zap_attribute_t attr;

        /* there is no next dir on a snapshot! */
        if (os->os_dsl_dataset->ds_object !=
            dd->dd_phys->dd_head_dataset_obj)
                return (ENOENT);

        zap_cursor_init_serialized(&cursor,
            dd->dd_pool->dp_meta_objset,
            dd->dd_phys->dd_child_dir_zapobj, *offp);

        if (zap_cursor_retrieve(&cursor, &attr) != 0) {
                zap_cursor_fini(&cursor);
                return (ENOENT);
        }

        if (strlen(attr.za_name) + 1 > namelen) {
                zap_cursor_fini(&cursor);
                return (ENAMETOOLONG);
        }

        (void) strcpy(name, attr.za_name);
        if (idp)
                *idp = attr.za_first_integer;
        zap_cursor_advance(&cursor);
        *offp = zap_cursor_serialize(&cursor);
        zap_cursor_fini(&cursor);

        return (0);
}

struct findarg {
        int (*func)(const char *, void *);
        void *arg;
};

/* ARGSUSED */
static int
findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
        struct findarg *fa = arg;
        return (fa->func(dsname, fa->arg));
}

/*
 * Find all objsets under name, and for each, call func(child_name, arg).
 */
int
dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
    int flags)
{
        struct findarg fa;
        fa.func = func;
        fa.arg = arg;
        return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags));
}

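/*
 * For illustration (hypothetical caller, not in the original source):
 * dmu_objset_find() drives a callback over a dataset subtree;
 * dmu_objset_snapshot() above uses it for recursive snapshots.  A walker
 * that visits children and snapshots might look like:
 *
 *      static int
 *      my_visit_cb(const char *name, void *arg)
 *      {
 *              dprintf("visiting %s\n", name);
 *              return (0);
 *      }
 *
 *      err = dmu_objset_find("tank", my_visit_cb, NULL,
 *          DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 */
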
/*
 * Find all objsets under name, calling func on each.
 */
int
dmu_objset_find_spa(spa_t *spa, const char *name,
    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags)
{
        dsl_dir_t *dd;
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
        zap_cursor_t zc;
        zap_attribute_t *attr;
        char *child;
        uint64_t thisobj;
        int err;

        if (name == NULL)
                name = spa_name(spa);
        err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL);
        if (err)
                return (err);

        /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
        if (dd->dd_myname[0] == '$') {
                dsl_dir_close(dd, FTAG);
                return (0);
        }

        thisobj = dd->dd_phys->dd_head_dataset_obj;
        attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
        dp = dd->dd_pool;

        /*
         * Iterate over all children.
         */
        if (flags & DS_FIND_CHILDREN) {
                for (zap_cursor_init(&zc, dp->dp_meta_objset,
                    dd->dd_phys->dd_child_dir_zapobj);
                    zap_cursor_retrieve(&zc, attr) == 0;
                    (void) zap_cursor_advance(&zc)) {
                        ASSERT(attr->za_integer_length == sizeof (uint64_t));
                        ASSERT(attr->za_num_integers == 1);

                        child = kmem_asprintf("%s/%s", name, attr->za_name);
                        err = dmu_objset_find_spa(spa, child, func, arg, flags);
                        strfree(child);
                        if (err)
                                break;
                }
                zap_cursor_fini(&zc);

                if (err) {
                        dsl_dir_close(dd, FTAG);
                        kmem_free(attr, sizeof (zap_attribute_t));
                        return (err);
                }
        }

        /*
         * Iterate over all snapshots.
         */
        if (flags & DS_FIND_SNAPSHOTS) {
                if (!dsl_pool_sync_context(dp))
                        rw_enter(&dp->dp_config_rwlock, RW_READER);
                err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
                if (!dsl_pool_sync_context(dp))
                        rw_exit(&dp->dp_config_rwlock);

                if (err == 0) {
                        uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
                        dsl_dataset_rele(ds, FTAG);

                        for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
                            zap_cursor_retrieve(&zc, attr) == 0;
                            (void) zap_cursor_advance(&zc)) {
                                ASSERT(attr->za_integer_length ==
                                    sizeof (uint64_t));
                                ASSERT(attr->za_num_integers == 1);

                                child = kmem_asprintf("%s@%s",
                                    name, attr->za_name);
                                err = func(spa, attr->za_first_integer,
                                    child, arg);
                                strfree(child);
                                if (err)
                                        break;
                        }
                        zap_cursor_fini(&zc);
                }
        }

        dsl_dir_close(dd, FTAG);
        kmem_free(attr, sizeof (zap_attribute_t));

        if (err)
                return (err);

        /*
         * Apply to self if appropriate.
         */
        err = func(spa, thisobj, name, arg);
        return (err);
}

/* ARGSUSED */
int
dmu_objset_prefetch(const char *name, void *arg)
{
        dsl_dataset_t *ds;

        if (dsl_dataset_hold(name, FTAG, &ds))
                return (0);

        if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
                mutex_enter(&ds->ds_opening_lock);
                if (ds->ds_objset == NULL) {
                        uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
                        zbookmark_t zb;

                        SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
                            ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

                        (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds),
                            &ds->ds_phys->ds_bp, NULL, NULL,
                            ZIO_PRIORITY_ASYNC_READ,
                            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                            &aflags, &zb);
                }
                mutex_exit(&ds->ds_opening_lock);
        }

        dsl_dataset_rele(ds, FTAG);
        return (0);
}

void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
        ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
        os->os_user_ptr = user_ptr;
}

void *
dmu_objset_get_user(objset_t *os)
{
        ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
        return (os->os_user_ptr);
}