FreeBSD ZFS
The Zettabyte File System
|
00001 /* 00002 * CDDL HEADER START 00003 * 00004 * The contents of this file are subject to the terms of the 00005 * Common Development and Distribution License (the "License"). 00006 * You may not use this file except in compliance with the License. 00007 * 00008 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 00009 * or http://www.opensolaris.org/os/licensing. 00010 * See the License for the specific language governing permissions 00011 * and limitations under the License. 00012 * 00013 * When distributing Covered Code, include this CDDL HEADER in each 00014 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 00015 * If applicable, add the following below this CDDL HEADER, with the 00016 * fields enclosed by brackets "[]" replaced with your own identifying 00017 * information: Portions Copyright [yyyy] [name of copyright owner] 00018 * 00019 * CDDL HEADER END 00020 */ 00021 /* 00022 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 00023 * Copyright (c) 2012 by Delphix. All rights reserved. 00024 */ 00025 00026 #include <sys/dsl_dataset.h> 00027 #include <sys/dmu.h> 00028 #include <sys/refcount.h> 00029 #include <sys/zap.h> 00030 #include <sys/zfs_context.h> 00031 #include <sys/dsl_pool.h> 00032 00033 /* 00034 * Deadlist concurrency: 00035 * 00036 * Deadlists can only be modified from the syncing thread. 00037 * 00038 * Except for dsl_deadlist_insert(), it can only be modified with the 00039 * dp_config_rwlock held with RW_WRITER. 00040 * 00041 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can 00042 * be called concurrently, from open context, with the dl_config_rwlock held 00043 * with RW_READER. 00044 * 00045 * Therefore, we only need to provide locking between dsl_deadlist_insert() and 00046 * the accessors, protecting: 00047 * dl_phys->dl_used,comp,uncomp 00048 * and protecting the dl_tree from being loaded. 00049 * The locking is provided by dl_lock. Note that locking on the bpobj_t 00050 * provides its own locking, and dl_oldfmt is immutable. 00051 */ 00052 00053 static int 00054 dsl_deadlist_compare(const void *arg1, const void *arg2) 00055 { 00056 const dsl_deadlist_entry_t *dle1 = arg1; 00057 const dsl_deadlist_entry_t *dle2 = arg2; 00058 00059 if (dle1->dle_mintxg < dle2->dle_mintxg) 00060 return (-1); 00061 else if (dle1->dle_mintxg > dle2->dle_mintxg) 00062 return (+1); 00063 else 00064 return (0); 00065 } 00066 00067 static void 00068 dsl_deadlist_load_tree(dsl_deadlist_t *dl) 00069 { 00070 zap_cursor_t zc; 00071 zap_attribute_t za; 00072 00073 ASSERT(!dl->dl_oldfmt); 00074 if (dl->dl_havetree) 00075 return; 00076 00077 avl_create(&dl->dl_tree, dsl_deadlist_compare, 00078 sizeof (dsl_deadlist_entry_t), 00079 offsetof(dsl_deadlist_entry_t, dle_node)); 00080 for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 00081 zap_cursor_retrieve(&zc, &za) == 0; 00082 zap_cursor_advance(&zc)) { 00083 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 00084 dle->dle_mintxg = strtonum(za.za_name, NULL); 00085 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, 00086 za.za_first_integer)); 00087 avl_add(&dl->dl_tree, dle); 00088 } 00089 zap_cursor_fini(&zc); 00090 dl->dl_havetree = B_TRUE; 00091 } 00092 00093 void 00094 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) 00095 { 00096 dmu_object_info_t doi; 00097 00098 mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); 00099 dl->dl_os = os; 00100 dl->dl_object = object; 00101 VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); 00102 dmu_object_info_from_db(dl->dl_dbuf, &doi); 00103 if (doi.doi_type == DMU_OT_BPOBJ) { 00104 dmu_buf_rele(dl->dl_dbuf, dl); 00105 dl->dl_dbuf = NULL; 00106 dl->dl_oldfmt = B_TRUE; 00107 VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); 00108 return; 00109 } 00110 00111 dl->dl_oldfmt = B_FALSE; 00112 dl->dl_phys = dl->dl_dbuf->db_data; 00113 dl->dl_havetree = B_FALSE; 00114 } 00115 00116 void 00117 dsl_deadlist_close(dsl_deadlist_t *dl) 00118 { 00119 void *cookie = NULL; 00120 dsl_deadlist_entry_t *dle; 00121 00122 if (dl->dl_oldfmt) { 00123 dl->dl_oldfmt = B_FALSE; 00124 bpobj_close(&dl->dl_bpobj); 00125 return; 00126 } 00127 00128 if (dl->dl_havetree) { 00129 while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) 00130 != NULL) { 00131 bpobj_close(&dle->dle_bpobj); 00132 kmem_free(dle, sizeof (*dle)); 00133 } 00134 avl_destroy(&dl->dl_tree); 00135 } 00136 dmu_buf_rele(dl->dl_dbuf, dl); 00137 mutex_destroy(&dl->dl_lock); 00138 dl->dl_dbuf = NULL; 00139 dl->dl_phys = NULL; 00140 } 00141 00142 uint64_t 00143 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) 00144 { 00145 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 00146 return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx)); 00147 return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, 00148 sizeof (dsl_deadlist_phys_t), tx)); 00149 } 00150 00151 void 00152 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) 00153 { 00154 dmu_object_info_t doi; 00155 zap_cursor_t zc; 00156 zap_attribute_t za; 00157 00158 VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); 00159 if (doi.doi_type == DMU_OT_BPOBJ) { 00160 bpobj_free(os, dlobj, tx); 00161 return; 00162 } 00163 00164 for (zap_cursor_init(&zc, os, dlobj); 00165 zap_cursor_retrieve(&zc, &za) == 0; 00166 zap_cursor_advance(&zc)) { 00167 uint64_t obj = za.za_first_integer; 00168 if (obj == dmu_objset_pool(os)->dp_empty_bpobj) 00169 bpobj_decr_empty(os, tx); 00170 else 00171 bpobj_free(os, obj, tx); 00172 } 00173 zap_cursor_fini(&zc); 00174 VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); 00175 } 00176 00177 static void 00178 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 00179 const blkptr_t *bp, dmu_tx_t *tx) 00180 { 00181 if (dle->dle_bpobj.bpo_object == 00182 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 00183 uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); 00184 bpobj_close(&dle->dle_bpobj); 00185 bpobj_decr_empty(dl->dl_os, tx); 00186 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 00187 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 00188 dle->dle_mintxg, obj, tx)); 00189 } 00190 bpobj_enqueue(&dle->dle_bpobj, bp, tx); 00191 } 00192 00193 static void 00194 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 00195 uint64_t obj, dmu_tx_t *tx) 00196 { 00197 if (dle->dle_bpobj.bpo_object != 00198 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 00199 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); 00200 } else { 00201 bpobj_close(&dle->dle_bpobj); 00202 bpobj_decr_empty(dl->dl_os, tx); 00203 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 00204 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 00205 dle->dle_mintxg, obj, tx)); 00206 } 00207 } 00208 00209 void 00210 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) 00211 { 00212 dsl_deadlist_entry_t dle_tofind; 00213 dsl_deadlist_entry_t *dle; 00214 avl_index_t where; 00215 00216 if (dl->dl_oldfmt) { 00217 bpobj_enqueue(&dl->dl_bpobj, bp, tx); 00218 return; 00219 } 00220 00221 dsl_deadlist_load_tree(dl); 00222 00223 dmu_buf_will_dirty(dl->dl_dbuf, tx); 00224 mutex_enter(&dl->dl_lock); 00225 dl->dl_phys->dl_used += 00226 bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); 00227 dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); 00228 dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); 00229 mutex_exit(&dl->dl_lock); 00230 00231 dle_tofind.dle_mintxg = bp->blk_birth; 00232 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 00233 if (dle == NULL) 00234 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 00235 else 00236 dle = AVL_PREV(&dl->dl_tree, dle); 00237 dle_enqueue(dl, dle, bp, tx); 00238 } 00239 00244 void 00245 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 00246 { 00247 uint64_t obj; 00248 dsl_deadlist_entry_t *dle; 00249 00250 if (dl->dl_oldfmt) 00251 return; 00252 00253 dsl_deadlist_load_tree(dl); 00254 00255 dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 00256 dle->dle_mintxg = mintxg; 00257 obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx); 00258 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 00259 avl_add(&dl->dl_tree, dle); 00260 00261 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, 00262 mintxg, obj, tx)); 00263 } 00264 00268 void 00269 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 00270 { 00271 dsl_deadlist_entry_t dle_tofind; 00272 dsl_deadlist_entry_t *dle, *dle_prev; 00273 00274 if (dl->dl_oldfmt) 00275 return; 00276 00277 dsl_deadlist_load_tree(dl); 00278 00279 dle_tofind.dle_mintxg = mintxg; 00280 dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); 00281 dle_prev = AVL_PREV(&dl->dl_tree, dle); 00282 00283 dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); 00284 00285 avl_remove(&dl->dl_tree, dle); 00286 bpobj_close(&dle->dle_bpobj); 00287 kmem_free(dle, sizeof (*dle)); 00288 00289 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); 00290 } 00291 00295 static void 00296 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, 00297 uint64_t mrs_obj, dmu_tx_t *tx) 00298 { 00299 dsl_deadlist_t dl; 00300 dsl_pool_t *dp = dmu_objset_pool(os); 00301 00302 dsl_deadlist_open(&dl, os, dlobj); 00303 if (dl.dl_oldfmt) { 00304 dsl_deadlist_close(&dl); 00305 return; 00306 } 00307 00308 while (mrs_obj != 0) { 00309 dsl_dataset_t *ds; 00310 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); 00311 dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx); 00312 mrs_obj = ds->ds_phys->ds_prev_snap_obj; 00313 dsl_dataset_rele(ds, FTAG); 00314 } 00315 dsl_deadlist_close(&dl); 00316 } 00317 00318 uint64_t 00319 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, 00320 uint64_t mrs_obj, dmu_tx_t *tx) 00321 { 00322 dsl_deadlist_entry_t *dle; 00323 uint64_t newobj; 00324 00325 newobj = dsl_deadlist_alloc(dl->dl_os, tx); 00326 00327 if (dl->dl_oldfmt) { 00328 dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); 00329 return (newobj); 00330 } 00331 00332 dsl_deadlist_load_tree(dl); 00333 00334 for (dle = avl_first(&dl->dl_tree); dle; 00335 dle = AVL_NEXT(&dl->dl_tree, dle)) { 00336 uint64_t obj; 00337 00338 if (dle->dle_mintxg >= maxtxg) 00339 break; 00340 00341 obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx); 00342 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, 00343 dle->dle_mintxg, obj, tx)); 00344 } 00345 return (newobj); 00346 } 00347 00348 void 00349 dsl_deadlist_space(dsl_deadlist_t *dl, 00350 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 00351 { 00352 if (dl->dl_oldfmt) { 00353 VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, 00354 usedp, compp, uncompp)); 00355 return; 00356 } 00357 00358 mutex_enter(&dl->dl_lock); 00359 *usedp = dl->dl_phys->dl_used; 00360 *compp = dl->dl_phys->dl_comp; 00361 *uncompp = dl->dl_phys->dl_uncomp; 00362 mutex_exit(&dl->dl_lock); 00363 } 00364 00371 void 00372 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, 00373 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 00374 { 00375 dsl_deadlist_entry_t *dle; 00376 dsl_deadlist_entry_t dle_tofind; 00377 avl_index_t where; 00378 00379 if (dl->dl_oldfmt) { 00380 VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, 00381 mintxg, maxtxg, usedp, compp, uncompp)); 00382 return; 00383 } 00384 00385 *usedp = *compp = *uncompp = 0; 00386 00387 mutex_enter(&dl->dl_lock); 00388 dsl_deadlist_load_tree(dl); 00389 dle_tofind.dle_mintxg = mintxg; 00390 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 00391 /* 00392 * If we don't find this mintxg, there shouldn't be anything 00393 * after it either. 00394 */ 00395 ASSERT(dle != NULL || 00396 avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); 00397 00398 for (; dle && dle->dle_mintxg < maxtxg; 00399 dle = AVL_NEXT(&dl->dl_tree, dle)) { 00400 uint64_t used, comp, uncomp; 00401 00402 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 00403 &used, &comp, &uncomp)); 00404 00405 *usedp += used; 00406 *compp += comp; 00407 *uncompp += uncomp; 00408 } 00409 mutex_exit(&dl->dl_lock); 00410 } 00411 00412 static void 00413 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, 00414 dmu_tx_t *tx) 00415 { 00416 dsl_deadlist_entry_t dle_tofind; 00417 dsl_deadlist_entry_t *dle; 00418 avl_index_t where; 00419 uint64_t used, comp, uncomp; 00420 bpobj_t bpo; 00421 00422 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 00423 VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); 00424 bpobj_close(&bpo); 00425 00426 dsl_deadlist_load_tree(dl); 00427 00428 dmu_buf_will_dirty(dl->dl_dbuf, tx); 00429 mutex_enter(&dl->dl_lock); 00430 dl->dl_phys->dl_used += used; 00431 dl->dl_phys->dl_comp += comp; 00432 dl->dl_phys->dl_uncomp += uncomp; 00433 mutex_exit(&dl->dl_lock); 00434 00435 dle_tofind.dle_mintxg = birth; 00436 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 00437 if (dle == NULL) 00438 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 00439 dle_enqueue_subobj(dl, dle, obj, tx); 00440 } 00441 00442 static int 00443 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 00444 { 00445 dsl_deadlist_t *dl = arg; 00446 dsl_deadlist_insert(dl, bp, tx); 00447 return (0); 00448 } 00449 00454 void 00455 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) 00456 { 00457 zap_cursor_t zc; 00458 zap_attribute_t za; 00459 dmu_buf_t *bonus; 00460 dsl_deadlist_phys_t *dlp; 00461 dmu_object_info_t doi; 00462 00463 VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); 00464 if (doi.doi_type == DMU_OT_BPOBJ) { 00465 bpobj_t bpo; 00466 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 00467 VERIFY3U(0, ==, bpobj_iterate(&bpo, 00468 dsl_deadlist_insert_cb, dl, tx)); 00469 bpobj_close(&bpo); 00470 return; 00471 } 00472 00473 for (zap_cursor_init(&zc, dl->dl_os, obj); 00474 zap_cursor_retrieve(&zc, &za) == 0; 00475 zap_cursor_advance(&zc)) { 00476 uint64_t mintxg = strtonum(za.za_name, NULL); 00477 dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); 00478 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); 00479 } 00480 zap_cursor_fini(&zc); 00481 00482 VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); 00483 dlp = bonus->db_data; 00484 dmu_buf_will_dirty(bonus, tx); 00485 bzero(dlp, sizeof (*dlp)); 00486 dmu_buf_rele(bonus, FTAG); 00487 } 00488 00492 void 00493 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, 00494 dmu_tx_t *tx) 00495 { 00496 dsl_deadlist_entry_t dle_tofind; 00497 dsl_deadlist_entry_t *dle; 00498 avl_index_t where; 00499 00500 ASSERT(!dl->dl_oldfmt); 00501 dmu_buf_will_dirty(dl->dl_dbuf, tx); 00502 dsl_deadlist_load_tree(dl); 00503 00504 dle_tofind.dle_mintxg = mintxg; 00505 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 00506 if (dle == NULL) 00507 dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); 00508 while (dle) { 00509 uint64_t used, comp, uncomp; 00510 dsl_deadlist_entry_t *dle_next; 00511 00512 bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); 00513 00514 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 00515 &used, &comp, &uncomp)); 00516 mutex_enter(&dl->dl_lock); 00517 ASSERT3U(dl->dl_phys->dl_used, >=, used); 00518 ASSERT3U(dl->dl_phys->dl_comp, >=, comp); 00519 ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); 00520 dl->dl_phys->dl_used -= used; 00521 dl->dl_phys->dl_comp -= comp; 00522 dl->dl_phys->dl_uncomp -= uncomp; 00523 mutex_exit(&dl->dl_lock); 00524 00525 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, 00526 dle->dle_mintxg, tx)); 00527 00528 dle_next = AVL_NEXT(&dl->dl_tree, dle); 00529 avl_remove(&dl->dl_tree, dle); 00530 bpobj_close(&dle->dle_bpobj); 00531 kmem_free(dle, sizeof (*dle)); 00532 dle = dle_next; 00533 } 00534 }