FreeBSD ZFS
The Zettabyte File System
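Source listing of the in-core space map implementation (the AVL tree of free/allocated segments used by the ZFS metaslab allocator), including the FreeBSD-specific vfs.zfs.space_map_last_hope recovery tunable.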
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zio.h>
#include <sys/space_map.h>

SYSCTL_DECL(_vfs_zfs);
static int space_map_last_hope;
TUNABLE_INT("vfs.zfs.space_map_last_hope", &space_map_last_hope);
SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_last_hope, CTLFLAG_RDTUN,
    &space_map_last_hope, 0,
    "If kernel panic in space_map code on pool import, import the pool in "
    "readonly mode and backup all your data before trying this option.");

/*
 * In-core space map routines.  The caller is responsible for locking:
 * every routine that mutates or reads a map asserts that sm_lock is held.
 *
 * Segments in the AVL tree are compared such that overlapping segments
 * compare equal, so avl_find() locates any colliding segment.
 */
static int
space_map_seg_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;

	if (s1->ss_start < s2->ss_start) {
		if (s1->ss_end > s2->ss_start)
			return (0);
		return (-1);
	}
	if (s1->ss_start > s2->ss_start) {
		if (s1->ss_start < s2->ss_end)
			return (0);
		return (1);
	}
	return (0);
}

void
space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
    kmutex_t *lp)
{
	bzero(sm, sizeof (*sm));

	cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);

	avl_create(&sm->sm_root, space_map_seg_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_node));

	sm->sm_start = start;
	sm->sm_size = size;
	sm->sm_shift = shift;
	sm->sm_lock = lp;
}

void
space_map_destroy(space_map_t *sm)
{
	ASSERT(!sm->sm_loaded && !sm->sm_loading);
	VERIFY0(sm->sm_space);
	avl_destroy(&sm->sm_root);
	cv_destroy(&sm->sm_load_cv);
}

void
space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
{
	avl_index_t where;
	space_seg_t ssearch, *ss_before, *ss_after, *ss;
	uint64_t end = start + size;
	int merge_before, merge_after;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	VERIFY(size != 0);
	VERIFY3U(start, >=, sm->sm_start);
	VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
	VERIFY(sm->sm_space + size <= sm->sm_size);
	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
again:
	ssearch.ss_start = start;
	ssearch.ss_end = end;
	ss = avl_find(&sm->sm_root, &ssearch, &where);

	if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
		zfs_panic_recover("zfs: allocating allocated segment "
		    "(offset=%llu size=%llu)\n",
		    (longlong_t)start, (longlong_t)size);
		return;
	}
	if (ss != NULL && space_map_last_hope) {
		uint64_t sstart, ssize;

		if (ss->ss_start > start)
			sstart = ss->ss_start;
		else
			sstart = start;
		if (ss->ss_end > end)
			ssize = end - sstart;
		else
			ssize = ss->ss_end - sstart;
		ZFS_LOG(0,
		    "Removing colliding space_map range "
		    "(start=%ju end=%ju). Good luck!",
		    (uintmax_t)sstart, (uintmax_t)(sstart + ssize));
		space_map_remove(sm, sstart, ssize);
		goto again;
	}

	/* Make sure we don't overlap with either of our neighbors */
	VERIFY(ss == NULL);

	ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
	ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);

	merge_before = (ss_before != NULL && ss_before->ss_end == start);
	merge_after = (ss_after != NULL && ss_after->ss_start == end);

	if (merge_before && merge_after) {
		avl_remove(&sm->sm_root, ss_before);
		if (sm->sm_pp_root) {
			avl_remove(sm->sm_pp_root, ss_before);
			avl_remove(sm->sm_pp_root, ss_after);
		}
		ss_after->ss_start = ss_before->ss_start;
		kmem_free(ss_before, sizeof (*ss_before));
		ss = ss_after;
	} else if (merge_before) {
		ss_before->ss_end = end;
		if (sm->sm_pp_root)
			avl_remove(sm->sm_pp_root, ss_before);
		ss = ss_before;
	} else if (merge_after) {
		ss_after->ss_start = start;
		if (sm->sm_pp_root)
			avl_remove(sm->sm_pp_root, ss_after);
		ss = ss_after;
	} else {
		ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
		ss->ss_start = start;
		ss->ss_end = end;
		avl_insert(&sm->sm_root, ss, where);
	}

	if (sm->sm_pp_root)
		avl_add(sm->sm_pp_root, ss);

	sm->sm_space += size;
}

void
space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
{
	space_seg_t ssearch, *ss, *newseg;
	uint64_t end = start + size;
	int left_over, right_over;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	VERIFY(size != 0);
	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);

	ssearch.ss_start = start;
	ssearch.ss_end = end;
	ss = avl_find(&sm->sm_root, &ssearch, NULL);

	/* Make sure we completely overlap with someone */
	if (ss == NULL) {
		zfs_panic_recover("zfs: freeing free segment "
		    "(offset=%llu size=%llu)",
		    (longlong_t)start, (longlong_t)size);
		return;
	}
	VERIFY3U(ss->ss_start, <=, start);
	VERIFY3U(ss->ss_end, >=, end);
	VERIFY(sm->sm_space - size < sm->sm_size);

	left_over = (ss->ss_start != start);
	right_over = (ss->ss_end != end);

	if (sm->sm_pp_root)
		avl_remove(sm->sm_pp_root, ss);

	if (left_over && right_over) {
		newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
		newseg->ss_start = end;
		newseg->ss_end = ss->ss_end;
		ss->ss_end = start;
		avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
		if (sm->sm_pp_root)
			avl_add(sm->sm_pp_root, newseg);
	} else if (left_over) {
		ss->ss_end = start;
	} else if (right_over) {
		ss->ss_start = end;
	} else {
		avl_remove(&sm->sm_root, ss);
		kmem_free(ss, sizeof (*ss));
		ss = NULL;
	}

	if (sm->sm_pp_root && ss != NULL)
		avl_add(sm->sm_pp_root, ss);

	sm->sm_space -= size;
}

boolean_t
space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
{
	avl_index_t where;
	space_seg_t ssearch, *ss;
	uint64_t end = start + size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	VERIFY(size != 0);
	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);

	ssearch.ss_start = start;
	ssearch.ss_end = end;
	ss = avl_find(&sm->sm_root, &ssearch, &where);

	return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
}

void
space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
{
	space_seg_t *ss;
	void *cookie = NULL;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
		if (func != NULL)
			func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
		kmem_free(ss, sizeof (*ss));
	}
	sm->sm_space = 0;
}

void
space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
{
	space_seg_t *ss;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
}

/*
 * Wait for any in-progress space_map_load() to complete.
 */
void
space_map_load_wait(space_map_t *sm)
{
	ASSERT(MUTEX_HELD(sm->sm_lock));

	while (sm->sm_loading) {
		ASSERT(!sm->sm_loaded);
		cv_wait(&sm->sm_load_cv, sm->sm_lock);
	}
}

/*
 * Note: space_map_load() drops and reacquires sm_lock around each
 * dmu_read() call; the caller must be prepared for that.
 */
int
space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
    space_map_obj_t *smo, objset_t *os)
{
	uint64_t *entry, *entry_map, *entry_map_end;
	uint64_t bufsize, size, offset, end, space;
	uint64_t mapstart = sm->sm_start;
	int error = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT(!sm->sm_loaded);
	ASSERT(!sm->sm_loading);

	sm->sm_loading = B_TRUE;
	end = smo->smo_objsize;
	space = smo->smo_alloc;

	ASSERT(sm->sm_ops == NULL);
	VERIFY0(sm->sm_space);

	if (maptype == SM_FREE) {
		space_map_add(sm, sm->sm_start, sm->sm_size);
		space = sm->sm_size - space;
	}

	bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
	entry_map = zio_buf_alloc(bufsize);

	mutex_exit(sm->sm_lock);
	if (end > bufsize)
		dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
	mutex_enter(sm->sm_lock);

	for (offset = 0; offset < end; offset += bufsize) {
		size = MIN(end - offset, bufsize);
		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
		VERIFY(size != 0);

		dprintf("object=%llu offset=%llx size=%llx\n",
		    smo->smo_object, offset, size);

		mutex_exit(sm->sm_lock);
		error = dmu_read(os, smo->smo_object, offset, size, entry_map,
		    DMU_READ_PREFETCH);
		mutex_enter(sm->sm_lock);
		if (error != 0)
			break;

		entry_map_end = entry_map + (size / sizeof (uint64_t));
		for (entry = entry_map; entry < entry_map_end; entry++) {
			uint64_t e = *entry;

			if (SM_DEBUG_DECODE(e))	/* Skip debug entries */
				continue;

			(SM_TYPE_DECODE(e) == maptype ?
			    space_map_add : space_map_remove)(sm,
			    (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
			    SM_RUN_DECODE(e) << sm->sm_shift);
		}
	}

	if (error == 0) {
		VERIFY3U(sm->sm_space, ==, space);

		sm->sm_loaded = B_TRUE;
		sm->sm_ops = ops;
		if (ops != NULL)
			ops->smop_load(sm);
	} else {
		space_map_vacate(sm, NULL, NULL);
	}

	zio_buf_free(entry_map, bufsize);

	sm->sm_loading = B_FALSE;

	cv_broadcast(&sm->sm_load_cv);

	return (error);
}

void
space_map_unload(space_map_t *sm)
{
	ASSERT(MUTEX_HELD(sm->sm_lock));

	if (sm->sm_loaded && sm->sm_ops != NULL)
		sm->sm_ops->smop_unload(sm);

	sm->sm_loaded = B_FALSE;
	sm->sm_ops = NULL;

	space_map_vacate(sm, NULL, NULL);
}

uint64_t
space_map_maxsize(space_map_t *sm)
{
	ASSERT(sm->sm_ops != NULL);
	return (sm->sm_ops->smop_max(sm));
}

uint64_t
space_map_alloc(space_map_t *sm, uint64_t size)
{
	uint64_t start;

	start = sm->sm_ops->smop_alloc(sm, size);
	if (start != -1ULL)
		space_map_remove(sm, start, size);
	return (start);
}

void
space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	sm->sm_ops->smop_claim(sm, start, size);
	space_map_remove(sm, start, size);
}

void
space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	space_map_add(sm, start, size);
	sm->sm_ops->smop_free(sm, start, size);
}

/*
 * Note: space_map_sync() drops and reacquires sm_lock around each
 * dmu_write() call.
 */
void
space_map_sync(space_map_t *sm, uint8_t maptype,
    space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	void *cookie = NULL;
	space_seg_t *ss;
	uint64_t bufsize, start, size, run_len;
	uint64_t *entry, *entry_map, *entry_map_end;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	if (sm->sm_space == 0)
		return;

	dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
	    smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
	    maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
	    sm->sm_space);

	if (maptype == SM_ALLOC)
		smo->smo_alloc += sm->sm_space;
	else
		smo->smo_alloc -= sm->sm_space;

	bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
	bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
	entry_map = zio_buf_alloc(bufsize);
	entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
	entry = entry_map;

	*entry++ = SM_DEBUG_ENCODE(1) |
	    SM_DEBUG_ACTION_ENCODE(maptype) |
	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));

	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
		size = ss->ss_end - ss->ss_start;
		start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;

		sm->sm_space -= size;
		size >>= sm->sm_shift;

		while (size) {
			run_len = MIN(size, SM_RUN_MAX);

			if (entry == entry_map_end) {
				mutex_exit(sm->sm_lock);
				dmu_write(os, smo->smo_object, smo->smo_objsize,
				    bufsize, entry_map, tx);
				mutex_enter(sm->sm_lock);
				smo->smo_objsize += bufsize;
				entry = entry_map;
			}

			*entry++ = SM_OFFSET_ENCODE(start) |
			    SM_TYPE_ENCODE(maptype) |
			    SM_RUN_ENCODE(run_len);

			start += run_len;
			size -= run_len;
		}
		kmem_free(ss, sizeof (*ss));
	}

	if (entry != entry_map) {
		size = (entry - entry_map) * sizeof (uint64_t);
		mutex_exit(sm->sm_lock);
		dmu_write(os, smo->smo_object, smo->smo_objsize,
		    size, entry_map, tx);
		mutex_enter(sm->sm_lock);
		smo->smo_objsize += size;
	}

	zio_buf_free(entry_map, bufsize);

	VERIFY0(sm->sm_space);
}

void
space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
{
	VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);

	smo->smo_objsize = 0;
	smo->smo_alloc = 0;
}

/*
 * Space map reference trees.
 *
 * A reference tree is an AVL tree of (offset, refcount-delta) nodes that
 * generalizes a space map: walking the tree in offset order and summing
 * the deltas yields how many maps cover each offset, which makes unions
 * (refcnt >= 1) and intersections (refcnt == N) straightforward.  Ties on
 * sr_offset are broken by node address so that duplicate offsets from
 * different segments can coexist in the tree.
 */
static int
space_map_ref_compare(const void *x1, const void *x2)
{
	const space_ref_t *sr1 = x1;
	const space_ref_t *sr2 = x2;

	if (sr1->sr_offset < sr2->sr_offset)
		return (-1);
	if (sr1->sr_offset > sr2->sr_offset)
		return (1);

	if (sr1 < sr2)
		return (-1);
	if (sr1 > sr2)
		return (1);

	return (0);
}

void
space_map_ref_create(avl_tree_t *t)
{
	avl_create(t, space_map_ref_compare,
	    sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
}

void
space_map_ref_destroy(avl_tree_t *t)
{
	space_ref_t *sr;
	void *cookie = NULL;

	while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(sr, sizeof (*sr));

	avl_destroy(t);
}

static void
space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
{
	space_ref_t *sr;

	sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
	sr->sr_offset = offset;
	sr->sr_refcnt = refcnt;

	avl_add(t, sr);
}

void
space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
    int64_t refcnt)
{
	space_map_ref_add_node(t, start, refcnt);
	space_map_ref_add_node(t, end, -refcnt);
}

/*
 * Add an entire space map into a reference tree.
 */
void
space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
{
	space_seg_t *ss;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt);
}

/*
 * Convert a reference tree into a space map.  The resulting map contains
 * every range of the reference tree with refcnt >= minref.
 */
void
space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
{
	uint64_t start = -1ULL;
	int64_t refcnt = 0;
	space_ref_t *sr;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	space_map_vacate(sm, NULL, NULL);

	for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
		refcnt += sr->sr_refcnt;
		if (refcnt >= minref) {
			if (start == -1ULL) {
				start = sr->sr_offset;
			}
		} else {
			if (start != -1ULL) {
				uint64_t end = sr->sr_offset;
				ASSERT(start <= end);
				if (end > start)
					space_map_add(sm, start, end - start);
				start = -1ULL;
			}
		}
	}
	ASSERT(refcnt == 0);
	ASSERT(start == -1ULL);
}
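A minimal caller sketch, not part of the source file above: it shows how a consumer such as the metaslab code might drive the in-core space map API. The function name example_space_map_usage, the region size, and the shift value are hypothetical; the API calls are the ones listed above, and all mutations happen with sm_lock held, as the ASSERT(MUTEX_HELD()) checks in space_map_add() and friends require.

static void
example_space_map_usage(kmutex_t *lock)
{
	space_map_t sm;
	uint64_t start = 0, size = 1ULL << 30;	/* hypothetical 1 GB region */
	uint64_t block = 1ULL << 9;		/* one unit at sm_shift = 9 */

	/* Track free space in [start, start + size) at 512-byte granularity. */
	space_map_create(&sm, start, size, 9, lock);

	mutex_enter(lock);
	space_map_add(&sm, start, size);		/* everything starts free */

	if (space_map_contains(&sm, start, block))
		space_map_remove(&sm, start, block);	/* "allocate" one block */

	space_map_add(&sm, start, block);		/* "free" it again */

	space_map_vacate(&sm, NULL, NULL);		/* drop all segments */
	mutex_exit(lock);

	space_map_destroy(&sm);			/* requires sm_space == 0 */
}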