FreeBSD ZFS
The Zettabyte File System
00001 /* 00002 * CDDL HEADER START 00003 * 00004 * The contents of this file are subject to the terms of the 00005 * Common Development and Distribution License (the "License"). 00006 * You may not use this file except in compliance with the License. 00007 * 00008 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 00009 * or http://www.opensolaris.org/os/licensing. 00010 * See the License for the specific language governing permissions 00011 * and limitations under the License. 00012 * 00013 * When distributing Covered Code, include this CDDL HEADER in each 00014 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 00015 * If applicable, add the following below this CDDL HEADER, with the 00016 * fields enclosed by brackets "[]" replaced with your own identifying 00017 * information: Portions Copyright [yyyy] [name of copyright owner] 00018 * 00019 * CDDL HEADER END 00020 */ 00021 /* 00022 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 00023 * Copyright (c) 2012 by Delphix. All rights reserved. 00024 */ 00025 00026 #include <sys/zfs_context.h> 00027 #include <sys/dmu.h> 00028 #include <sys/dmu_tx.h> 00029 #include <sys/space_map.h> 00030 #include <sys/metaslab_impl.h> 00031 #include <sys/vdev_impl.h> 00032 #include <sys/zio.h> 00033 00042 #define CAN_FASTGANG(flags) \ 00043 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 00044 METASLAB_GANG_AVOID))) 00045 00046 uint64_t metaslab_aliquot = 512ULL << 10; 00047 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 00048 00056 int zfs_mg_alloc_failures = 0; 00057 00058 SYSCTL_DECL(_vfs_zfs); 00059 SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RDTUN, 00060 &zfs_mg_alloc_failures, 0, 00061 "Number of allowed allocation failures per vdev"); 00062 TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures); 00063 00067 static int metaslab_debug = 0; 00068 00075 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; 00076 00083 int metaslab_df_free_pct = 4; 00084 00089 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 00090 00094 int metaslab_prefetch_limit = SPA_DVAS_PER_BP; 00095 00099 int metaslab_smo_bonus_pct = 150; 00100 00101 /* 00102 * ========================================================================== 00103 * Metaslab classes 00104 * ========================================================================== 00105 */ 00106 metaslab_class_t * 00107 metaslab_class_create(spa_t *spa, space_map_ops_t *ops) 00108 { 00109 metaslab_class_t *mc; 00110 00111 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 00112 00113 mc->mc_spa = spa; 00114 mc->mc_rotor = NULL; 00115 mc->mc_ops = ops; 00116 00117 return (mc); 00118 } 00119 00120 void 00121 metaslab_class_destroy(metaslab_class_t *mc) 00122 { 00123 ASSERT(mc->mc_rotor == NULL); 00124 ASSERT(mc->mc_alloc == 0); 00125 ASSERT(mc->mc_deferred == 0); 00126 ASSERT(mc->mc_space == 0); 00127 ASSERT(mc->mc_dspace == 0); 00128 00129 kmem_free(mc, sizeof (metaslab_class_t)); 00130 } 00131 00132 int 00133 metaslab_class_validate(metaslab_class_t *mc) 00134 { 00135 metaslab_group_t *mg; 00136 vdev_t *vd; 00137 00138 /* 00139 * Must hold one of the spa_config locks. 
00140 */ 00141 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 00142 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 00143 00144 if ((mg = mc->mc_rotor) == NULL) 00145 return (0); 00146 00147 do { 00148 vd = mg->mg_vd; 00149 ASSERT(vd->vdev_mg != NULL); 00150 ASSERT3P(vd->vdev_top, ==, vd); 00151 ASSERT3P(mg->mg_class, ==, mc); 00152 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 00153 } while ((mg = mg->mg_next) != mc->mc_rotor); 00154 00155 return (0); 00156 } 00157 00158 void 00159 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 00160 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 00161 { 00162 atomic_add_64(&mc->mc_alloc, alloc_delta); 00163 atomic_add_64(&mc->mc_deferred, defer_delta); 00164 atomic_add_64(&mc->mc_space, space_delta); 00165 atomic_add_64(&mc->mc_dspace, dspace_delta); 00166 } 00167 00168 uint64_t 00169 metaslab_class_get_alloc(metaslab_class_t *mc) 00170 { 00171 return (mc->mc_alloc); 00172 } 00173 00174 uint64_t 00175 metaslab_class_get_deferred(metaslab_class_t *mc) 00176 { 00177 return (mc->mc_deferred); 00178 } 00179 00180 uint64_t 00181 metaslab_class_get_space(metaslab_class_t *mc) 00182 { 00183 return (mc->mc_space); 00184 } 00185 00186 uint64_t 00187 metaslab_class_get_dspace(metaslab_class_t *mc) 00188 { 00189 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 00190 } 00191 00192 /* 00193 * ========================================================================== 00194 * Metaslab groups 00195 * ========================================================================== 00196 */ 00197 static int 00198 metaslab_compare(const void *x1, const void *x2) 00199 { 00200 const metaslab_t *m1 = x1; 00201 const metaslab_t *m2 = x2; 00202 00203 if (m1->ms_weight < m2->ms_weight) 00204 return (1); 00205 if (m1->ms_weight > m2->ms_weight) 00206 return (-1); 00207 00208 /* 00209 * If the weights are identical, use the offset to force uniqueness. 00210 */ 00211 if (m1->ms_map.sm_start < m2->ms_map.sm_start) 00212 return (-1); 00213 if (m1->ms_map.sm_start > m2->ms_map.sm_start) 00214 return (1); 00215 00216 ASSERT3P(m1, ==, m2); 00217 00218 return (0); 00219 } 00220 00221 metaslab_group_t * 00222 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 00223 { 00224 metaslab_group_t *mg; 00225 00226 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 00227 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 00228 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 00229 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 00230 mg->mg_vd = vd; 00231 mg->mg_class = mc; 00232 mg->mg_activation_count = 0; 00233 00234 return (mg); 00235 } 00236 00237 void 00238 metaslab_group_destroy(metaslab_group_t *mg) 00239 { 00240 ASSERT(mg->mg_prev == NULL); 00241 ASSERT(mg->mg_next == NULL); 00242 /* 00243 * We may have gone below zero with the activation count 00244 * either because we never activated in the first place or 00245 * because we're done, and possibly removing the vdev. 
00246 */ 00247 ASSERT(mg->mg_activation_count <= 0); 00248 00249 avl_destroy(&mg->mg_metaslab_tree); 00250 mutex_destroy(&mg->mg_lock); 00251 kmem_free(mg, sizeof (metaslab_group_t)); 00252 } 00253 00254 void 00255 metaslab_group_activate(metaslab_group_t *mg) 00256 { 00257 metaslab_class_t *mc = mg->mg_class; 00258 metaslab_group_t *mgprev, *mgnext; 00259 00260 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 00261 00262 ASSERT(mc->mc_rotor != mg); 00263 ASSERT(mg->mg_prev == NULL); 00264 ASSERT(mg->mg_next == NULL); 00265 ASSERT(mg->mg_activation_count <= 0); 00266 00267 if (++mg->mg_activation_count <= 0) 00268 return; 00269 00270 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 00271 00272 if ((mgprev = mc->mc_rotor) == NULL) { 00273 mg->mg_prev = mg; 00274 mg->mg_next = mg; 00275 } else { 00276 mgnext = mgprev->mg_next; 00277 mg->mg_prev = mgprev; 00278 mg->mg_next = mgnext; 00279 mgprev->mg_next = mg; 00280 mgnext->mg_prev = mg; 00281 } 00282 mc->mc_rotor = mg; 00283 } 00284 00285 void 00286 metaslab_group_passivate(metaslab_group_t *mg) 00287 { 00288 metaslab_class_t *mc = mg->mg_class; 00289 metaslab_group_t *mgprev, *mgnext; 00290 00291 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 00292 00293 if (--mg->mg_activation_count != 0) { 00294 ASSERT(mc->mc_rotor != mg); 00295 ASSERT(mg->mg_prev == NULL); 00296 ASSERT(mg->mg_next == NULL); 00297 ASSERT(mg->mg_activation_count < 0); 00298 return; 00299 } 00300 00301 mgprev = mg->mg_prev; 00302 mgnext = mg->mg_next; 00303 00304 if (mg == mgnext) { 00305 mc->mc_rotor = NULL; 00306 } else { 00307 mc->mc_rotor = mgnext; 00308 mgprev->mg_next = mgnext; 00309 mgnext->mg_prev = mgprev; 00310 } 00311 00312 mg->mg_prev = NULL; 00313 mg->mg_next = NULL; 00314 } 00315 00316 static void 00317 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 00318 { 00319 mutex_enter(&mg->mg_lock); 00320 ASSERT(msp->ms_group == NULL); 00321 msp->ms_group = mg; 00322 msp->ms_weight = 0; 00323 avl_add(&mg->mg_metaslab_tree, msp); 00324 mutex_exit(&mg->mg_lock); 00325 } 00326 00327 static void 00328 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 00329 { 00330 mutex_enter(&mg->mg_lock); 00331 ASSERT(msp->ms_group == mg); 00332 avl_remove(&mg->mg_metaslab_tree, msp); 00333 msp->ms_group = NULL; 00334 mutex_exit(&mg->mg_lock); 00335 } 00336 00337 static void 00338 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 00339 { 00340 /* 00341 * Although in principle the weight can be any value, in 00342 * practice we do not use values in the range [1, 510]. 
00343 */ 00344 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); 00345 ASSERT(MUTEX_HELD(&msp->ms_lock)); 00346 00347 mutex_enter(&mg->mg_lock); 00348 ASSERT(msp->ms_group == mg); 00349 avl_remove(&mg->mg_metaslab_tree, msp); 00350 msp->ms_weight = weight; 00351 avl_add(&mg->mg_metaslab_tree, msp); 00352 mutex_exit(&mg->mg_lock); 00353 } 00354 00355 /* 00356 * ========================================================================== 00357 * Common allocator routines 00358 * ========================================================================== 00359 */ 00360 static int 00361 metaslab_segsize_compare(const void *x1, const void *x2) 00362 { 00363 const space_seg_t *s1 = x1; 00364 const space_seg_t *s2 = x2; 00365 uint64_t ss_size1 = s1->ss_end - s1->ss_start; 00366 uint64_t ss_size2 = s2->ss_end - s2->ss_start; 00367 00368 if (ss_size1 < ss_size2) 00369 return (-1); 00370 if (ss_size1 > ss_size2) 00371 return (1); 00372 00373 if (s1->ss_start < s2->ss_start) 00374 return (-1); 00375 if (s1->ss_start > s2->ss_start) 00376 return (1); 00377 00378 return (0); 00379 } 00380 00386 static uint64_t 00387 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 00388 uint64_t align) 00389 { 00390 space_seg_t *ss, ssearch; 00391 avl_index_t where; 00392 00393 ssearch.ss_start = *cursor; 00394 ssearch.ss_end = *cursor + size; 00395 00396 ss = avl_find(t, &ssearch, &where); 00397 if (ss == NULL) 00398 ss = avl_nearest(t, where, AVL_AFTER); 00399 00400 while (ss != NULL) { 00401 uint64_t offset = P2ROUNDUP(ss->ss_start, align); 00402 00403 if (offset + size <= ss->ss_end) { 00404 *cursor = offset + size; 00405 return (offset); 00406 } 00407 ss = AVL_NEXT(t, ss); 00408 } 00409 00410 /* 00411 * If we know we've searched the whole map (*cursor == 0), give up. 00412 * Otherwise, reset the cursor to the beginning and try again. 
00413 */ 00414 if (*cursor == 0) 00415 return (-1ULL); 00416 00417 *cursor = 0; 00418 return (metaslab_block_picker(t, cursor, size, align)); 00419 } 00420 00421 static void 00422 metaslab_pp_load(space_map_t *sm) 00423 { 00424 space_seg_t *ss; 00425 00426 ASSERT(sm->sm_ppd == NULL); 00427 sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); 00428 00429 sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 00430 avl_create(sm->sm_pp_root, metaslab_segsize_compare, 00431 sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); 00432 00433 for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) 00434 avl_add(sm->sm_pp_root, ss); 00435 } 00436 00437 static void 00438 metaslab_pp_unload(space_map_t *sm) 00439 { 00440 void *cookie = NULL; 00441 00442 kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); 00443 sm->sm_ppd = NULL; 00444 00445 while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { 00446 /* tear down the tree */ 00447 } 00448 00449 avl_destroy(sm->sm_pp_root); 00450 kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); 00451 sm->sm_pp_root = NULL; 00452 } 00453 00454 /* ARGSUSED */ 00455 static void 00456 metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) 00457 { 00458 /* No need to update cursor */ 00459 } 00460 00461 /* ARGSUSED */ 00462 static void 00463 metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) 00464 { 00465 /* No need to update cursor */ 00466 } 00467 00471 uint64_t 00472 metaslab_pp_maxsize(space_map_t *sm) 00473 { 00474 avl_tree_t *t = sm->sm_pp_root; 00475 space_seg_t *ss; 00476 00477 if (t == NULL || (ss = avl_last(t)) == NULL) 00478 return (0ULL); 00479 00480 return (ss->ss_end - ss->ss_start); 00481 } 00482 00483 /* 00484 * ========================================================================== 00485 * The first-fit block allocator 00486 * ========================================================================== 00487 */ 00488 static uint64_t 00489 metaslab_ff_alloc(space_map_t *sm, uint64_t size) 00490 { 00491 avl_tree_t *t = &sm->sm_root; 00492 uint64_t align = size & -size; 00493 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; 00494 00495 return (metaslab_block_picker(t, cursor, size, align)); 00496 } 00497 00498 /* ARGSUSED */ 00499 boolean_t 00500 metaslab_ff_fragmented(space_map_t *sm) 00501 { 00502 return (B_TRUE); 00503 } 00504 00505 static space_map_ops_t metaslab_ff_ops = { 00506 metaslab_pp_load, 00507 metaslab_pp_unload, 00508 metaslab_ff_alloc, 00509 metaslab_pp_claim, 00510 metaslab_pp_free, 00511 metaslab_pp_maxsize, 00512 metaslab_ff_fragmented 00513 }; 00514 00515 /* 00516 * ========================================================================== 00517 * Dynamic block allocator 00518 * 00519 * Uses the first fit allocation scheme until space get low and then 00520 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 00521 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 
00522 * ========================================================================== 00523 */ 00524 static uint64_t 00525 metaslab_df_alloc(space_map_t *sm, uint64_t size) 00526 { 00527 avl_tree_t *t = &sm->sm_root; 00528 uint64_t align = size & -size; 00529 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; 00530 uint64_t max_size = metaslab_pp_maxsize(sm); 00531 int free_pct = sm->sm_space * 100 / sm->sm_size; 00532 00533 ASSERT(MUTEX_HELD(sm->sm_lock)); 00534 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 00535 00536 if (max_size < size) 00537 return (-1ULL); 00538 00539 /* 00540 * If we're running low on space switch to using the size 00541 * sorted AVL tree (best-fit). 00542 */ 00543 if (max_size < metaslab_df_alloc_threshold || 00544 free_pct < metaslab_df_free_pct) { 00545 t = sm->sm_pp_root; 00546 *cursor = 0; 00547 } 00548 00549 return (metaslab_block_picker(t, cursor, size, 1ULL)); 00550 } 00551 00552 static boolean_t 00553 metaslab_df_fragmented(space_map_t *sm) 00554 { 00555 uint64_t max_size = metaslab_pp_maxsize(sm); 00556 int free_pct = sm->sm_space * 100 / sm->sm_size; 00557 00558 if (max_size >= metaslab_df_alloc_threshold && 00559 free_pct >= metaslab_df_free_pct) 00560 return (B_FALSE); 00561 00562 return (B_TRUE); 00563 } 00564 00565 static space_map_ops_t metaslab_df_ops = { 00566 metaslab_pp_load, 00567 metaslab_pp_unload, 00568 metaslab_df_alloc, 00569 metaslab_pp_claim, 00570 metaslab_pp_free, 00571 metaslab_pp_maxsize, 00572 metaslab_df_fragmented 00573 }; 00574 00575 /* 00576 * ========================================================================== 00577 * Other experimental allocators 00578 * ========================================================================== 00579 */ 00580 static uint64_t 00581 metaslab_cdf_alloc(space_map_t *sm, uint64_t size) 00582 { 00583 avl_tree_t *t = &sm->sm_root; 00584 uint64_t *cursor = (uint64_t *)sm->sm_ppd; 00585 uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; 00586 uint64_t max_size = metaslab_pp_maxsize(sm); 00587 uint64_t rsize = size; 00588 uint64_t offset = 0; 00589 00590 ASSERT(MUTEX_HELD(sm->sm_lock)); 00591 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 00592 00593 if (max_size < size) 00594 return (-1ULL); 00595 00596 ASSERT3U(*extent_end, >=, *cursor); 00597 00598 /* 00599 * If we're running low on space switch to using the size 00600 * sorted AVL tree (best-fit). 
00601 */ 00602 if ((*cursor + size) > *extent_end) { 00603 00604 t = sm->sm_pp_root; 00605 *cursor = *extent_end = 0; 00606 00607 if (max_size > 2 * SPA_MAXBLOCKSIZE) 00608 rsize = MIN(metaslab_min_alloc_size, max_size); 00609 offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); 00610 if (offset != -1) 00611 *cursor = offset + size; 00612 } else { 00613 offset = metaslab_block_picker(t, cursor, rsize, 1ULL); 00614 } 00615 ASSERT3U(*cursor, <=, *extent_end); 00616 return (offset); 00617 } 00618 00619 static boolean_t 00620 metaslab_cdf_fragmented(space_map_t *sm) 00621 { 00622 uint64_t max_size = metaslab_pp_maxsize(sm); 00623 00624 if (max_size > (metaslab_min_alloc_size * 10)) 00625 return (B_FALSE); 00626 return (B_TRUE); 00627 } 00628 00629 static space_map_ops_t metaslab_cdf_ops = { 00630 metaslab_pp_load, 00631 metaslab_pp_unload, 00632 metaslab_cdf_alloc, 00633 metaslab_pp_claim, 00634 metaslab_pp_free, 00635 metaslab_pp_maxsize, 00636 metaslab_cdf_fragmented 00637 }; 00638 00639 uint64_t metaslab_ndf_clump_shift = 4; 00640 00641 static uint64_t 00642 metaslab_ndf_alloc(space_map_t *sm, uint64_t size) 00643 { 00644 avl_tree_t *t = &sm->sm_root; 00645 avl_index_t where; 00646 space_seg_t *ss, ssearch; 00647 uint64_t hbit = highbit(size); 00648 uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; 00649 uint64_t max_size = metaslab_pp_maxsize(sm); 00650 00651 ASSERT(MUTEX_HELD(sm->sm_lock)); 00652 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 00653 00654 if (max_size < size) 00655 return (-1ULL); 00656 00657 ssearch.ss_start = *cursor; 00658 ssearch.ss_end = *cursor + size; 00659 00660 ss = avl_find(t, &ssearch, &where); 00661 if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { 00662 t = sm->sm_pp_root; 00663 00664 ssearch.ss_start = 0; 00665 ssearch.ss_end = MIN(max_size, 00666 1ULL << (hbit + metaslab_ndf_clump_shift)); 00667 ss = avl_find(t, &ssearch, &where); 00668 if (ss == NULL) 00669 ss = avl_nearest(t, where, AVL_AFTER); 00670 ASSERT(ss != NULL); 00671 } 00672 00673 if (ss != NULL) { 00674 if (ss->ss_start + size <= ss->ss_end) { 00675 *cursor = ss->ss_start + size; 00676 return (ss->ss_start); 00677 } 00678 } 00679 return (-1ULL); 00680 } 00681 00682 static boolean_t 00683 metaslab_ndf_fragmented(space_map_t *sm) 00684 { 00685 uint64_t max_size = metaslab_pp_maxsize(sm); 00686 00687 if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) 00688 return (B_FALSE); 00689 return (B_TRUE); 00690 } 00691 00692 00693 static space_map_ops_t metaslab_ndf_ops = { 00694 metaslab_pp_load, 00695 metaslab_pp_unload, 00696 metaslab_ndf_alloc, 00697 metaslab_pp_claim, 00698 metaslab_pp_free, 00699 metaslab_pp_maxsize, 00700 metaslab_ndf_fragmented 00701 }; 00702 00703 space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 00704 00705 /* 00706 * ========================================================================== 00707 * Metaslabs 00708 * ========================================================================== 00709 */ 00710 metaslab_t * 00711 metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, 00712 uint64_t start, uint64_t size, uint64_t txg) 00713 { 00714 vdev_t *vd = mg->mg_vd; 00715 metaslab_t *msp; 00716 00717 msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 00718 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); 00719 00720 msp->ms_smo_syncing = *smo; 00721 00722 /* 00723 * We create the main space map here, but we don't create the 00724 * allocmaps and freemaps until metaslab_sync_done(). 
This serves 00725 * two purposes: it allows metaslab_sync_done() to detect the 00726 * addition of new space; and for debugging, it ensures that we'd 00727 * data fault on any attempt to use this metaslab before it's ready. 00728 */ 00729 space_map_create(&msp->ms_map, start, size, 00730 vd->vdev_ashift, &msp->ms_lock); 00731 00732 metaslab_group_add(mg, msp); 00733 00734 if (metaslab_debug && smo->smo_object != 0) { 00735 mutex_enter(&msp->ms_lock); 00736 VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, 00737 SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); 00738 mutex_exit(&msp->ms_lock); 00739 } 00740 00741 /* 00742 * If we're opening an existing pool (txg == 0) or creating 00743 * a new one (txg == TXG_INITIAL), all space is available now. 00744 * If we're adding space to an existing pool, the new space 00745 * does not become available until after this txg has synced. 00746 */ 00747 if (txg <= TXG_INITIAL) 00748 metaslab_sync_done(msp, 0); 00749 00750 if (txg != 0) { 00751 vdev_dirty(vd, 0, NULL, txg); 00752 vdev_dirty(vd, VDD_METASLAB, msp, txg); 00753 } 00754 00755 return (msp); 00756 } 00757 00758 void 00759 metaslab_fini(metaslab_t *msp) 00760 { 00761 metaslab_group_t *mg = msp->ms_group; 00762 00763 vdev_space_update(mg->mg_vd, 00764 -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); 00765 00766 metaslab_group_remove(mg, msp); 00767 00768 mutex_enter(&msp->ms_lock); 00769 00770 space_map_unload(&msp->ms_map); 00771 space_map_destroy(&msp->ms_map); 00772 00773 for (int t = 0; t < TXG_SIZE; t++) { 00774 space_map_destroy(&msp->ms_allocmap[t]); 00775 space_map_destroy(&msp->ms_freemap[t]); 00776 } 00777 00778 for (int t = 0; t < TXG_DEFER_SIZE; t++) 00779 space_map_destroy(&msp->ms_defermap[t]); 00780 00781 ASSERT0(msp->ms_deferspace); 00782 00783 mutex_exit(&msp->ms_lock); 00784 mutex_destroy(&msp->ms_lock); 00785 00786 kmem_free(msp, sizeof (metaslab_t)); 00787 } 00788 00789 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 00790 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 00791 #define METASLAB_ACTIVE_MASK \ 00792 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 00793 00794 static uint64_t 00795 metaslab_weight(metaslab_t *msp) 00796 { 00797 metaslab_group_t *mg = msp->ms_group; 00798 space_map_t *sm = &msp->ms_map; 00799 space_map_obj_t *smo = &msp->ms_smo; 00800 vdev_t *vd = mg->mg_vd; 00801 uint64_t weight, space; 00802 00803 ASSERT(MUTEX_HELD(&msp->ms_lock)); 00804 00805 /* 00806 * The baseline weight is the metaslab's free space. 00807 */ 00808 space = sm->sm_size - smo->smo_alloc; 00809 weight = space; 00810 00811 /* 00812 * Modern disks have uniform bit density and constant angular velocity. 00813 * Therefore, the outer recording zones are faster (higher bandwidth) 00814 * than the inner zones by the ratio of outer to inner track diameter, 00815 * which is typically around 2:1. We account for this by assigning 00816 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 00817 * In effect, this means that we'll select the metaslab with the most 00818 * free bandwidth rather than simply the one with the most free space. 00819 */ 00820 weight = 2 * weight - 00821 ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; 00822 ASSERT(weight >= space && weight <= 2 * space); 00823 00824 /* 00825 * For locality, assign higher weight to metaslabs which have 00826 * a lower offset than what we've already activated. 
00827 */ 00828 if (sm->sm_start <= mg->mg_bonus_area) 00829 weight *= (metaslab_smo_bonus_pct / 100); 00830 ASSERT(weight >= space && 00831 weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); 00832 00833 if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { 00834 /* 00835 * If this metaslab is one we're actively using, adjust its 00836 * weight to make it preferable to any inactive metaslab so 00837 * we'll polish it off. 00838 */ 00839 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 00840 } 00841 return (weight); 00842 } 00843 00844 static void 00845 metaslab_prefetch(metaslab_group_t *mg) 00846 { 00847 spa_t *spa = mg->mg_vd->vdev_spa; 00848 metaslab_t *msp; 00849 avl_tree_t *t = &mg->mg_metaslab_tree; 00850 int m; 00851 00852 mutex_enter(&mg->mg_lock); 00853 00854 /* 00855 * Prefetch the next potential metaslabs 00856 */ 00857 for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { 00858 space_map_t *sm = &msp->ms_map; 00859 space_map_obj_t *smo = &msp->ms_smo; 00860 00861 /* If we have reached our prefetch limit then we're done */ 00862 if (m >= metaslab_prefetch_limit) 00863 break; 00864 00865 if (!sm->sm_loaded && smo->smo_object != 0) { 00866 mutex_exit(&mg->mg_lock); 00867 dmu_prefetch(spa_meta_objset(spa), smo->smo_object, 00868 0ULL, smo->smo_objsize); 00869 mutex_enter(&mg->mg_lock); 00870 } 00871 } 00872 mutex_exit(&mg->mg_lock); 00873 } 00874 00875 static int 00876 metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 00877 { 00878 metaslab_group_t *mg = msp->ms_group; 00879 space_map_t *sm = &msp->ms_map; 00880 space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; 00881 00882 ASSERT(MUTEX_HELD(&msp->ms_lock)); 00883 00884 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 00885 space_map_load_wait(sm); 00886 if (!sm->sm_loaded) { 00887 int error = space_map_load(sm, sm_ops, SM_FREE, 00888 &msp->ms_smo, 00889 spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); 00890 if (error) { 00891 metaslab_group_sort(msp->ms_group, msp, 0); 00892 return (error); 00893 } 00894 for (int t = 0; t < TXG_DEFER_SIZE; t++) 00895 space_map_walk(&msp->ms_defermap[t], 00896 space_map_claim, sm); 00897 00898 } 00899 00900 /* 00901 * Track the bonus area as we activate new metaslabs. 00902 */ 00903 if (sm->sm_start > mg->mg_bonus_area) { 00904 mutex_enter(&mg->mg_lock); 00905 mg->mg_bonus_area = sm->sm_start; 00906 mutex_exit(&mg->mg_lock); 00907 } 00908 00909 metaslab_group_sort(msp->ms_group, msp, 00910 msp->ms_weight | activation_weight); 00911 } 00912 ASSERT(sm->sm_loaded); 00913 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 00914 00915 return (0); 00916 } 00917 00918 static void 00919 metaslab_passivate(metaslab_t *msp, uint64_t size) 00920 { 00921 /* 00922 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 00923 * this metaslab again. In that case, it had better be empty, 00924 * or we would be leaving space on the table. 
00925 */ 00926 ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); 00927 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 00928 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 00929 } 00930 00934 void 00935 metaslab_sync(metaslab_t *msp, uint64_t txg) 00936 { 00937 vdev_t *vd = msp->ms_group->mg_vd; 00938 spa_t *spa = vd->vdev_spa; 00939 objset_t *mos = spa_meta_objset(spa); 00940 space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; 00941 space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; 00942 space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; 00943 space_map_t *sm = &msp->ms_map; 00944 space_map_obj_t *smo = &msp->ms_smo_syncing; 00945 dmu_buf_t *db; 00946 dmu_tx_t *tx; 00947 00948 ASSERT(!vd->vdev_ishole); 00949 00950 if (allocmap->sm_space == 0 && freemap->sm_space == 0) 00951 return; 00952 00953 /* 00954 * The only state that can actually be changing concurrently with 00955 * metaslab_sync() is the metaslab's ms_map. No other thread can 00956 * be modifying this txg's allocmap, freemap, freed_map, or smo. 00957 * Therefore, we only hold ms_lock to satify space_map ASSERTs. 00958 * We drop it whenever we call into the DMU, because the DMU 00959 * can call down to us (e.g. via zio_free()) at any time. 00960 */ 00961 00962 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 00963 00964 if (smo->smo_object == 0) { 00965 ASSERT(smo->smo_objsize == 0); 00966 ASSERT(smo->smo_alloc == 0); 00967 smo->smo_object = dmu_object_alloc(mos, 00968 DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 00969 DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 00970 ASSERT(smo->smo_object != 0); 00971 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 00972 (sm->sm_start >> vd->vdev_ms_shift), 00973 sizeof (uint64_t), &smo->smo_object, tx); 00974 } 00975 00976 mutex_enter(&msp->ms_lock); 00977 00978 space_map_walk(freemap, space_map_add, freed_map); 00979 00980 if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= 00981 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { 00982 /* 00983 * The in-core space map representation is twice as compact 00984 * as the on-disk one, so it's time to condense the latter 00985 * by generating a pure allocmap from first principles. 00986 * 00987 * This metaslab is 100% allocated, 00988 * minus the content of the in-core map (sm), 00989 * minus what's been freed this txg (freed_map), 00990 * minus deferred frees (ms_defermap[]), 00991 * minus allocations from txgs in the future 00992 * (because they haven't been committed yet). 
00993 */ 00994 space_map_vacate(allocmap, NULL, NULL); 00995 space_map_vacate(freemap, NULL, NULL); 00996 00997 space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); 00998 00999 space_map_walk(sm, space_map_remove, allocmap); 01000 space_map_walk(freed_map, space_map_remove, allocmap); 01001 01002 for (int t = 0; t < TXG_DEFER_SIZE; t++) 01003 space_map_walk(&msp->ms_defermap[t], 01004 space_map_remove, allocmap); 01005 01006 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) 01007 space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], 01008 space_map_remove, allocmap); 01009 01010 mutex_exit(&msp->ms_lock); 01011 space_map_truncate(smo, mos, tx); 01012 mutex_enter(&msp->ms_lock); 01013 } 01014 01015 space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); 01016 space_map_sync(freemap, SM_FREE, smo, mos, tx); 01017 01018 mutex_exit(&msp->ms_lock); 01019 01020 VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 01021 dmu_buf_will_dirty(db, tx); 01022 ASSERT3U(db->db_size, >=, sizeof (*smo)); 01023 bcopy(smo, db->db_data, sizeof (*smo)); 01024 dmu_buf_rele(db, FTAG); 01025 01026 dmu_tx_commit(tx); 01027 } 01028 01033 void 01034 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 01035 { 01036 space_map_obj_t *smo = &msp->ms_smo; 01037 space_map_obj_t *smosync = &msp->ms_smo_syncing; 01038 space_map_t *sm = &msp->ms_map; 01039 space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; 01040 space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; 01041 metaslab_group_t *mg = msp->ms_group; 01042 vdev_t *vd = mg->mg_vd; 01043 int64_t alloc_delta, defer_delta; 01044 01045 ASSERT(!vd->vdev_ishole); 01046 01047 mutex_enter(&msp->ms_lock); 01048 01049 /* 01050 * If this metaslab is just becoming available, initialize its 01051 * allocmaps and freemaps and add its capacity to the vdev. 01052 */ 01053 if (freed_map->sm_size == 0) { 01054 for (int t = 0; t < TXG_SIZE; t++) { 01055 space_map_create(&msp->ms_allocmap[t], sm->sm_start, 01056 sm->sm_size, sm->sm_shift, sm->sm_lock); 01057 space_map_create(&msp->ms_freemap[t], sm->sm_start, 01058 sm->sm_size, sm->sm_shift, sm->sm_lock); 01059 } 01060 01061 for (int t = 0; t < TXG_DEFER_SIZE; t++) 01062 space_map_create(&msp->ms_defermap[t], sm->sm_start, 01063 sm->sm_size, sm->sm_shift, sm->sm_lock); 01064 01065 vdev_space_update(vd, 0, 0, sm->sm_size); 01066 } 01067 01068 alloc_delta = smosync->smo_alloc - smo->smo_alloc; 01069 defer_delta = freed_map->sm_space - defer_map->sm_space; 01070 01071 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 01072 01073 ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); 01074 ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); 01075 01076 /* 01077 * If there's a space_map_load() in progress, wait for it to complete 01078 * so that we have a consistent view of the in-core space map. 01079 * Then, add defer_map (oldest deferred frees) to this map and 01080 * transfer freed_map (this txg's frees) to defer_map. 01081 */ 01082 space_map_load_wait(sm); 01083 space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); 01084 space_map_vacate(freed_map, space_map_add, defer_map); 01085 01086 *smo = *smosync; 01087 01088 msp->ms_deferspace += defer_delta; 01089 ASSERT3S(msp->ms_deferspace, >=, 0); 01090 ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); 01091 if (msp->ms_deferspace != 0) { 01092 /* 01093 * Keep syncing this metaslab until all deferred frees 01094 * are back in circulation. 
01095 */ 01096 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 01097 } 01098 01099 /* 01100 * If the map is loaded but no longer active, evict it as soon as all 01101 * future allocations have synced. (If we unloaded it now and then 01102 * loaded a moment later, the map wouldn't reflect those allocations.) 01103 */ 01104 if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 01105 int evictable = 1; 01106 01107 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) 01108 if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) 01109 evictable = 0; 01110 01111 if (evictable && !metaslab_debug) 01112 space_map_unload(sm); 01113 } 01114 01115 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 01116 01117 mutex_exit(&msp->ms_lock); 01118 } 01119 01120 void 01121 metaslab_sync_reassess(metaslab_group_t *mg) 01122 { 01123 vdev_t *vd = mg->mg_vd; 01124 int64_t failures = mg->mg_alloc_failures; 01125 01126 /* 01127 * Re-evaluate all metaslabs which have lower offsets than the 01128 * bonus area. 01129 */ 01130 for (int m = 0; m < vd->vdev_ms_count; m++) { 01131 metaslab_t *msp = vd->vdev_ms[m]; 01132 01133 if (msp->ms_map.sm_start > mg->mg_bonus_area) 01134 break; 01135 01136 mutex_enter(&msp->ms_lock); 01137 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 01138 mutex_exit(&msp->ms_lock); 01139 } 01140 01141 atomic_add_64(&mg->mg_alloc_failures, -failures); 01142 01143 /* 01144 * Prefetch the next potential metaslabs 01145 */ 01146 metaslab_prefetch(mg); 01147 } 01148 01149 static uint64_t 01150 metaslab_distance(metaslab_t *msp, dva_t *dva) 01151 { 01152 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 01153 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 01154 uint64_t start = msp->ms_map.sm_start >> ms_shift; 01155 01156 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 01157 return (1ULL << 63); 01158 01159 if (offset < start) 01160 return ((start - offset) << ms_shift); 01161 if (offset > start) 01162 return ((offset - start) << ms_shift); 01163 return (0); 01164 } 01165 01166 static uint64_t 01167 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 01168 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) 01169 { 01170 spa_t *spa = mg->mg_vd->vdev_spa; 01171 metaslab_t *msp = NULL; 01172 uint64_t offset = -1ULL; 01173 avl_tree_t *t = &mg->mg_metaslab_tree; 01174 uint64_t activation_weight; 01175 uint64_t target_distance; 01176 int i; 01177 01178 activation_weight = METASLAB_WEIGHT_PRIMARY; 01179 for (i = 0; i < d; i++) { 01180 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 01181 activation_weight = METASLAB_WEIGHT_SECONDARY; 01182 break; 01183 } 01184 } 01185 01186 for (;;) { 01187 boolean_t was_active; 01188 01189 mutex_enter(&mg->mg_lock); 01190 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 01191 if (msp->ms_weight < asize) { 01192 spa_dbgmsg(spa, "%s: failed to meet weight " 01193 "requirement: vdev %llu, txg %llu, mg %p, " 01194 "msp %p, psize %llu, asize %llu, " 01195 "failures %llu, weight %llu", 01196 spa_name(spa), mg->mg_vd->vdev_id, txg, 01197 mg, msp, psize, asize, 01198 mg->mg_alloc_failures, msp->ms_weight); 01199 mutex_exit(&mg->mg_lock); 01200 return (-1ULL); 01201 } 01202 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 01203 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 01204 break; 01205 01206 target_distance = min_distance + 01207 (msp->ms_smo.smo_alloc ? 
0 : min_distance >> 1); 01208 01209 for (i = 0; i < d; i++) 01210 if (metaslab_distance(msp, &dva[i]) < 01211 target_distance) 01212 break; 01213 if (i == d) 01214 break; 01215 } 01216 mutex_exit(&mg->mg_lock); 01217 if (msp == NULL) 01218 return (-1ULL); 01219 01220 /* 01221 * If we've already reached the allowable number of failed 01222 * allocation attempts on this metaslab group then we 01223 * consider skipping it. We skip it only if we're allowed 01224 * to "fast" gang, the physical size is larger than 01225 * a gang block, and we're attempting to allocate from 01226 * the primary metaslab. 01227 */ 01228 if (mg->mg_alloc_failures > zfs_mg_alloc_failures && 01229 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && 01230 activation_weight == METASLAB_WEIGHT_PRIMARY) { 01231 spa_dbgmsg(spa, "%s: skipping metaslab group: " 01232 "vdev %llu, txg %llu, mg %p, psize %llu, " 01233 "asize %llu, failures %llu", spa_name(spa), 01234 mg->mg_vd->vdev_id, txg, mg, psize, asize, 01235 mg->mg_alloc_failures); 01236 return (-1ULL); 01237 } 01238 01239 mutex_enter(&msp->ms_lock); 01240 01241 /* 01242 * Ensure that the metaslab we have selected is still 01243 * capable of handling our request. It's possible that 01244 * another thread may have changed the weight while we 01245 * were blocked on the metaslab lock. 01246 */ 01247 if (msp->ms_weight < asize || (was_active && 01248 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 01249 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 01250 mutex_exit(&msp->ms_lock); 01251 continue; 01252 } 01253 01254 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 01255 activation_weight == METASLAB_WEIGHT_PRIMARY) { 01256 metaslab_passivate(msp, 01257 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 01258 mutex_exit(&msp->ms_lock); 01259 continue; 01260 } 01261 01262 if (metaslab_activate(msp, activation_weight) != 0) { 01263 mutex_exit(&msp->ms_lock); 01264 continue; 01265 } 01266 01267 if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) 01268 break; 01269 01270 atomic_inc_64(&mg->mg_alloc_failures); 01271 01272 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); 01273 01274 mutex_exit(&msp->ms_lock); 01275 } 01276 01277 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) 01278 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 01279 01280 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); 01281 01282 mutex_exit(&msp->ms_lock); 01283 01284 return (offset); 01285 } 01286 01290 static int 01291 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 01292 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 01293 { 01294 metaslab_group_t *mg, *rotor; 01295 vdev_t *vd; 01296 int dshift = 3; 01297 int all_zero; 01298 int zio_lock = B_FALSE; 01299 boolean_t allocatable; 01300 uint64_t offset = -1ULL; 01301 uint64_t asize; 01302 uint64_t distance; 01303 01304 ASSERT(!DVA_IS_VALID(&dva[d])); 01305 01306 /* 01307 * For testing, make some blocks above a certain size be gang blocks. 01308 */ 01309 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 01310 return (ENOSPC); 01311 01312 /* 01313 * Start at the rotor and loop through all mgs until we find something. 01314 * Note that there's no locking on mc_rotor or mc_aliquot because 01315 * nothing actually breaks if we miss a few updates -- we just won't 01316 * allocate quite as evenly. It all balances out over time. 01317 * 01318 * If we are doing ditto or log blocks, try to spread them across 01319 * consecutive vdevs. 
If we're forced to reuse a vdev before we've 01320 * allocated all of our ditto blocks, then try and spread them out on 01321 * that vdev as much as possible. If it turns out to not be possible, 01322 * gradually lower our standards until anything becomes acceptable. 01323 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 01324 * gives us hope of containing our fault domains to something we're 01325 * able to reason about. Otherwise, any two top-level vdev failures 01326 * will guarantee the loss of data. With consecutive allocation, 01327 * only two adjacent top-level vdev failures will result in data loss. 01328 * 01329 * If we are doing gang blocks (hintdva is non-NULL), try to keep 01330 * ourselves on the same vdev as our gang block header. That 01331 * way, we can hope for locality in vdev_cache, plus it makes our 01332 * fault domains something tractable. 01333 */ 01334 if (hintdva) { 01335 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 01336 01337 /* 01338 * It's possible the vdev we're using as the hint no 01339 * longer exists (i.e. removed). Consult the rotor when 01340 * all else fails. 01341 */ 01342 if (vd != NULL) { 01343 mg = vd->vdev_mg; 01344 01345 if (flags & METASLAB_HINTBP_AVOID && 01346 mg->mg_next != NULL) 01347 mg = mg->mg_next; 01348 } else { 01349 mg = mc->mc_rotor; 01350 } 01351 } else if (d != 0) { 01352 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 01353 mg = vd->vdev_mg->mg_next; 01354 } else { 01355 mg = mc->mc_rotor; 01356 } 01357 01358 /* 01359 * If the hint put us into the wrong metaslab class, or into a 01360 * metaslab group that has been passivated, just follow the rotor. 01361 */ 01362 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 01363 mg = mc->mc_rotor; 01364 01365 rotor = mg; 01366 top: 01367 all_zero = B_TRUE; 01368 do { 01369 ASSERT(mg->mg_activation_count == 1); 01370 01371 vd = mg->mg_vd; 01372 01373 /* 01374 * Don't allocate from faulted devices. 01375 */ 01376 if (zio_lock) { 01377 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 01378 allocatable = vdev_allocatable(vd); 01379 spa_config_exit(spa, SCL_ZIO, FTAG); 01380 } else { 01381 allocatable = vdev_allocatable(vd); 01382 } 01383 if (!allocatable) 01384 goto next; 01385 01386 /* 01387 * Avoid writing single-copy data to a failing vdev 01388 */ 01389 if ((vd->vdev_stat.vs_write_errors > 0 || 01390 vd->vdev_state < VDEV_STATE_HEALTHY) && 01391 d == 0 && dshift == 3) { 01392 all_zero = B_FALSE; 01393 goto next; 01394 } 01395 01396 ASSERT(mg->mg_class == mc); 01397 01398 distance = vd->vdev_asize >> dshift; 01399 if (distance <= (1ULL << vd->vdev_ms_shift)) 01400 distance = 0; 01401 else 01402 all_zero = B_FALSE; 01403 01404 asize = vdev_psize_to_asize(vd, psize); 01405 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 01406 01407 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 01408 dva, d, flags); 01409 if (offset != -1ULL) { 01410 /* 01411 * If we've just selected this metaslab group, 01412 * figure out whether the corresponding vdev is 01413 * over- or under-used relative to the pool, 01414 * and set an allocation bias to even it out. 01415 */ 01416 if (mc->mc_aliquot == 0) { 01417 vdev_stat_t *vs = &vd->vdev_stat; 01418 int64_t vu, cu; 01419 01420 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 01421 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 01422 01423 /* 01424 * Calculate how much more or less we should 01425 * try to allocate from this device during 01426 * this iteration around the rotor. 
01427 * For example, if a device is 80% full 01428 * and the pool is 20% full then we should 01429 * reduce allocations by 60% on this device. 01430 * 01431 * mg_bias = (20 - 80) * 512K / 100 = -307K 01432 * 01433 * This reduces allocations by 307K for this 01434 * iteration. 01435 */ 01436 mg->mg_bias = ((cu - vu) * 01437 (int64_t)mg->mg_aliquot) / 100; 01438 } 01439 01440 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 01441 mg->mg_aliquot + mg->mg_bias) { 01442 mc->mc_rotor = mg->mg_next; 01443 mc->mc_aliquot = 0; 01444 } 01445 01446 DVA_SET_VDEV(&dva[d], vd->vdev_id); 01447 DVA_SET_OFFSET(&dva[d], offset); 01448 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 01449 DVA_SET_ASIZE(&dva[d], asize); 01450 01451 return (0); 01452 } 01453 next: 01454 mc->mc_rotor = mg->mg_next; 01455 mc->mc_aliquot = 0; 01456 } while ((mg = mg->mg_next) != rotor); 01457 01458 if (!all_zero) { 01459 dshift++; 01460 ASSERT(dshift < 64); 01461 goto top; 01462 } 01463 01464 if (!allocatable && !zio_lock) { 01465 dshift = 3; 01466 zio_lock = B_TRUE; 01467 goto top; 01468 } 01469 01470 bzero(&dva[d], sizeof (dva_t)); 01471 01472 return (ENOSPC); 01473 } 01474 01479 static void 01480 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 01481 { 01482 uint64_t vdev = DVA_GET_VDEV(dva); 01483 uint64_t offset = DVA_GET_OFFSET(dva); 01484 uint64_t size = DVA_GET_ASIZE(dva); 01485 vdev_t *vd; 01486 metaslab_t *msp; 01487 01488 ASSERT(DVA_IS_VALID(dva)); 01489 01490 if (txg > spa_freeze_txg(spa)) 01491 return; 01492 01493 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 01494 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 01495 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 01496 (u_longlong_t)vdev, (u_longlong_t)offset); 01497 ASSERT(0); 01498 return; 01499 } 01500 01501 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 01502 01503 if (DVA_GET_GANG(dva)) 01504 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 01505 01506 mutex_enter(&msp->ms_lock); 01507 01508 if (now) { 01509 space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], 01510 offset, size); 01511 space_map_free(&msp->ms_map, offset, size); 01512 } else { 01513 if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) 01514 vdev_dirty(vd, VDD_METASLAB, msp, txg); 01515 space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); 01516 } 01517 01518 mutex_exit(&msp->ms_lock); 01519 } 01520 01527 static int 01528 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 01529 { 01530 uint64_t vdev = DVA_GET_VDEV(dva); 01531 uint64_t offset = DVA_GET_OFFSET(dva); 01532 uint64_t size = DVA_GET_ASIZE(dva); 01533 vdev_t *vd; 01534 metaslab_t *msp; 01535 int error = 0; 01536 01537 ASSERT(DVA_IS_VALID(dva)); 01538 01539 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 01540 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 01541 return (ENXIO); 01542 01543 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 01544 01545 if (DVA_GET_GANG(dva)) 01546 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 01547 01548 mutex_enter(&msp->ms_lock); 01549 01550 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) 01551 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 01552 01553 if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) 01554 error = ENOENT; 01555 01556 if (error || txg == 0) { /* txg == 0 indicates dry run */ 01557 mutex_exit(&msp->ms_lock); 01558 return (error); 01559 } 01560 01561 space_map_claim(&msp->ms_map, offset, size); 01562 01563 if (spa_writeable(spa)) { /* don't dirty if we're 
zdb(1M) */ 01564 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) 01565 vdev_dirty(vd, VDD_METASLAB, msp, txg); 01566 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); 01567 } 01568 01569 mutex_exit(&msp->ms_lock); 01570 01571 return (0); 01572 } 01573 01574 int 01575 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 01576 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 01577 { 01578 dva_t *dva = bp->blk_dva; 01579 dva_t *hintdva = hintbp->blk_dva; 01580 int error = 0; 01581 01582 ASSERT(bp->blk_birth == 0); 01583 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 01584 01585 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 01586 01587 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 01588 spa_config_exit(spa, SCL_ALLOC, FTAG); 01589 return (ENOSPC); 01590 } 01591 01592 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 01593 ASSERT(BP_GET_NDVAS(bp) == 0); 01594 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 01595 01596 for (int d = 0; d < ndvas; d++) { 01597 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 01598 txg, flags); 01599 if (error) { 01600 for (d--; d >= 0; d--) { 01601 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 01602 bzero(&dva[d], sizeof (dva_t)); 01603 } 01604 spa_config_exit(spa, SCL_ALLOC, FTAG); 01605 return (error); 01606 } 01607 } 01608 ASSERT(error == 0); 01609 ASSERT(BP_GET_NDVAS(bp) == ndvas); 01610 01611 spa_config_exit(spa, SCL_ALLOC, FTAG); 01612 01613 BP_SET_BIRTH(bp, txg, txg); 01614 01615 return (0); 01616 } 01617 01618 void 01619 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 01620 { 01621 const dva_t *dva = bp->blk_dva; 01622 int ndvas = BP_GET_NDVAS(bp); 01623 01624 ASSERT(!BP_IS_HOLE(bp)); 01625 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 01626 01627 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 01628 01629 for (int d = 0; d < ndvas; d++) 01630 metaslab_free_dva(spa, &dva[d], txg, now); 01631 01632 spa_config_exit(spa, SCL_FREE, FTAG); 01633 } 01634 01635 int 01636 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 01637 { 01638 const dva_t *dva = bp->blk_dva; 01639 int ndvas = BP_GET_NDVAS(bp); 01640 int error = 0; 01641 01642 ASSERT(!BP_IS_HOLE(bp)); 01643 01644 if (txg != 0) { 01645 /* 01646 * First do a dry run to make sure all DVAs are claimable, 01647 * so we don't have to unwind from partial failures below. 01648 */ 01649 if ((error = metaslab_claim(spa, bp, 0)) != 0) 01650 return (error); 01651 } 01652 01653 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 01654 01655 for (int d = 0; d < ndvas; d++) 01656 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 01657 break; 01658 01659 spa_config_exit(spa, SCL_ALLOC, FTAG); 01660 01661 ASSERT(error == 0 || txg == 0); 01662 01663 return (error); 01664 }
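
A few of the algorithms above are easier to see in isolation. The short, self-contained sketches that follow model them in plain C with illustrative names (the toy_* structures, functions, and constants are not part of metaslab.c; they are hedged restatements of its arithmetic). First, metaslab_compare() keeps each group's AVL tree ordered by descending weight, with the metaslab's starting offset as a tiebreaker so that no two entries ever compare equal. The same ordering, applied with qsort() to a toy array:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct toy_metaslab {
	uint64_t weight;
	uint64_t start;
} toy_metaslab_t;

/* Same ordering rules as metaslab_compare(): weight descending, start ascending. */
static int
toy_compare(const void *x1, const void *x2)
{
	const toy_metaslab_t *m1 = x1;
	const toy_metaslab_t *m2 = x2;

	if (m1->weight < m2->weight)
		return (1);
	if (m1->weight > m2->weight)
		return (-1);
	if (m1->start < m2->start)
		return (-1);
	if (m1->start > m2->start)
		return (1);
	return (0);
}

int
main(void)
{
	toy_metaslab_t ms[] = {
		{ 100, 3 }, { 300, 7 }, { 300, 2 }, { 50, 1 },
	};
	int n = sizeof (ms) / sizeof (ms[0]);

	qsort(ms, n, sizeof (ms[0]), toy_compare);

	/* Heaviest first; equal weights ordered by offset. */
	for (int i = 0; i < n; i++)
		printf("weight %3" PRIu64 "  start %" PRIu64 "\n",
		    ms[i].weight, ms[i].start);
	return (0);
}

Sorting heaviest-first is what lets metaslab_group_alloc() walk the tree from avl_first() and stop as soon as it reaches a metaslab whose weight is below the requested size.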
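
metaslab_group_activate() and metaslab_group_passivate() maintain the class rotor as a circular, doubly linked list of allocatable groups; the rotor pointer is what metaslab_alloc_dva() spins around. The list surgery on its own, reduced to a user-space sketch with illustrative names:

#include <stdio.h>

typedef struct toy_group {
	struct toy_group *prev;
	struct toy_group *next;
	int id;
} toy_group_t;

/* Insert mg into the ring and point the rotor at it, as in metaslab_group_activate(). */
static void
toy_activate(toy_group_t **rotor, toy_group_t *mg)
{
	toy_group_t *mgprev, *mgnext;

	if ((mgprev = *rotor) == NULL) {
		mg->prev = mg;
		mg->next = mg;
	} else {
		mgnext = mgprev->next;
		mg->prev = mgprev;
		mg->next = mgnext;
		mgprev->next = mg;
		mgnext->prev = mg;
	}
	*rotor = mg;
}

/* Unlink mg from the ring, advancing the rotor, as in metaslab_group_passivate(). */
static void
toy_passivate(toy_group_t **rotor, toy_group_t *mg)
{
	if (mg == mg->next) {
		*rotor = NULL;
	} else {
		*rotor = mg->next;
		mg->prev->next = mg->next;
		mg->next->prev = mg->prev;
	}
	mg->prev = NULL;
	mg->next = NULL;
}

int
main(void)
{
	toy_group_t a = { .id = 0 }, b = { .id = 1 }, c = { .id = 2 };
	toy_group_t *rotor = NULL;

	toy_activate(&rotor, &a);
	toy_activate(&rotor, &b);
	toy_activate(&rotor, &c);
	toy_passivate(&rotor, &b);

	/* Walk one full turn of the remaining ring: prints group 2, then group 0. */
	toy_group_t *mg = rotor;
	do {
		printf("group %d\n", mg->id);
	} while ((mg = mg->next) != rotor);
	return (0);
}

Note that passivation points the rotor at the departing group's successor before unlinking, so the class never ends up holding a pointer to a removed group.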
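
metaslab_ff_alloc() and metaslab_block_picker() implement first-fit with one rotating cursor per power-of-two alignment: the alignment is the largest power of two dividing the request (size & -size), the cursor lives at index highbit(align) - 1 of the sm_ppd array, and the search wraps around the map once before giving up. A condensed sketch of the same idea over a plain array of free segments instead of the space-map AVL tree; it does not maintain the map after an allocation, and the helper names are illustrative:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct seg {
	uint64_t start;
	uint64_t end;
} seg_t;

/* Round x up to the next multiple of align (align must be a power of two). */
static uint64_t
p2roundup(uint64_t x, uint64_t align)
{
	return ((x + align - 1) & ~(align - 1));
}

/*
 * First-fit search starting at *cursor, wrapping around at most once,
 * in the spirit of metaslab_block_picker().  Returns an offset or -1ULL.
 */
static uint64_t
block_pick(const seg_t *segs, int nsegs, uint64_t *cursor,
    uint64_t size, uint64_t align)
{
	for (;;) {
		for (int i = 0; i < nsegs; i++) {
			if (segs[i].end <= *cursor)
				continue;
			uint64_t off = p2roundup(
			    segs[i].start > *cursor ? segs[i].start : *cursor,
			    align);
			if (off + size <= segs[i].end) {
				*cursor = off + size;
				return (off);
			}
		}
		if (*cursor == 0)	/* searched the whole map: give up */
			return (-1ULL);
		*cursor = 0;		/* wrap around and retry once */
	}
}

int
main(void)
{
	seg_t free_segs[] = {
		{ 0x0000, 0x2000 },	/* 8K free at offset 0 */
		{ 0x5000, 0x5800 },	/* 2K free at offset 20K */
	};
	uint64_t cursors[64] = { 0 };	/* one cursor per alignment, like sm_ppd */
	uint64_t size = 0x1000;		/* 4K request */
	uint64_t align = size & -size;	/* largest power of two dividing size */
	uint64_t *cursor = &cursors[63 - __builtin_clzll(align)];

	printf("first  4K block at 0x%" PRIx64 "\n",
	    block_pick(free_segs, 2, cursor, size, align));
	printf("second 4K block at 0x%" PRIx64 "\n",
	    block_pick(free_segs, 2, cursor, size, align));
	return (0);
}

Keeping a separate cursor per alignment means small and large requests sweep the metaslab independently, so a stream of small allocations is less likely to drag the cursor used for large requests into fragmented territory.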
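
The dynamic block allocator (metaslab_df_ops) stays with first-fit until either the largest free segment drops below metaslab_df_alloc_threshold (SPA_MAXBLOCKSIZE, 128K in this code base) or the free percentage drops below metaslab_df_free_pct (4%), and then falls back to best-fit via the size-sorted tree. The decision reduces to this predicate, sketched here with the default tunable values quoted above:

#include <stdint.h>
#include <stdio.h>

#define	DF_ALLOC_THRESHOLD	(128ULL << 10)	/* metaslab_df_alloc_threshold */
#define	DF_FREE_PCT		4		/* metaslab_df_free_pct */

/*
 * Mirrors the switch in metaslab_df_alloc(): use best-fit (the size-sorted
 * tree) when the metaslab looks fragmented, first-fit otherwise.
 */
static int
df_should_use_best_fit(uint64_t max_free_seg, uint64_t sm_space,
    uint64_t sm_size)
{
	int free_pct = sm_space * 100 / sm_size;

	return (max_free_seg < DF_ALLOC_THRESHOLD || free_pct < DF_FREE_PCT);
}

int
main(void)
{
	uint64_t ms_size = 1ULL << 34;			/* 16 GB metaslab */

	printf("plenty free, big segments:  %s\n",
	    df_should_use_best_fit(1ULL << 24, ms_size / 2, ms_size) ?
	    "best-fit" : "first-fit");
	printf("3%% free, 64K max segment:   %s\n",
	    df_should_use_best_fit(64ULL << 10, ms_size * 3 / 100, ms_size) ?
	    "best-fit" : "first-fit");
	return (0);
}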
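
metaslab_weight() starts from the metaslab's free space and scales it by position so that outer, lower-offset metaslabs are worth up to twice as much as inner ones, matching the roughly 2:1 outer-to-inner bandwidth ratio described in its comment. A minimal sketch of just that arithmetic, with plain integers standing in for the space map and vdev fields:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Simplified model of the baseline weight formula in metaslab_weight():
 *
 *	weight = 2 * space - (ms_index * space) / ms_count
 *
 * where ms_index = sm_start >> vdev_ms_shift.  The first metaslab of a
 * vdev gets weight == 2 * space; the last one gets roughly weight == space.
 */
static uint64_t
toy_metaslab_weight(uint64_t ms_size, uint64_t ms_alloc,
    uint64_t ms_index, uint64_t ms_count)
{
	uint64_t space = ms_size - ms_alloc;	/* free space: the baseline */
	uint64_t weight = 2 * space - (ms_index * space) / ms_count;

	assert(weight >= space && weight <= 2 * space);
	return (weight);
}

int
main(void)
{
	uint64_t ms_size = 1ULL << 34;		/* 16 GB metaslab */
	uint64_t ms_count = 128;		/* metaslabs on this vdev */

	for (uint64_t i = 0; i < ms_count; i += 32) {
		printf("metaslab %3" PRIu64 ": weight = %" PRIu64 " MB\n",
		    i, toy_metaslab_weight(ms_size, 0, i, ms_count) >> 20);
	}
	return (0);
}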
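
metaslab_sync() condenses the on-disk space map on sync pass 1 once the object has grown to at least two 8-byte entries per in-core segment, i.e. once the append-only log is no smaller than roughly twice what a fresh rewrite would need, per the factor-of-two remark in its comment. A quick sketch of that threshold; the segment counts and object sizes below are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Condense when the on-disk object is at least as large as two 8-byte
 * entries per in-core segment, the same test as in metaslab_sync():
 *
 *	smo_objsize >= 2 * sizeof (uint64_t) * avl_numnodes(&sm_root)
 */
static int
should_condense(uint64_t smo_objsize, uint64_t incore_segments)
{
	return (smo_objsize >= 2 * sizeof (uint64_t) * incore_segments);
}

int
main(void)
{
	/* A long-lived metaslab: 1,000 segments but 1 MB of append-only log. */
	printf("old, churned map:  %s\n",
	    should_condense(1ULL << 20, 1000) ? "condense" : "keep");

	/* A freshly condensed map: 1,000 segments in about 8 KB. */
	printf("fresh map:         %s\n",
	    should_condense(8ULL << 10, 1000) ? "condense" : "keep");
	return (0);
}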
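
metaslab_distance() measures, in bytes, how far a candidate metaslab is from an already-allocated DVA on the same vdev; a different vdev counts as 2^63, effectively infinite. metaslab_group_alloc() compares it against a target distance to keep ditto copies spread apart. A small sketch of the same calculation with plain integers and illustrative names:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Distance between a metaslab and a previous allocation, following
 * metaslab_distance(): compare metaslab indices on the same vdev and
 * shift back to bytes; different vdevs are "infinitely" far apart.
 */
static uint64_t
toy_metaslab_distance(uint64_t ms_vdev, uint64_t ms_start, uint64_t ms_shift,
    uint64_t dva_vdev, uint64_t dva_offset)
{
	uint64_t start = ms_start >> ms_shift;
	uint64_t offset = dva_offset >> ms_shift;

	if (ms_vdev != dva_vdev)
		return (1ULL << 63);
	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}

int
main(void)
{
	uint64_t ms_shift = 34;		/* 16 GB metaslabs */

	/* Same vdev, three metaslabs apart: 48 GB. */
	printf("%" PRIu64 " GB\n",
	    toy_metaslab_distance(0, 5ULL << ms_shift, ms_shift,
	    0, 2ULL << ms_shift) >> 30);

	/* Different vdev: effectively infinite. */
	printf("0x%" PRIx64 "\n",
	    toy_metaslab_distance(0, 5ULL << ms_shift, ms_shift,
	    1, 2ULL << ms_shift));
	return (0);
}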
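
Finally, the allocation-bias comment in metaslab_alloc_dva() works a concrete case: a vdev that is 80% full inside a pool that is 20% full should see its 512K aliquot reduced by roughly 307K for the current trip around the rotor. The same arithmetic as a stand-alone program (the variable names are illustrative):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t aliquot = 512LL << 10;	/* metaslab_aliquot: 512K per pass */
	int64_t vu = 80;		/* vdev utilization, percent */
	int64_t cu = 20;		/* class (pool) utilization, percent */

	/* Same expression as mg->mg_bias in metaslab_alloc_dva(). */
	int64_t mg_bias = ((cu - vu) * aliquot) / 100;

	/* Prints -314572 bytes, i.e. roughly -307K, matching the comment. */
	printf("mg_bias = %" PRId64 " bytes (%" PRId64 "K)\n",
	    mg_bias, mg_bias / 1024);
	return (0);
}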