FreeBSD ZFS
The Zettabyte File System

metaslab.c

00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 /*
00022  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
00023  * Copyright (c) 2012 by Delphix. All rights reserved.
00024  */
00025 
00026 #include <sys/zfs_context.h>
00027 #include <sys/dmu.h>
00028 #include <sys/dmu_tx.h>
00029 #include <sys/space_map.h>
00030 #include <sys/metaslab_impl.h>
00031 #include <sys/vdev_impl.h>
00032 #include <sys/zio.h>
00033 
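/*
 * CAN_FASTGANG() is true when none of the gang-related flags are set on an
 * allocation request, i.e. the request may still fall back to a gang block
 * quickly rather than retrying (see metaslab_group_alloc()).
 */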
00042 #define CAN_FASTGANG(flags) \
00043         (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
00044         METASLAB_GANG_AVOID)))
00045 
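/*
 * metaslab_aliquot is the amount allocated from a metaslab group (scaled by
 * the number of children of its top-level vdev) before the class rotor
 * advances to the next group.  metaslab_gang_bang forces gang blocks above a
 * given psize for testing; the default of SPA_MAXBLOCKSIZE + 1 disables it.
 */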
00046 uint64_t metaslab_aliquot = 512ULL << 10;
00047 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;     /* force gang blocks */
00048 
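/*
 * Once a metaslab group has accumulated this many allocation failures,
 * sufficiently large allocations from its primary metaslab may be skipped in
 * favor of ganging (see metaslab_group_alloc()).  The per-group counter is
 * reset in metaslab_sync_reassess().
 */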
00056 int zfs_mg_alloc_failures = 0;
00057 
00058 SYSCTL_DECL(_vfs_zfs);
00059 SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RDTUN,
00060     &zfs_mg_alloc_failures, 0,
00061     "Number of allowed allocation failures per vdev");
00062 TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
00063 
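/*
 * When set, space maps are loaded at metaslab_init() time and are never
 * unloaded in metaslab_sync_done(), which helps when inspecting space map
 * contents at the cost of memory.
 */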
00067 static int metaslab_debug = 0;
00068 
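/*
 * Once the largest free segment in a metaslab drops below this size, the
 * dynamic (DF) allocator stops using first-fit and switches to best-fit
 * allocation; see metaslab_df_alloc().
 */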
00075 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
00076 
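/*
 * The dynamic (DF) allocator also switches to best-fit once the free space
 * in a metaslab drops below this percentage of its size.
 */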
00083 int metaslab_df_free_pct = 4;
00084 
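/*
 * Preferred minimum extent size, used as a clump-size hint by the
 * experimental CDF and NDF allocators below.
 */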
00089 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
00090 
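/*
 * Upper bound on the number of metaslabs whose space map objects are
 * prefetched per group in metaslab_prefetch().
 */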
00094 int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
00095 
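/*
 * Percentage weight bonus granted in metaslab_weight() to metaslabs that lie
 * at or below the group's current bonus area.
 */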
00099 int metaslab_smo_bonus_pct = 150;
00100 
00101 /*
00102  * ==========================================================================
00103  * Metaslab classes
00104  * ==========================================================================
00105  */
00106 metaslab_class_t *
00107 metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
00108 {
00109         metaslab_class_t *mc;
00110 
00111         mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
00112 
00113         mc->mc_spa = spa;
00114         mc->mc_rotor = NULL;
00115         mc->mc_ops = ops;
00116 
00117         return (mc);
00118 }
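
/*
 * Illustrative usage (a sketch, not code from this file): the SPA creates
 * its allocation classes with the default ops vector and tears them down on
 * unload, roughly:
 *
 *	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
 *	...
 *	metaslab_class_destroy(spa->spa_normal_class);
 */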
00119 
00120 void
00121 metaslab_class_destroy(metaslab_class_t *mc)
00122 {
00123         ASSERT(mc->mc_rotor == NULL);
00124         ASSERT(mc->mc_alloc == 0);
00125         ASSERT(mc->mc_deferred == 0);
00126         ASSERT(mc->mc_space == 0);
00127         ASSERT(mc->mc_dspace == 0);
00128 
00129         kmem_free(mc, sizeof (metaslab_class_t));
00130 }
00131 
00132 int
00133 metaslab_class_validate(metaslab_class_t *mc)
00134 {
00135         metaslab_group_t *mg;
00136         vdev_t *vd;
00137 
00138         /*
00139          * Must hold one of the spa_config locks.
00140          */
00141         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
00142             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
00143 
00144         if ((mg = mc->mc_rotor) == NULL)
00145                 return (0);
00146 
00147         do {
00148                 vd = mg->mg_vd;
00149                 ASSERT(vd->vdev_mg != NULL);
00150                 ASSERT3P(vd->vdev_top, ==, vd);
00151                 ASSERT3P(mg->mg_class, ==, mc);
00152                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
00153         } while ((mg = mg->mg_next) != mc->mc_rotor);
00154 
00155         return (0);
00156 }
00157 
00158 void
00159 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
00160     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
00161 {
00162         atomic_add_64(&mc->mc_alloc, alloc_delta);
00163         atomic_add_64(&mc->mc_deferred, defer_delta);
00164         atomic_add_64(&mc->mc_space, space_delta);
00165         atomic_add_64(&mc->mc_dspace, dspace_delta);
00166 }
00167 
00168 uint64_t
00169 metaslab_class_get_alloc(metaslab_class_t *mc)
00170 {
00171         return (mc->mc_alloc);
00172 }
00173 
00174 uint64_t
00175 metaslab_class_get_deferred(metaslab_class_t *mc)
00176 {
00177         return (mc->mc_deferred);
00178 }
00179 
00180 uint64_t
00181 metaslab_class_get_space(metaslab_class_t *mc)
00182 {
00183         return (mc->mc_space);
00184 }
00185 
00186 uint64_t
00187 metaslab_class_get_dspace(metaslab_class_t *mc)
00188 {
00189         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
00190 }
00191 
00192 /*
00193  * ==========================================================================
00194  * Metaslab groups
00195  * ==========================================================================
00196  */
00197 static int
00198 metaslab_compare(const void *x1, const void *x2)
00199 {
00200         const metaslab_t *m1 = x1;
00201         const metaslab_t *m2 = x2;
00202 
00203         if (m1->ms_weight < m2->ms_weight)
00204                 return (1);
00205         if (m1->ms_weight > m2->ms_weight)
00206                 return (-1);
00207 
00208         /*
00209          * If the weights are identical, use the offset to force uniqueness.
00210          */
00211         if (m1->ms_map.sm_start < m2->ms_map.sm_start)
00212                 return (-1);
00213         if (m1->ms_map.sm_start > m2->ms_map.sm_start)
00214                 return (1);
00215 
00216         ASSERT3P(m1, ==, m2);
00217 
00218         return (0);
00219 }
00220 
00221 metaslab_group_t *
00222 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
00223 {
00224         metaslab_group_t *mg;
00225 
00226         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
00227         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
00228         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
00229             sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
00230         mg->mg_vd = vd;
00231         mg->mg_class = mc;
00232         mg->mg_activation_count = 0;
00233 
00234         return (mg);
00235 }
00236 
00237 void
00238 metaslab_group_destroy(metaslab_group_t *mg)
00239 {
00240         ASSERT(mg->mg_prev == NULL);
00241         ASSERT(mg->mg_next == NULL);
00242         /*
00243          * We may have gone below zero with the activation count
00244          * either because we never activated in the first place or
00245          * because we're done, and possibly removing the vdev.
00246          */
00247         ASSERT(mg->mg_activation_count <= 0);
00248 
00249         avl_destroy(&mg->mg_metaslab_tree);
00250         mutex_destroy(&mg->mg_lock);
00251         kmem_free(mg, sizeof (metaslab_group_t));
00252 }
00253 
00254 void
00255 metaslab_group_activate(metaslab_group_t *mg)
00256 {
00257         metaslab_class_t *mc = mg->mg_class;
00258         metaslab_group_t *mgprev, *mgnext;
00259 
00260         ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
00261 
00262         ASSERT(mc->mc_rotor != mg);
00263         ASSERT(mg->mg_prev == NULL);
00264         ASSERT(mg->mg_next == NULL);
00265         ASSERT(mg->mg_activation_count <= 0);
00266 
00267         if (++mg->mg_activation_count <= 0)
00268                 return;
00269 
00270         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
00271 
00272         if ((mgprev = mc->mc_rotor) == NULL) {
00273                 mg->mg_prev = mg;
00274                 mg->mg_next = mg;
00275         } else {
00276                 mgnext = mgprev->mg_next;
00277                 mg->mg_prev = mgprev;
00278                 mg->mg_next = mgnext;
00279                 mgprev->mg_next = mg;
00280                 mgnext->mg_prev = mg;
00281         }
00282         mc->mc_rotor = mg;
00283 }
00284 
00285 void
00286 metaslab_group_passivate(metaslab_group_t *mg)
00287 {
00288         metaslab_class_t *mc = mg->mg_class;
00289         metaslab_group_t *mgprev, *mgnext;
00290 
00291         ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
00292 
00293         if (--mg->mg_activation_count != 0) {
00294                 ASSERT(mc->mc_rotor != mg);
00295                 ASSERT(mg->mg_prev == NULL);
00296                 ASSERT(mg->mg_next == NULL);
00297                 ASSERT(mg->mg_activation_count < 0);
00298                 return;
00299         }
00300 
00301         mgprev = mg->mg_prev;
00302         mgnext = mg->mg_next;
00303 
00304         if (mg == mgnext) {
00305                 mc->mc_rotor = NULL;
00306         } else {
00307                 mc->mc_rotor = mgnext;
00308                 mgprev->mg_next = mgnext;
00309                 mgnext->mg_prev = mgprev;
00310         }
00311 
00312         mg->mg_prev = NULL;
00313         mg->mg_next = NULL;
00314 }
00315 
00316 static void
00317 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
00318 {
00319         mutex_enter(&mg->mg_lock);
00320         ASSERT(msp->ms_group == NULL);
00321         msp->ms_group = mg;
00322         msp->ms_weight = 0;
00323         avl_add(&mg->mg_metaslab_tree, msp);
00324         mutex_exit(&mg->mg_lock);
00325 }
00326 
00327 static void
00328 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
00329 {
00330         mutex_enter(&mg->mg_lock);
00331         ASSERT(msp->ms_group == mg);
00332         avl_remove(&mg->mg_metaslab_tree, msp);
00333         msp->ms_group = NULL;
00334         mutex_exit(&mg->mg_lock);
00335 }
00336 
00337 static void
00338 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
00339 {
00340         /*
00341          * Although in principle the weight can be any value, in
00342          * practice we do not use values in the range [1, 510].
00343          */
00344         ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
00345         ASSERT(MUTEX_HELD(&msp->ms_lock));
00346 
00347         mutex_enter(&mg->mg_lock);
00348         ASSERT(msp->ms_group == mg);
00349         avl_remove(&mg->mg_metaslab_tree, msp);
00350         msp->ms_weight = weight;
00351         avl_add(&mg->mg_metaslab_tree, msp);
00352         mutex_exit(&mg->mg_lock);
00353 }
00354 
00355 /*
00356  * ==========================================================================
00357  * Common allocator routines
00358  * ==========================================================================
00359  */
00360 static int
00361 metaslab_segsize_compare(const void *x1, const void *x2)
00362 {
00363         const space_seg_t *s1 = x1;
00364         const space_seg_t *s2 = x2;
00365         uint64_t ss_size1 = s1->ss_end - s1->ss_start;
00366         uint64_t ss_size2 = s2->ss_end - s2->ss_start;
00367 
00368         if (ss_size1 < ss_size2)
00369                 return (-1);
00370         if (ss_size1 > ss_size2)
00371                 return (1);
00372 
00373         if (s1->ss_start < s2->ss_start)
00374                 return (-1);
00375         if (s1->ss_start > s2->ss_start)
00376                 return (1);
00377 
00378         return (0);
00379 }
00380 
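/*
 * Common cursor-based segment walker used by the allocators below: scan the
 * given AVL tree of free segments starting at *cursor, return the first
 * suitably aligned segment large enough for the request, and advance the
 * cursor past it.  The search wraps around to the start of the map once
 * before giving up and returning -1ULL.
 */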
00386 static uint64_t
00387 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
00388     uint64_t align)
00389 {
00390         space_seg_t *ss, ssearch;
00391         avl_index_t where;
00392 
00393         ssearch.ss_start = *cursor;
00394         ssearch.ss_end = *cursor + size;
00395 
00396         ss = avl_find(t, &ssearch, &where);
00397         if (ss == NULL)
00398                 ss = avl_nearest(t, where, AVL_AFTER);
00399 
00400         while (ss != NULL) {
00401                 uint64_t offset = P2ROUNDUP(ss->ss_start, align);
00402 
00403                 if (offset + size <= ss->ss_end) {
00404                         *cursor = offset + size;
00405                         return (offset);
00406                 }
00407                 ss = AVL_NEXT(t, ss);
00408         }
00409 
00410         /*
00411          * If we know we've searched the whole map (*cursor == 0), give up.
00412          * Otherwise, reset the cursor to the beginning and try again.
00413          */
00414         if (*cursor == 0)
00415                 return (-1ULL);
00416 
00417         *cursor = 0;
00418         return (metaslab_block_picker(t, cursor, size, align));
00419 }
00420 
00421 static void
00422 metaslab_pp_load(space_map_t *sm)
00423 {
00424         space_seg_t *ss;
00425 
00426         ASSERT(sm->sm_ppd == NULL);
00427         sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
00428 
00429         sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
00430         avl_create(sm->sm_pp_root, metaslab_segsize_compare,
00431             sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
00432 
00433         for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
00434                 avl_add(sm->sm_pp_root, ss);
00435 }
00436 
00437 static void
00438 metaslab_pp_unload(space_map_t *sm)
00439 {
00440         void *cookie = NULL;
00441 
00442         kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
00443         sm->sm_ppd = NULL;
00444 
00445         while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
00446                 /* tear down the tree */
00447         }
00448 
00449         avl_destroy(sm->sm_pp_root);
00450         kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
00451         sm->sm_pp_root = NULL;
00452 }
00453 
00454 /* ARGSUSED */
00455 static void
00456 metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
00457 {
00458         /* No need to update cursor */
00459 }
00460 
00461 /* ARGSUSED */
00462 static void
00463 metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
00464 {
00465         /* No need to update cursor */
00466 }
00467 
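/*
 * Return the size of the largest contiguous free segment in the map, using
 * the size-sorted (picker-private) AVL tree.
 */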
00471 uint64_t
00472 metaslab_pp_maxsize(space_map_t *sm)
00473 {
00474         avl_tree_t *t = sm->sm_pp_root;
00475         space_seg_t *ss;
00476 
00477         if (t == NULL || (ss = avl_last(t)) == NULL)
00478                 return (0ULL);
00479 
00480         return (ss->ss_end - ss->ss_start);
00481 }
00482 
00483 /*
00484  * ==========================================================================
00485  * The first-fit block allocator
00486  * ==========================================================================
00487  */
00488 static uint64_t
00489 metaslab_ff_alloc(space_map_t *sm, uint64_t size)
00490 {
00491         avl_tree_t *t = &sm->sm_root;
00492         uint64_t align = size & -size;
00493         uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
00494 
00495         return (metaslab_block_picker(t, cursor, size, align));
00496 }
00497 
00498 /* ARGSUSED */
00499 boolean_t
00500 metaslab_ff_fragmented(space_map_t *sm)
00501 {
00502         return (B_TRUE);
00503 }
00504 
00505 static space_map_ops_t metaslab_ff_ops = {
00506         metaslab_pp_load,
00507         metaslab_pp_unload,
00508         metaslab_ff_alloc,
00509         metaslab_pp_claim,
00510         metaslab_pp_free,
00511         metaslab_pp_maxsize,
00512         metaslab_ff_fragmented
00513 };
00514 
00515 /*
00516  * ==========================================================================
00517  * Dynamic block allocator 
00518  *
00519  * Uses the first-fit allocation scheme until space gets low and then
00520  * switches to a best-fit allocation method. Uses metaslab_df_alloc_threshold
00521  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
00522  * ==========================================================================
00523  */
00524 static uint64_t
00525 metaslab_df_alloc(space_map_t *sm, uint64_t size)
00526 {
00527         avl_tree_t *t = &sm->sm_root;
00528         uint64_t align = size & -size;
00529         uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
00530         uint64_t max_size = metaslab_pp_maxsize(sm);
00531         int free_pct = sm->sm_space * 100 / sm->sm_size;
00532 
00533         ASSERT(MUTEX_HELD(sm->sm_lock));
00534         ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
00535 
00536         if (max_size < size)
00537                 return (-1ULL);
00538 
00539         /*
00540          * If we're running low on space, switch to using the
00541          * size-sorted AVL tree (best-fit).
00542          */
00543         if (max_size < metaslab_df_alloc_threshold ||
00544             free_pct < metaslab_df_free_pct) {
00545                 t = sm->sm_pp_root;
00546                 *cursor = 0;
00547         }
00548 
00549         return (metaslab_block_picker(t, cursor, size, 1ULL));
00550 }
00551 
00552 static boolean_t
00553 metaslab_df_fragmented(space_map_t *sm)
00554 {
00555         uint64_t max_size = metaslab_pp_maxsize(sm);
00556         int free_pct = sm->sm_space * 100 / sm->sm_size;
00557 
00558         if (max_size >= metaslab_df_alloc_threshold &&
00559             free_pct >= metaslab_df_free_pct)
00560                 return (B_FALSE);
00561 
00562         return (B_TRUE);
00563 }
00564 
00565 static space_map_ops_t metaslab_df_ops = {
00566         metaslab_pp_load,
00567         metaslab_pp_unload,
00568         metaslab_df_alloc,
00569         metaslab_pp_claim,
00570         metaslab_pp_free,
00571         metaslab_pp_maxsize,
00572         metaslab_df_fragmented
00573 };
00574 
00575 /*
00576  * ==========================================================================
00577  * Other experimental allocators
00578  * ==========================================================================
00579  */
00580 static uint64_t
00581 metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
00582 {
00583         avl_tree_t *t = &sm->sm_root;
00584         uint64_t *cursor = (uint64_t *)sm->sm_ppd;
00585         uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
00586         uint64_t max_size = metaslab_pp_maxsize(sm);
00587         uint64_t rsize = size;
00588         uint64_t offset = 0;
00589 
00590         ASSERT(MUTEX_HELD(sm->sm_lock));
00591         ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
00592 
00593         if (max_size < size)
00594                 return (-1ULL);
00595 
00596         ASSERT3U(*extent_end, >=, *cursor);
00597 
00598         /*
00599          * If we're running low on space, switch to using the
00600          * size-sorted AVL tree (best-fit).
00601          */
00602         if ((*cursor + size) > *extent_end) {
00603 
00604                 t = sm->sm_pp_root;
00605                 *cursor = *extent_end = 0;
00606 
00607                 if (max_size > 2 * SPA_MAXBLOCKSIZE)
00608                         rsize = MIN(metaslab_min_alloc_size, max_size);
00609                 offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
00610                 if (offset != -1)
00611                         *cursor = offset + size;
00612         } else {
00613                 offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
00614         }
00615         ASSERT3U(*cursor, <=, *extent_end);
00616         return (offset);
00617 }
00618 
00619 static boolean_t
00620 metaslab_cdf_fragmented(space_map_t *sm)
00621 {
00622         uint64_t max_size = metaslab_pp_maxsize(sm);
00623 
00624         if (max_size > (metaslab_min_alloc_size * 10))
00625                 return (B_FALSE);
00626         return (B_TRUE);
00627 }
00628 
00629 static space_map_ops_t metaslab_cdf_ops = {
00630         metaslab_pp_load,
00631         metaslab_pp_unload,
00632         metaslab_cdf_alloc,
00633         metaslab_pp_claim,
00634         metaslab_pp_free,
00635         metaslab_pp_maxsize,
00636         metaslab_cdf_fragmented
00637 };
00638 
00639 uint64_t metaslab_ndf_clump_shift = 4;
00640 
00641 static uint64_t
00642 metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
00643 {
00644         avl_tree_t *t = &sm->sm_root;
00645         avl_index_t where;
00646         space_seg_t *ss, ssearch;
00647         uint64_t hbit = highbit(size);
00648         uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
00649         uint64_t max_size = metaslab_pp_maxsize(sm);
00650 
00651         ASSERT(MUTEX_HELD(sm->sm_lock));
00652         ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
00653 
00654         if (max_size < size)
00655                 return (-1ULL);
00656 
00657         ssearch.ss_start = *cursor;
00658         ssearch.ss_end = *cursor + size;
00659 
00660         ss = avl_find(t, &ssearch, &where);
00661         if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
00662                 t = sm->sm_pp_root;
00663 
00664                 ssearch.ss_start = 0;
00665                 ssearch.ss_end = MIN(max_size,
00666                     1ULL << (hbit + metaslab_ndf_clump_shift));
00667                 ss = avl_find(t, &ssearch, &where);
00668                 if (ss == NULL)
00669                         ss = avl_nearest(t, where, AVL_AFTER);
00670                 ASSERT(ss != NULL);
00671         }
00672 
00673         if (ss != NULL) {
00674                 if (ss->ss_start + size <= ss->ss_end) {
00675                         *cursor = ss->ss_start + size;
00676                         return (ss->ss_start);
00677                 }
00678         }
00679         return (-1ULL);
00680 }
00681 
00682 static boolean_t
00683 metaslab_ndf_fragmented(space_map_t *sm)
00684 {
00685         uint64_t max_size = metaslab_pp_maxsize(sm);
00686 
00687         if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
00688                 return (B_FALSE);
00689         return (B_TRUE);
00690 }
00691 
00692 
00693 static space_map_ops_t metaslab_ndf_ops = {
00694         metaslab_pp_load,
00695         metaslab_pp_unload,
00696         metaslab_ndf_alloc,
00697         metaslab_pp_claim,
00698         metaslab_pp_free,
00699         metaslab_pp_maxsize,
00700         metaslab_ndf_fragmented
00701 };
00702 
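/*
 * The default ops vector handed to metaslab_class_create(); the dynamic (DF)
 * allocator is used unless this is overridden.
 */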
00703 space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
00704 
00705 /*
00706  * ==========================================================================
00707  * Metaslabs
00708  * ==========================================================================
00709  */
00710 metaslab_t *
00711 metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
00712         uint64_t start, uint64_t size, uint64_t txg)
00713 {
00714         vdev_t *vd = mg->mg_vd;
00715         metaslab_t *msp;
00716 
00717         msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
00718         mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
00719 
00720         msp->ms_smo_syncing = *smo;
00721 
00722         /*
00723          * We create the main space map here, but we don't create the
00724          * allocmaps and freemaps until metaslab_sync_done().  This serves
00725          * two purposes: it allows metaslab_sync_done() to detect the
00726          * addition of new space; and for debugging, it ensures that we'd take a
00727          * data fault on any attempt to use this metaslab before it's ready.
00728          */
00729         space_map_create(&msp->ms_map, start, size,
00730             vd->vdev_ashift, &msp->ms_lock);
00731 
00732         metaslab_group_add(mg, msp);
00733 
00734         if (metaslab_debug && smo->smo_object != 0) {
00735                 mutex_enter(&msp->ms_lock);
00736                 VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
00737                     SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
00738                 mutex_exit(&msp->ms_lock);
00739         }
00740 
00741         /*
00742          * If we're opening an existing pool (txg == 0) or creating
00743          * a new one (txg == TXG_INITIAL), all space is available now.
00744          * If we're adding space to an existing pool, the new space
00745          * does not become available until after this txg has synced.
00746          */
00747         if (txg <= TXG_INITIAL)
00748                 metaslab_sync_done(msp, 0);
00749 
00750         if (txg != 0) {
00751                 vdev_dirty(vd, 0, NULL, txg);
00752                 vdev_dirty(vd, VDD_METASLAB, msp, txg);
00753         }
00754 
00755         return (msp);
00756 }
00757 
00758 void
00759 metaslab_fini(metaslab_t *msp)
00760 {
00761         metaslab_group_t *mg = msp->ms_group;
00762 
00763         vdev_space_update(mg->mg_vd,
00764             -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
00765 
00766         metaslab_group_remove(mg, msp);
00767 
00768         mutex_enter(&msp->ms_lock);
00769 
00770         space_map_unload(&msp->ms_map);
00771         space_map_destroy(&msp->ms_map);
00772 
00773         for (int t = 0; t < TXG_SIZE; t++) {
00774                 space_map_destroy(&msp->ms_allocmap[t]);
00775                 space_map_destroy(&msp->ms_freemap[t]);
00776         }
00777 
00778         for (int t = 0; t < TXG_DEFER_SIZE; t++)
00779                 space_map_destroy(&msp->ms_defermap[t]);
00780 
00781         ASSERT0(msp->ms_deferspace);
00782 
00783         mutex_exit(&msp->ms_lock);
00784         mutex_destroy(&msp->ms_lock);
00785 
00786         kmem_free(msp, sizeof (metaslab_t));
00787 }
00788 
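/*
 * The two high-order weight bits mark a metaslab as active: loaded and
 * currently being allocated from, either as a primary or a secondary target.
 * They are set in metaslab_activate() and cleared when the metaslab is
 * passivated.
 */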
00789 #define METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
00790 #define METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
00791 #define METASLAB_ACTIVE_MASK            \
00792         (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
00793 
00794 static uint64_t
00795 metaslab_weight(metaslab_t *msp)
00796 {
00797         metaslab_group_t *mg = msp->ms_group;
00798         space_map_t *sm = &msp->ms_map;
00799         space_map_obj_t *smo = &msp->ms_smo;
00800         vdev_t *vd = mg->mg_vd;
00801         uint64_t weight, space;
00802 
00803         ASSERT(MUTEX_HELD(&msp->ms_lock));
00804 
00805         /*
00806          * The baseline weight is the metaslab's free space.
00807          */
00808         space = sm->sm_size - smo->smo_alloc;
00809         weight = space;
00810 
00811         /*
00812          * Modern disks have uniform bit density and constant angular velocity.
00813          * Therefore, the outer recording zones are faster (higher bandwidth)
00814          * than the inner zones by the ratio of outer to inner track diameter,
00815          * which is typically around 2:1.  We account for this by assigning
00816          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
00817          * In effect, this means that we'll select the metaslab with the most
00818          * free bandwidth rather than simply the one with the most free space.
00819          */
00820         weight = 2 * weight -
00821             ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
00822         ASSERT(weight >= space && weight <= 2 * space);
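        /*
         * Worked example with illustrative numbers: on a top-level vdev
         * with 200 metaslabs, the first metaslab keeps weight = 2 * space,
         * the one in the middle gets about 1.5 * space, and the last one
         * gets just over 1 * space.
         */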
00823 
00824         /*
00825          * For locality, assign higher weight to metaslabs which have
00826          * a lower offset than what we've already activated.
00827          */
00828         if (sm->sm_start <= mg->mg_bonus_area)
00829                 weight *= (metaslab_smo_bonus_pct / 100);
00830         ASSERT(weight >= space &&
00831             weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
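        /*
         * Note that the bonus multiplier uses integer division, so with the
         * default metaslab_smo_bonus_pct of 150 the factor is 1 and the
         * weight is unchanged; values of 200 or more are needed for the
         * bonus to take effect.
         */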
00832 
00833         if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
00834                 /*
00835                  * If this metaslab is one we're actively using, adjust its
00836                  * weight to make it preferable to any inactive metaslab so
00837                  * we'll polish it off.
00838                  */
00839                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
00840         }
00841         return (weight);
00842 }
00843 
00844 static void
00845 metaslab_prefetch(metaslab_group_t *mg)
00846 {
00847         spa_t *spa = mg->mg_vd->vdev_spa;
00848         metaslab_t *msp;
00849         avl_tree_t *t = &mg->mg_metaslab_tree;
00850         int m;
00851 
00852         mutex_enter(&mg->mg_lock);
00853 
00854         /*
00855          * Prefetch the next potential metaslabs
00856          */
00857         for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
00858                 space_map_t *sm = &msp->ms_map;
00859                 space_map_obj_t *smo = &msp->ms_smo;
00860 
00861                 /* If we have reached our prefetch limit then we're done */
00862                 if (m >= metaslab_prefetch_limit)
00863                         break;
00864 
00865                 if (!sm->sm_loaded && smo->smo_object != 0) {
00866                         mutex_exit(&mg->mg_lock);
00867                         dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
00868                             0ULL, smo->smo_objsize);
00869                         mutex_enter(&mg->mg_lock);
00870                 }
00871         }
00872         mutex_exit(&mg->mg_lock);
00873 }
00874 
00875 static int
00876 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
00877 {
00878         metaslab_group_t *mg = msp->ms_group;
00879         space_map_t *sm = &msp->ms_map;
00880         space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
00881 
00882         ASSERT(MUTEX_HELD(&msp->ms_lock));
00883 
00884         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
00885                 space_map_load_wait(sm);
00886                 if (!sm->sm_loaded) {
00887                         int error = space_map_load(sm, sm_ops, SM_FREE,
00888                             &msp->ms_smo,
00889                             spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
00890                         if (error)  {
00891                                 metaslab_group_sort(msp->ms_group, msp, 0);
00892                                 return (error);
00893                         }
00894                         for (int t = 0; t < TXG_DEFER_SIZE; t++)
00895                                 space_map_walk(&msp->ms_defermap[t],
00896                                     space_map_claim, sm);
00897 
00898                 }
00899 
00900                 /*
00901                  * Track the bonus area as we activate new metaslabs.
00902                  */
00903                 if (sm->sm_start > mg->mg_bonus_area) {
00904                         mutex_enter(&mg->mg_lock);
00905                         mg->mg_bonus_area = sm->sm_start;
00906                         mutex_exit(&mg->mg_lock);
00907                 }
00908 
00909                 metaslab_group_sort(msp->ms_group, msp,
00910                     msp->ms_weight | activation_weight);
00911         }
00912         ASSERT(sm->sm_loaded);
00913         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
00914 
00915         return (0);
00916 }
00917 
00918 static void
00919 metaslab_passivate(metaslab_t *msp, uint64_t size)
00920 {
00921         /*
00922          * If size < SPA_MINBLOCKSIZE, then we will not allocate from
00923          * this metaslab again.  In that case, it had better be empty,
00924          * or we would be leaving space on the table.
00925          */
00926         ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
00927         metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
00928         ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
00929 }
00930 
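/*
 * Write out this metaslab's dirty state for the given transaction group:
 * fold this txg's frees into the freed map, optionally condense the on-disk
 * space map, and sync the txg's allocmap and freemap to the space map
 * object.
 */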
00934 void
00935 metaslab_sync(metaslab_t *msp, uint64_t txg)
00936 {
00937         vdev_t *vd = msp->ms_group->mg_vd;
00938         spa_t *spa = vd->vdev_spa;
00939         objset_t *mos = spa_meta_objset(spa);
00940         space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
00941         space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
00942         space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
00943         space_map_t *sm = &msp->ms_map;
00944         space_map_obj_t *smo = &msp->ms_smo_syncing;
00945         dmu_buf_t *db;
00946         dmu_tx_t *tx;
00947 
00948         ASSERT(!vd->vdev_ishole);
00949 
00950         if (allocmap->sm_space == 0 && freemap->sm_space == 0)
00951                 return;
00952 
00953         /*
00954          * The only state that can actually be changing concurrently with
00955          * metaslab_sync() is the metaslab's ms_map.  No other thread can
00956          * be modifying this txg's allocmap, freemap, freed_map, or smo.
00957          * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
00958          * We drop it whenever we call into the DMU, because the DMU
00959          * can call down to us (e.g. via zio_free()) at any time.
00960          */
00961 
00962         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
00963 
00964         if (smo->smo_object == 0) {
00965                 ASSERT(smo->smo_objsize == 0);
00966                 ASSERT(smo->smo_alloc == 0);
00967                 smo->smo_object = dmu_object_alloc(mos,
00968                     DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
00969                     DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
00970                 ASSERT(smo->smo_object != 0);
00971                 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
00972                     (sm->sm_start >> vd->vdev_ms_shift),
00973                     sizeof (uint64_t), &smo->smo_object, tx);
00974         }
00975 
00976         mutex_enter(&msp->ms_lock);
00977 
00978         space_map_walk(freemap, space_map_add, freed_map);
00979 
00980         if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
00981             2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
00982                 /*
00983                  * The in-core space map representation is twice as compact
00984                  * as the on-disk one, so it's time to condense the latter
00985                  * by generating a pure allocmap from first principles.
00986                  *
00987                  * This metaslab is 100% allocated,
00988                  * minus the content of the in-core map (sm),
00989                  * minus what's been freed this txg (freed_map),
00990                  * minus deferred frees (ms_defermap[]),
00991                  * minus allocations from txgs in the future
00992                  * (because they haven't been committed yet).
00993                  */
00994                 space_map_vacate(allocmap, NULL, NULL);
00995                 space_map_vacate(freemap, NULL, NULL);
00996 
00997                 space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
00998 
00999                 space_map_walk(sm, space_map_remove, allocmap);
01000                 space_map_walk(freed_map, space_map_remove, allocmap);
01001 
01002                 for (int t = 0; t < TXG_DEFER_SIZE; t++)
01003                         space_map_walk(&msp->ms_defermap[t],
01004                             space_map_remove, allocmap);
01005 
01006                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
01007                         space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
01008                             space_map_remove, allocmap);
01009 
01010                 mutex_exit(&msp->ms_lock);
01011                 space_map_truncate(smo, mos, tx);
01012                 mutex_enter(&msp->ms_lock);
01013         }
01014 
01015         space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
01016         space_map_sync(freemap, SM_FREE, smo, mos, tx);
01017 
01018         mutex_exit(&msp->ms_lock);
01019 
01020         VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
01021         dmu_buf_will_dirty(db, tx);
01022         ASSERT3U(db->db_size, >=, sizeof (*smo));
01023         bcopy(smo, db->db_data, sizeof (*smo));
01024         dmu_buf_rele(db, FTAG);
01025 
01026         dmu_tx_commit(tx);
01027 }
01028 
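/*
 * Called when the given transaction group has finished syncing: update the
 * vdev's space accounting, return the oldest deferred frees to the in-core
 * space map, move this txg's frees onto the deferred list, and recompute the
 * metaslab's weight.
 */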
01033 void
01034 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
01035 {
01036         space_map_obj_t *smo = &msp->ms_smo;
01037         space_map_obj_t *smosync = &msp->ms_smo_syncing;
01038         space_map_t *sm = &msp->ms_map;
01039         space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
01040         space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
01041         metaslab_group_t *mg = msp->ms_group;
01042         vdev_t *vd = mg->mg_vd;
01043         int64_t alloc_delta, defer_delta;
01044 
01045         ASSERT(!vd->vdev_ishole);
01046 
01047         mutex_enter(&msp->ms_lock);
01048 
01049         /*
01050          * If this metaslab is just becoming available, initialize its
01051          * allocmaps and freemaps and add its capacity to the vdev.
01052          */
01053         if (freed_map->sm_size == 0) {
01054                 for (int t = 0; t < TXG_SIZE; t++) {
01055                         space_map_create(&msp->ms_allocmap[t], sm->sm_start,
01056                             sm->sm_size, sm->sm_shift, sm->sm_lock);
01057                         space_map_create(&msp->ms_freemap[t], sm->sm_start,
01058                             sm->sm_size, sm->sm_shift, sm->sm_lock);
01059                 }
01060 
01061                 for (int t = 0; t < TXG_DEFER_SIZE; t++)
01062                         space_map_create(&msp->ms_defermap[t], sm->sm_start,
01063                             sm->sm_size, sm->sm_shift, sm->sm_lock);
01064 
01065                 vdev_space_update(vd, 0, 0, sm->sm_size);
01066         }
01067 
01068         alloc_delta = smosync->smo_alloc - smo->smo_alloc;
01069         defer_delta = freed_map->sm_space - defer_map->sm_space;
01070 
01071         vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
01072 
01073         ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
01074         ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
01075 
01076         /*
01077          * If there's a space_map_load() in progress, wait for it to complete
01078          * so that we have a consistent view of the in-core space map.
01079          * Then, add defer_map (oldest deferred frees) to this map and
01080          * transfer freed_map (this txg's frees) to defer_map.
01081          */
01082         space_map_load_wait(sm);
01083         space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
01084         space_map_vacate(freed_map, space_map_add, defer_map);
01085 
01086         *smo = *smosync;
01087 
01088         msp->ms_deferspace += defer_delta;
01089         ASSERT3S(msp->ms_deferspace, >=, 0);
01090         ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
01091         if (msp->ms_deferspace != 0) {
01092                 /*
01093                  * Keep syncing this metaslab until all deferred frees
01094                  * are back in circulation.
01095                  */
01096                 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
01097         }
01098 
01099         /*
01100          * If the map is loaded but no longer active, evict it as soon as all
01101          * future allocations have synced.  (If we unloaded it now and then
01102          * loaded a moment later, the map wouldn't reflect those allocations.)
01103          */
01104         if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
01105                 int evictable = 1;
01106 
01107                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
01108                         if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
01109                                 evictable = 0;
01110 
01111                 if (evictable && !metaslab_debug)
01112                         space_map_unload(sm);
01113         }
01114 
01115         metaslab_group_sort(mg, msp, metaslab_weight(msp));
01116 
01117         mutex_exit(&msp->ms_lock);
01118 }
01119 
01120 void
01121 metaslab_sync_reassess(metaslab_group_t *mg)
01122 {
01123         vdev_t *vd = mg->mg_vd;
01124         int64_t failures = mg->mg_alloc_failures;
01125 
01126         /*
01127          * Re-evaluate all metaslabs which have lower offsets than the
01128          * bonus area.
01129          */
01130         for (int m = 0; m < vd->vdev_ms_count; m++) {
01131                 metaslab_t *msp = vd->vdev_ms[m];
01132 
01133                 if (msp->ms_map.sm_start > mg->mg_bonus_area)
01134                         break;
01135 
01136                 mutex_enter(&msp->ms_lock);
01137                 metaslab_group_sort(mg, msp, metaslab_weight(msp));
01138                 mutex_exit(&msp->ms_lock);
01139         }
01140 
01141         atomic_add_64(&mg->mg_alloc_failures, -failures);
01142 
01143         /*
01144          * Prefetch the next potential metaslabs
01145          */
01146         metaslab_prefetch(mg);
01147 }
01148 
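/*
 * Distance, in bytes, between a metaslab and an existing DVA on the same
 * vdev (or 1 << 63 for a different vdev); used to spread the DVAs of a block
 * across metaslabs that are at least min_distance apart.
 */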
01149 static uint64_t
01150 metaslab_distance(metaslab_t *msp, dva_t *dva)
01151 {
01152         uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
01153         uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
01154         uint64_t start = msp->ms_map.sm_start >> ms_shift;
01155 
01156         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
01157                 return (1ULL << 63);
01158 
01159         if (offset < start)
01160                 return ((start - offset) << ms_shift);
01161         if (offset > start)
01162                 return ((offset - start) << ms_shift);
01163         return (0);
01164 }
01165 
01166 static uint64_t
01167 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
01168     uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
01169 {
01170         spa_t *spa = mg->mg_vd->vdev_spa;
01171         metaslab_t *msp = NULL;
01172         uint64_t offset = -1ULL;
01173         avl_tree_t *t = &mg->mg_metaslab_tree;
01174         uint64_t activation_weight;
01175         uint64_t target_distance;
01176         int i;
01177 
01178         activation_weight = METASLAB_WEIGHT_PRIMARY;
01179         for (i = 0; i < d; i++) {
01180                 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
01181                         activation_weight = METASLAB_WEIGHT_SECONDARY;
01182                         break;
01183                 }
01184         }
01185 
01186         for (;;) {
01187                 boolean_t was_active;
01188 
01189                 mutex_enter(&mg->mg_lock);
01190                 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
01191                         if (msp->ms_weight < asize) {
01192                                 spa_dbgmsg(spa, "%s: failed to meet weight "
01193                                     "requirement: vdev %llu, txg %llu, mg %p, "
01194                                     "msp %p, psize %llu, asize %llu, "
01195                                     "failures %llu, weight %llu",
01196                                     spa_name(spa), mg->mg_vd->vdev_id, txg,
01197                                     mg, msp, psize, asize,
01198                                     mg->mg_alloc_failures, msp->ms_weight);
01199                                 mutex_exit(&mg->mg_lock);
01200                                 return (-1ULL);
01201                         }
01202                         was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
01203                         if (activation_weight == METASLAB_WEIGHT_PRIMARY)
01204                                 break;
01205 
01206                         target_distance = min_distance +
01207                             (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
01208 
01209                         for (i = 0; i < d; i++)
01210                                 if (metaslab_distance(msp, &dva[i]) <
01211                                     target_distance)
01212                                         break;
01213                         if (i == d)
01214                                 break;
01215                 }
01216                 mutex_exit(&mg->mg_lock);
01217                 if (msp == NULL)
01218                         return (-1ULL);
01219 
01220                 /*
01221                  * If we've already reached the allowable number of failed
01222                  * allocation attempts on this metaslab group then we
01223                  * consider skipping it. We skip it only if we're allowed
01224                  * to "fast" gang, the physical size is larger than
01225                  * a gang block, and we're attempting to allocate from
01226                  * the primary metaslab.
01227                  */
01228                 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
01229                     CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
01230                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
01231                         spa_dbgmsg(spa, "%s: skipping metaslab group: "
01232                             "vdev %llu, txg %llu, mg %p, psize %llu, "
01233                             "asize %llu, failures %llu", spa_name(spa),
01234                             mg->mg_vd->vdev_id, txg, mg, psize, asize,
01235                             mg->mg_alloc_failures);
01236                         return (-1ULL);
01237                 }
01238 
01239                 mutex_enter(&msp->ms_lock);
01240 
01241                 /*
01242                  * Ensure that the metaslab we have selected is still
01243                  * capable of handling our request. It's possible that
01244                  * another thread may have changed the weight while we
01245                  * were blocked on the metaslab lock.
01246                  */
01247                 if (msp->ms_weight < asize || (was_active &&
01248                     !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
01249                     activation_weight == METASLAB_WEIGHT_PRIMARY)) {
01250                         mutex_exit(&msp->ms_lock);
01251                         continue;
01252                 }
01253 
01254                 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
01255                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
01256                         metaslab_passivate(msp,
01257                             msp->ms_weight & ~METASLAB_ACTIVE_MASK);
01258                         mutex_exit(&msp->ms_lock);
01259                         continue;
01260                 }
01261 
01262                 if (metaslab_activate(msp, activation_weight) != 0) {
01263                         mutex_exit(&msp->ms_lock);
01264                         continue;
01265                 }
01266 
01267                 if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
01268                         break;
01269 
01270                 atomic_inc_64(&mg->mg_alloc_failures);
01271 
01272                 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
01273 
01274                 mutex_exit(&msp->ms_lock);
01275         }
01276 
01277         if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
01278                 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
01279 
01280         space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
01281 
01282         mutex_exit(&msp->ms_lock);
01283 
01284         return (offset);
01285 }
01286 
01290 static int
01291 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
01292     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
01293 {
01294         metaslab_group_t *mg, *rotor;
01295         vdev_t *vd;
01296         int dshift = 3;
01297         int all_zero;
01298         int zio_lock = B_FALSE;
01299         boolean_t allocatable;
01300         uint64_t offset = -1ULL;
01301         uint64_t asize;
01302         uint64_t distance;
01303 
01304         ASSERT(!DVA_IS_VALID(&dva[d]));
01305 
01306         /*
01307          * For testing, make some blocks above a certain size be gang blocks.
01308          */
01309         if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
01310                 return (ENOSPC);
01311 
01312         /*
01313          * Start at the rotor and loop through all mgs until we find something.
01314          * Note that there's no locking on mc_rotor or mc_aliquot because
01315          * nothing actually breaks if we miss a few updates -- we just won't
01316          * allocate quite as evenly.  It all balances out over time.
01317          *
01318          * If we are doing ditto or log blocks, try to spread them across
01319          * consecutive vdevs.  If we're forced to reuse a vdev before we've
01320          * allocated all of our ditto blocks, then try and spread them out on
01321          * that vdev as much as possible.  If it turns out to not be possible,
01322          * gradually lower our standards until anything becomes acceptable.
01323          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
01324          * gives us hope of containing our fault domains to something we're
01325          * able to reason about.  Otherwise, any two top-level vdev failures
01326          * will guarantee the loss of data.  With consecutive allocation,
01327          * only two adjacent top-level vdev failures will result in data loss.
01328          *
01329          * If we are doing gang blocks (hintdva is non-NULL), try to keep
01330          * ourselves on the same vdev as our gang block header.  That
01331          * way, we can hope for locality in vdev_cache, plus it makes our
01332          * fault domains something tractable.
01333          */
01334         if (hintdva) {
01335                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
01336 
01337                 /*
01338                  * It's possible the vdev we're using as the hint no
01339                  * longer exists (i.e. removed). Consult the rotor when
01340                  * all else fails.
01341                  */
01342                 if (vd != NULL) {
01343                         mg = vd->vdev_mg;
01344 
01345                         if (flags & METASLAB_HINTBP_AVOID &&
01346                             mg->mg_next != NULL)
01347                                 mg = mg->mg_next;
01348                 } else {
01349                         mg = mc->mc_rotor;
01350                 }
01351         } else if (d != 0) {
01352                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
01353                 mg = vd->vdev_mg->mg_next;
01354         } else {
01355                 mg = mc->mc_rotor;
01356         }
01357 
01358         /*
01359          * If the hint put us into the wrong metaslab class, or into a
01360          * metaslab group that has been passivated, just follow the rotor.
01361          */
01362         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
01363                 mg = mc->mc_rotor;
01364 
01365         rotor = mg;
01366 top:
01367         all_zero = B_TRUE;
01368         do {
01369                 ASSERT(mg->mg_activation_count == 1);
01370 
01371                 vd = mg->mg_vd;
01372 
01373                 /*
01374                  * Don't allocate from faulted devices.
01375                  */
01376                 if (zio_lock) {
01377                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
01378                         allocatable = vdev_allocatable(vd);
01379                         spa_config_exit(spa, SCL_ZIO, FTAG);
01380                 } else {
01381                         allocatable = vdev_allocatable(vd);
01382                 }
01383                 if (!allocatable)
01384                         goto next;
01385 
01386                 /*
01387                  * Avoid writing single-copy data to a failing vdev
01388                  */
01389                 if ((vd->vdev_stat.vs_write_errors > 0 ||
01390                     vd->vdev_state < VDEV_STATE_HEALTHY) &&
01391                     d == 0 && dshift == 3) {
01392                         all_zero = B_FALSE;
01393                         goto next;
01394                 }
01395 
01396                 ASSERT(mg->mg_class == mc);
01397 
01398                 distance = vd->vdev_asize >> dshift;
01399                 if (distance <= (1ULL << vd->vdev_ms_shift))
01400                         distance = 0;
01401                 else
01402                         all_zero = B_FALSE;
01403 
01404                 asize = vdev_psize_to_asize(vd, psize);
01405                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
01406 
01407                 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
01408                     dva, d, flags);
01409                 if (offset != -1ULL) {
01410                         /*
01411                          * If we've just selected this metaslab group,
01412                          * figure out whether the corresponding vdev is
01413                          * over- or under-used relative to the pool,
01414                          * and set an allocation bias to even it out.
01415                          */
01416                         if (mc->mc_aliquot == 0) {
01417                                 vdev_stat_t *vs = &vd->vdev_stat;
01418                                 int64_t vu, cu;
01419 
01420                                 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
01421                                 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
01422 
01423                                 /*
01424                                  * Calculate how much more or less we should
01425                                  * try to allocate from this device during
01426                                  * this iteration around the rotor.
01427                                  * For example, if a device is 80% full
01428                                  * and the pool is 20% full then we should
01429                                  * reduce allocations by 60% on this device.
01430                                  *
01431                                  * mg_bias = (20 - 80) * 512K / 100 = -307K
01432                                  *
01433                                  * This reduces allocations by 307K for this
01434                                  * iteration.
01435                                  */
01436                                 mg->mg_bias = ((cu - vu) *
01437                                     (int64_t)mg->mg_aliquot) / 100;
01438                         }
01439 
01440                         if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
01441                             mg->mg_aliquot + mg->mg_bias) {
01442                                 mc->mc_rotor = mg->mg_next;
01443                                 mc->mc_aliquot = 0;
01444                         }
01445 
01446                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
01447                         DVA_SET_OFFSET(&dva[d], offset);
01448                         DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
01449                         DVA_SET_ASIZE(&dva[d], asize);
01450 
01451                         return (0);
01452                 }
01453 next:
01454                 mc->mc_rotor = mg->mg_next;
01455                 mc->mc_aliquot = 0;
01456         } while ((mg = mg->mg_next) != rotor);
01457 
01458         if (!all_zero) {
01459                 dshift++;
01460                 ASSERT(dshift < 64);
01461                 goto top;
01462         }
01463 
01464         if (!allocatable && !zio_lock) {
01465                 dshift = 3;
01466                 zio_lock = B_TRUE;
01467                 goto top;
01468         }
01469 
01470         bzero(&dva[d], sizeof (dva_t));
01471 
01472         return (ENOSPC);
01473 }
01474 
01479 static void
01480 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
01481 {
01482         uint64_t vdev = DVA_GET_VDEV(dva);
01483         uint64_t offset = DVA_GET_OFFSET(dva);
01484         uint64_t size = DVA_GET_ASIZE(dva);
01485         vdev_t *vd;
01486         metaslab_t *msp;
01487 
01488         ASSERT(DVA_IS_VALID(dva));
01489 
01490         if (txg > spa_freeze_txg(spa))
01491                 return;
01492 
01493         if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
01494             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
01495                 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
01496                     (u_longlong_t)vdev, (u_longlong_t)offset);
01497                 ASSERT(0);
01498                 return;
01499         }
01500 
01501         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
01502 
01503         if (DVA_GET_GANG(dva))
01504                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
01505 
01506         mutex_enter(&msp->ms_lock);
01507 
01508         if (now) {
01509                 space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
01510                     offset, size);
01511                 space_map_free(&msp->ms_map, offset, size);
01512         } else {
01513                 if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
01514                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
01515                 space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
01516         }
01517 
01518         mutex_exit(&msp->ms_lock);
01519 }
01520 
01527 static int
01528 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
01529 {
01530         uint64_t vdev = DVA_GET_VDEV(dva);
01531         uint64_t offset = DVA_GET_OFFSET(dva);
01532         uint64_t size = DVA_GET_ASIZE(dva);
01533         vdev_t *vd;
01534         metaslab_t *msp;
01535         int error = 0;
01536 
01537         ASSERT(DVA_IS_VALID(dva));
01538 
01539         if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
01540             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
01541                 return (ENXIO);
01542 
01543         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
01544 
01545         if (DVA_GET_GANG(dva))
01546                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
01547 
01548         mutex_enter(&msp->ms_lock);
01549 
01550         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
01551                 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
01552 
01553         if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
01554                 error = ENOENT;
01555 
01556         if (error || txg == 0) {        /* txg == 0 indicates dry run */
01557                 mutex_exit(&msp->ms_lock);
01558                 return (error);
01559         }
01560 
01561         space_map_claim(&msp->ms_map, offset, size);
01562 
01563         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
01564                 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
01565                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
01566                 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
01567         }
01568 
01569         mutex_exit(&msp->ms_lock);
01570 
01571         return (0);
01572 }
01573 
01574 int
01575 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
01576     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
01577 {
01578         dva_t *dva = bp->blk_dva;
01579         dva_t *hintdva = hintbp->blk_dva;
01580         int error = 0;
01581 
01582         ASSERT(bp->blk_birth == 0);
01583         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
01584 
01585         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
01586 
01587         if (mc->mc_rotor == NULL) {     /* no vdevs in this class */
01588                 spa_config_exit(spa, SCL_ALLOC, FTAG);
01589                 return (ENOSPC);
01590         }
01591 
01592         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
01593         ASSERT(BP_GET_NDVAS(bp) == 0);
01594         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
01595 
01596         for (int d = 0; d < ndvas; d++) {
01597                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
01598                     txg, flags);
01599                 if (error) {
01600                         for (d--; d >= 0; d--) {
01601                                 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
01602                                 bzero(&dva[d], sizeof (dva_t));
01603                         }
01604                         spa_config_exit(spa, SCL_ALLOC, FTAG);
01605                         return (error);
01606                 }
01607         }
01608         ASSERT(error == 0);
01609         ASSERT(BP_GET_NDVAS(bp) == ndvas);
01610 
01611         spa_config_exit(spa, SCL_ALLOC, FTAG);
01612 
01613         BP_SET_BIRTH(bp, txg, txg);
01614 
01615         return (0);
01616 }
01617 
01618 void
01619 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
01620 {
01621         const dva_t *dva = bp->blk_dva;
01622         int ndvas = BP_GET_NDVAS(bp);
01623 
01624         ASSERT(!BP_IS_HOLE(bp));
01625         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
01626 
01627         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
01628 
01629         for (int d = 0; d < ndvas; d++)
01630                 metaslab_free_dva(spa, &dva[d], txg, now);
01631 
01632         spa_config_exit(spa, SCL_FREE, FTAG);
01633 }
01634 
01635 int
01636 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
01637 {
01638         const dva_t *dva = bp->blk_dva;
01639         int ndvas = BP_GET_NDVAS(bp);
01640         int error = 0;
01641 
01642         ASSERT(!BP_IS_HOLE(bp));
01643 
01644         if (txg != 0) {
01645                 /*
01646                  * First do a dry run to make sure all DVAs are claimable,
01647                  * so we don't have to unwind from partial failures below.
01648                  */
01649                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
01650                         return (error);
01651         }
01652 
01653         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
01654 
01655         for (int d = 0; d < ndvas; d++)
01656                 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
01657                         break;
01658 
01659         spa_config_exit(spa, SCL_ALLOC, FTAG);
01660 
01661         ASSERT(error == 0 || txg == 0);
01662 
01663         return (error);
01664 }