FreeBSD ZFS
The Zettabyte File System
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/trim_map.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
#ifdef _KERNEL
	&vdev_geom_ops,
#else
	&vdev_disk_ops,
#endif
	&vdev_file_ops,
	&vdev_missing_ops,
	&vdev_hole_ops,
	NULL
};

static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;

	for (int c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}

uint64_t
vdev_get_min_asize(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL (inactive spare or cache) or is the root,
	 * just return our own asize.
	 */
	if (pvd == NULL)
		return (vd->vdev_asize);

	/*
	 * The top-level vdev just returns the allocatable size rounded
	 * to the nearest metaslab.
	 */
	if (vd == vd->vdev_top)
		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

	/*
	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
	 * so each child must provide at least 1/Nth of its asize.
	 */
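	/*
	 * Illustrative example (editorial addition, not part of the original
	 * source): with a 6-child raidz parent whose vdev_min_asize is 6 GB,
	 * the division below requires each child to provide at least 1 GB.
	 */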
	if (pvd->vdev_ops == &vdev_raidz_ops)
		return (pvd->vdev_min_asize / pvd->vdev_children);

	return (pvd->vdev_min_asize);
}

void
vdev_set_min_asize(vdev_t *vd)
{
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_set_min_asize(vd->vdev_child[c]);
}

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (vdev < rvd->vdev_children) {
		ASSERT(rvd->vdev_child[vdev] != NULL);
		return (rvd->vdev_child[vdev]);
	}

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (int c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}

void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	for (int c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (int c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_guid(NULL);
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	for (int t = 0; t < DTL_TYPES; t++) {
		space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
		    &vd->vdev_dtl_lock);
	}
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}

int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (EINVAL);

	if ((ops = vdev_getops(type)) == NULL)
		return (EINVAL);

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (EINVAL);

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (EINVAL);

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (ENOTSUP);

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (ENOTSUP);

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
				return (EINVAL);
			/*
			 * Previous versions could only support 1 or 2 parity
			 * devices.
			 */
			if (nparity > 1 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ2)
				return (ENOTSUP);
			if (nparity > 2 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ3)
				return (ENOTSUP);
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
				return (EINVAL);
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
		vd->vdev_fru = spa_strdup(vd->vdev_fru);

	/*
	 * Set the whole_disk property. If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag. This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
	}

	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		vd->vdev_mg = metaslab_group_create(islog ?
		    spa_log_class(spa) : spa_normal_class(spa), vd);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_smo.smo_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
		    &vd->vdev_resilvering);

		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context. Local vdevs will
		 * remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				char *aux;

				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}

void
vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first. This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	 * Free all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);
	if (vd->vdev_fru)
		spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		space_map_unload(&vd->vdev_dtl[t]);
		space_map_destroy(&vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	if (tvd->vdev_mg)
		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd) {
		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
		cvd->vdev_orig_guid = cvd->vdev_guid;
		cvd->vdev_guid += guid_delta;
		cvd->vdev_guid_sum += guid_delta;
	}
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	/*
	 * This vdev is not being allocated from yet or is a hole.
	 */
	if (vd->vdev_ms_shift == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	/*
	 * Compute the raidz-deflation ratio. Note, we hard-code
	 * in 128k (1 << 17) because it is the current "typical" blocksize.
	 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
	 * or we will inconsistently account for existing bp's.
	 */
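	/*
	 * Illustrative example (editorial addition, not part of the original
	 * source): for a top-level vdev where asize equals psize (e.g. a
	 * mirror), vdev_psize_to_asize(vd, 1 << 17) returns 128K, so the
	 * expression below yields 131072 / (131072 >> SPA_MINBLOCKSHIFT) =
	 * 131072 / 256 = 512.
	 */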
	vd->vdev_deflate_ratio = (1 << 17) /
	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);

	ASSERT(oldc <= newc);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		space_map_obj_t smo = { 0, 0, 0 };
		if (txg == 0) {
			uint64_t object = 0;
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
			    DMU_READ_PREFETCH);
			if (error)
				return (error);
			if (object != 0) {
				dmu_buf_t *db;
				error = dmu_bonus_hold(mos, object, FTAG, &db);
				if (error)
					return (error);
				ASSERT3U(db->db_size, >=, sizeof (smo));
				bcopy(db->db_data, &smo, sizeof (smo));
				ASSERT3U(smo.smo_object, ==, object);
				dmu_buf_rele(db, FTAG);
			}
		}
		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
	}

	if (txg == 0)
		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

	/*
	 * If the vdev is being removed we don't activate
	 * the metaslabs since we want to ensure that no new
	 * allocations are performed on this device.
	 */
	if (oldc == 0 && !vd->vdev_removing)
		metaslab_group_activate(vd->vdev_mg);

	if (txg == 0)
		spa_config_exit(spa, SCL_ALLOC, FTAG);

	return (0);
}

void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		metaslab_group_passivate(vd->vdev_mg);
		for (m = 0; m < count; m++)
			if (vd->vdev_ms[m] != NULL)
				metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
}

typedef struct vdev_probe_stats {
	boolean_t	vps_readable;
	boolean_t	vps_writeable;
	int		vps_flags;
} vdev_probe_stats_t;

static void
vdev_probe_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	vdev_probe_stats_t *vps = zio->io_private;

	ASSERT(vd->vdev_probe_zio != NULL);

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_error == 0)
			vps->vps_readable = 1;
		if (zio->io_error == 0 && spa_writeable(spa)) {
			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
			    zio->io_offset, zio->io_size, zio->io_data,
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
		} else {
			zio_buf_free(zio->io_data, zio->io_size);
		}
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
		if (zio->io_error == 0)
			vps->vps_writeable = 1;
		zio_buf_free(zio->io_data, zio->io_size);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
		zio_t *pio;

		vd->vdev_cant_read |= !vps->vps_readable;
		vd->vdev_cant_write |= !vps->vps_writeable;

		if (vdev_readable(vd) &&
		    (vdev_writeable(vd) || !spa_writeable(spa))) {
			zio->io_error = 0;
		} else {
			ASSERT(zio->io_error != 0);
			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
			    spa, vd, NULL, 0, 0);
			zio->io_error = ENXIO;
		}

		mutex_enter(&vd->vdev_probe_lock);
		ASSERT(vd->vdev_probe_zio == zio);
		vd->vdev_probe_zio = NULL;
		mutex_exit(&vd->vdev_probe_lock);

		while ((pio = zio_walk_parents(zio)) != NULL)
			if (!vdev_accessible(vd, pio))
				pio->io_error = ENXIO;

		kmem_free(vps, sizeof (*vps));
	}
}

zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
	spa_t *spa = vd->vdev_spa;
	vdev_probe_stats_t *vps = NULL;
	zio_t *pio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/*
	 * Don't probe the probe.
	 */
	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
		return (NULL);

	/*
	 * To prevent 'probe storms' when a device fails, we create
	 * just one probe i/o at a time. All zios that want to probe
	 * this vdev will become parents of the probe io.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	if ((pio = vd->vdev_probe_zio) == NULL) {
		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
		    ZIO_FLAG_TRYHARD;

		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
			/*
			 * vdev_cant_read and vdev_cant_write can only
			 * transition from TRUE to FALSE when we have the
			 * SCL_ZIO lock as writer; otherwise they can only
			 * transition from FALSE to TRUE. This ensures that
			 * any zio looking at these values can assume that
			 * failures persist for the life of the I/O. That's
			 * important because when a device has intermittent
			 * connectivity problems, we want to ensure that
			 * they're ascribed to the device (ENXIO) and not
			 * the zio (EIO).
			 *
			 * Since we hold SCL_ZIO as writer here, clear both
			 * values so the probe can reevaluate from first
			 * principles.
			 */
			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
			vd->vdev_cant_read = B_FALSE;
			vd->vdev_cant_write = B_FALSE;
		}

		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
		    vdev_probe_done, vps,
		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

		/*
		 * We can't change the vdev state in this context, so we
		 * kick off an async task to do it on our behalf.
		 */
		if (zio != NULL) {
			vd->vdev_probe_wanted = B_TRUE;
			spa_async_request(spa, SPA_ASYNC_PROBE);
		}
	}

	if (zio != NULL)
		zio_add_child(zio, pio);

	mutex_exit(&vd->vdev_probe_lock);

	if (vps == NULL) {
		ASSERT(zio != NULL);
		return (NULL);
	}

	for (int l = 1; l < VDEV_LABELS; l++) {
		zio_nowait(zio_read_phys(pio, vd,
		    vdev_label_offset(vd->vdev_psize, l,
		    offsetof(vdev_label_t, vl_pad2)),
		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
	}

	if (zio == NULL)
		return (pio);

	zio_nowait(pio);
	return (NULL);
}

static void
vdev_open_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_open_thread = curthread;
	vd->vdev_open_error = vdev_open(vd);
	vd->vdev_open_thread = NULL;
}

boolean_t
vdev_uses_zvols(vdev_t *vd)
{
	if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
	    strlen(ZVOL_DIR)) == 0)
		return (B_TRUE);
	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_uses_zvols(vd->vdev_child[c]))
			return (B_TRUE);
	return (B_FALSE);
}

void
vdev_open_children(vdev_t *vd)
{
	taskq_t *tq;
	int children = vd->vdev_children;

	/*
	 * in order to handle pools on top of zvols, do the opens
	 * in a single thread so that the same thread holds the
	 * spa_namespace_lock
	 */
	if (B_TRUE || vdev_uses_zvols(vd)) {
		for (int c = 0; c < children; c++)
			vd->vdev_child[c]->vdev_open_error =
			    vdev_open(vd->vdev_child[c]);
		return;
	}
	tq = taskq_create("vdev_open", children, minclsyspri,
	    children, children, TASKQ_PREPOPULATE);

	for (int c = 0; c < children; c++)
		VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
		    TQ_SLEEP) != 0);

	taskq_destroy(tq);
}

int
vdev_open(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int error;
	uint64_t osize = 0;
	uint64_t max_osize = 0;
	uint64_t asize, max_asize, psize;
	uint64_t ashift = 0;

	ASSERT(vd->vdev_open_thread == curthread ||
	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	vd->vdev_cant_read = B_FALSE;
	vd->vdev_cant_write = B_FALSE;
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	/*
	 * If this vdev is not removed, check its fault status. If it's
	 * faulted, bail out of the open.
	 */
	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (ENXIO);
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (ENXIO);
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);

	/*
	 * Reset the vdev_reopening flag so that we actually close
	 * the vdev on error.
	 */
	vd->vdev_reopening = B_FALSE;
	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, NULL, ENXIO);

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    vd->vdev_stat.vs_aux);
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	/*
	 * Recheck the faulted flag now that we have confirmed that
	 * the vdev is accessible. If we're faulted, bail.
	 */
	if (vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (ENXIO);
	}

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
	}

	/*
	 * For hole or missing vdevs we just return success.
	 */
	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
		return (0);

	if (vd->vdev_ops->vdev_op_leaf) {
		vd->vdev_notrim = B_FALSE;
		trim_map_create(vd);
	}

	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}
	}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
		    VDEV_LABEL_END_SIZE);
	} else {
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
		max_asize = max_osize;
	}

	vd->vdev_psize = psize;

	/*
	 * Make sure the allocatable size hasn't shrunk.
	 */
	if (asize < vd->vdev_min_asize) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		return (EINVAL);
	}

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_max_asize = max_asize;
		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_top->vdev_ashift) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}
		vd->vdev_max_asize = max_asize;
	}

	/*
	 * If all children are healthy and the asize has increased,
	 * then we've experienced dynamic LUN growth. If automatic
	 * expansion is enabled then use the additional space.
	 */
	if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
	    (vd->vdev_expanding || spa->spa_autoexpand))
		vd->vdev_asize = asize;

	vdev_set_min_asize(vd);

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (error);
	}

	/*
	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
	 * resilver. But don't do this if we are doing a reopen for a scrub,
	 * since this would just restart the scrub we are already doing.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
	    vdev_resilver_needed(vd, NULL, NULL))
		spa_async_request(spa, SPA_ASYNC_RESILVER);

	return (0);
}

int
vdev_validate(vdev_t *vd, boolean_t strict)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *label;
	uint64_t guid = 0, top_guid;
	uint64_t state;

	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c], strict) != 0)
			return (EBADF);

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation. Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
		uint64_t aux_guid = 0;
		nvlist_t *nvl;
		uint64_t txg = strict ? spa->spa_config_txg : -1ULL;

		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (0);
		}

		/*
		 * Determine if this vdev has been split off into another
		 * pool. If so, then refuse to open it.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
		    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_SPLIT_POOL);
			nvlist_free(label);
			return (0);
		}

		if (strict && (nvlist_lookup_uint64(label,
		    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
		    guid != spa_guid(spa))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
		    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
		    &aux_guid) != 0)
			aux_guid = 0;

		/*
		 * If this vdev just became a top-level vdev because its
		 * sibling was detached, it will have adopted the parent's
		 * vdev guid -- but the label may or may not be on disk yet.
		 * Fortunately, either version of the label will have the
		 * same top guid, so if we're a top-level vdev, we can
		 * safely compare to that instead.
		 *
		 * If we split this vdev off instead, then we also check the
		 * original pool's guid. We don't want to consider the vdev
		 * corrupt if it is partway through a split operation.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
		    &top_guid) != 0 ||
		    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		nvlist_free(label);

		/*
		 * If this is a verbatim import, no need to check the
		 * state of the pool.
		 */
		if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
		    spa_load_state(spa) == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (EBADF);

		/*
		 * If we were able to open and validate a vdev that was
		 * previously marked permanently unavailable, clear that state
		 * now.
		 */
		if (vd->vdev_not_present)
			vd->vdev_not_present = 0;
	}

	return (0);
}

void
vdev_close(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *pvd = vd->vdev_parent;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/*
	 * If our parent is reopening, then we are as well, unless we are
	 * going offline.
	 */
	if (pvd != NULL && pvd->vdev_reopening)
		vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);

	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	if (vd->vdev_ops->vdev_op_leaf)
		trim_map_destroy(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

void
vdev_hold(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
		return;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_hold(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_hold(vd);
}

void
vdev_rele(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_rele(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_rele(vd);
}

void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* set the reopening flag unless we're taking the vdev offline */
	vd->vdev_reopening = !vd->vdev_offline;
	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    vd->vdev_aux == &spa->spa_l2cache &&
		    !l2arc_vdev_present(vd))
			l2arc_add_vdev(spa, vd);
	} else {
		(void) vdev_validate(vd, spa_last_synced_txg(spa));
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

void
vdev_metaslab_set_size(vdev_t *vd)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}

void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(!vd->vdev_ishole);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in 'maxfaults' or more children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
 */
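/*
 * Illustrative example (editorial addition, not part of the original source):
 * in vdev_dtl_reassess() below, a raidz2 top-level vdev uses
 * minref = vdev_nparity + 1 = 3, so a txg enters the parent's DTL_MISSING
 * only when at least three children report it in their DTL_OUTAGE maps (the
 * child map that feeds the parent's DTL_MISSING); with two or fewer missing
 * copies the data is still reconstructible from parity.
 */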
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(sm->sm_lock);
	if (sm->sm_space != 0)
		dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}

boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(sm->sm_lock);
	empty = (sm->sm_space == 0);
	mutex_exit(sm->sm_lock);

	return (empty);
}

void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		mutex_enter(&vd->vdev_dtl_lock);
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started ||
		    (scn && scn->scn_phys.scn_errors == 0))) {
			/*
			 * We completed a scrub up to scrub_txg. If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl. Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the beginning
			 * of the DTL_MISSING map, we put it into a reference
			 * tree and then add a segment with refcnt -1 that
			 * covers the range [0, scrub_txg). This means
			 * that each txg in that range has refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2. We then convert
			 * the reference tree into the new DTL_MISSING map.
			 */
			space_map_ref_create(&reftree);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_SCRUB], 2);
			space_map_ref_generate_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_destroy(&reftree);
		}
		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
		mutex_exit(&vd->vdev_dtl_lock);

		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
		if (t == DTL_SCRUB)
			continue;			/* leaf vdevs only */
		if (t == DTL_PARTIAL)
			minref = 1;			/* i.e. non-zero */
		else if (vd->vdev_nparity != 0)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
		space_map_ref_create(&reftree);
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
		space_map_ref_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}

static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
		return (error);

	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(db->db_data, smo, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
	    NULL, SM_ALLOC, smo, mos);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
	objset_t *mos = spa->spa_meta_objset;
	space_map_t smsync;
	kmutex_t smlock;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(mos, smo->smo_object, tx);
			ASSERT0(err);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_walk(sm, space_map_add, &smsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(smo, mos, tx);
	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);

	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

boolean_t
vdev_dtl_required(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint8_t cant_read = vd->vdev_cant_read;
	boolean_t required;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == spa->spa_root_vdev || vd == tvd)
		return (B_TRUE);

	/*
	 * Temporarily mark the device as unreadable, and then determine
	 * whether this results in any DTL outages in the top-level vdev.
	 * If not, we can safely offline/detach/remove the device.
	 */
	vd->vdev_cant_read = B_TRUE;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
	vd->vdev_cant_read = cant_read;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);

	if (!required && zio_injection_enabled)
		required = !!zio_handle_device_injection(vd, NULL, ECHILD);

	return (required);
}

boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
		    vdev_writeable(vd)) {
			space_seg_t *ss;

			ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
			thismin = ss->ss_start - 1;
			ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
			thismax = ss->ss_end;
			needed = B_TRUE;
		}
		mutex_exit(&vd->vdev_dtl_lock);
	} else {
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			uint64_t cmin, cmax;

			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
				thismin = MIN(thismin, cmin);
				thismax = MAX(thismax, cmax);
				needed = B_TRUE;
			}
		}
	}

	if (needed && minp) {
		*minp = thismin;
		*maxp = thismax;
	}
	return (needed);
}

void
vdev_load(vdev_t *vd)
{
	/*
	 * Recursively load all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top && !vd->vdev_ishole &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
}

int
vdev_validate_aux(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if (!vdev_readable(vd))
		return (0);

	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    !SPA_VERSION_IS_SUPPORTED(version) ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	/*
	 * We don't actually check the pool state here. If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

void
vdev_remove(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	dmu_tx_t *tx;

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (vd->vdev_dtl_smo.smo_object) {
		ASSERT0(vd->vdev_dtl_smo.smo_alloc);
		(void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
		vd->vdev_dtl_smo.smo_object = 0;
	}

	if (vd->vdev_ms != NULL) {
		for (int m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp == NULL || msp->ms_smo.smo_object == 0)
				continue;

			ASSERT0(msp->ms_smo.smo_alloc);
			(void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
			msp->ms_smo.smo_object = 0;
		}
	}

	if (vd->vdev_ms_array) {
		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
		vd->vdev_ms_array = 0;
		vd->vdev_ms_shift = 0;
	}
	dmu_tx_commit(tx);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;
	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));

	ASSERT(!vd->vdev_ishole);

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);

	if (reassess)
		metaslab_sync_reassess(vd->vdev_mg);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
		ASSERT(vd == vd->vdev_top);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
		dmu_tx_commit(tx);
	}

	/*
	 * Remove the metadata associated with this vdev once it's empty.
empty. 02087 */ 02088 if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) 02089 vdev_remove(vd, txg); 02090 02091 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 02092 metaslab_sync(msp, txg); 02093 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 02094 } 02095 02096 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 02097 vdev_dtl_sync(lvd, txg); 02098 02099 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 02100 } 02101 02102 uint64_t 02103 vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 02104 { 02105 return (vd->vdev_ops->vdev_op_asize(vd, psize)); 02106 } 02107 02112 int 02113 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 02114 { 02115 vdev_t *vd, *tvd; 02116 02117 spa_vdev_state_enter(spa, SCL_NONE); 02118 02119 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 02120 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 02121 02122 if (!vd->vdev_ops->vdev_op_leaf) 02123 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 02124 02125 tvd = vd->vdev_top; 02126 02127 /* 02128 * We don't directly use the aux state here, but if we do a 02129 * vdev_reopen(), we need this value to be present to remember why we 02130 * were faulted. 02131 */ 02132 vd->vdev_label_aux = aux; 02133 02134 /* 02135 * Faulted state takes precedence over degraded. 02136 */ 02137 vd->vdev_delayed_close = B_FALSE; 02138 vd->vdev_faulted = 1ULL; 02139 vd->vdev_degraded = 0ULL; 02140 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 02141 02142 /* 02143 * If this device has the only valid copy of the data, then 02144 * back off and simply mark the vdev as degraded instead. 02145 */ 02146 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 02147 vd->vdev_degraded = 1ULL; 02148 vd->vdev_faulted = 0ULL; 02149 02150 /* 02151 * If we reopen the device and it's not dead, only then do we 02152 * mark it degraded. 02153 */ 02154 vdev_reopen(tvd); 02155 02156 if (vdev_readable(vd)) 02157 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 02158 } 02159 02160 return (spa_vdev_state_exit(spa, vd, 0)); 02161 } 02162 02168 int 02169 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 02170 { 02171 vdev_t *vd; 02172 02173 spa_vdev_state_enter(spa, SCL_NONE); 02174 02175 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 02176 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 02177 02178 if (!vd->vdev_ops->vdev_op_leaf) 02179 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 02180 02181 /* 02182 * If the vdev is already faulted, then don't do anything. 
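vdev_fault() above marks the leaf faulted, clears the degraded bit, and then backs off to a plain degraded state when vdev_dtl_required() says the leaf holds the only valid copy of some data. The following is a minimal user-space sketch of that precedence rule; the toy_vdev type and the last_copy field are invented stand-ins, not the kernel structures.

    #include <stdint.h>
    #include <stdio.h>

    /* Toy stand-ins for the real vdev state fields. */
    typedef struct toy_vdev {
            uint64_t        faulted;        /* analogous to vdev_faulted */
            uint64_t        degraded;       /* analogous to vdev_degraded */
            int             last_copy;      /* pretend vdev_dtl_required() result */
    } toy_vdev_t;

    /*
     * Faulted takes precedence over degraded, but if this device holds
     * the only valid copy of some data we back off and only degrade it.
     */
    static void
    toy_fault(toy_vdev_t *vd)
    {
            vd->faulted = 1ULL;
            vd->degraded = 0ULL;

            if (vd->last_copy) {            /* vdev_dtl_required() analogue */
                    vd->degraded = 1ULL;
                    vd->faulted = 0ULL;
            }
    }

    int
    main(void)
    {
            toy_vdev_t a = { 0, 0, 0 }, b = { 0, 0, 1 };

            toy_fault(&a);
            toy_fault(&b);
            printf("a: faulted=%ju degraded=%ju\n",
                (uintmax_t)a.faulted, (uintmax_t)a.degraded);
            printf("b: faulted=%ju degraded=%ju\n",
                (uintmax_t)b.faulted, (uintmax_t)b.degraded);
            return (0);
    }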
02183 */ 02184 if (vd->vdev_faulted || vd->vdev_degraded) 02185 return (spa_vdev_state_exit(spa, NULL, 0)); 02186 02187 vd->vdev_degraded = 1ULL; 02188 if (!vdev_is_dead(vd)) 02189 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 02190 aux); 02191 02192 return (spa_vdev_state_exit(spa, vd, 0)); 02193 } 02194 02203 int 02204 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) 02205 { 02206 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; 02207 02208 spa_vdev_state_enter(spa, SCL_NONE); 02209 02210 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 02211 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 02212 02213 if (!vd->vdev_ops->vdev_op_leaf) 02214 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 02215 02216 tvd = vd->vdev_top; 02217 vd->vdev_offline = B_FALSE; 02218 vd->vdev_tmpoffline = B_FALSE; 02219 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 02220 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 02221 02222 /* XXX - L2ARC 1.0 does not support expansion */ 02223 if (!vd->vdev_aux) { 02224 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 02225 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); 02226 } 02227 02228 vdev_reopen(tvd); 02229 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 02230 02231 if (!vd->vdev_aux) { 02232 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 02233 pvd->vdev_expanding = B_FALSE; 02234 } 02235 02236 if (newstate) 02237 *newstate = vd->vdev_state; 02238 if ((flags & ZFS_ONLINE_UNSPARE) && 02239 !vdev_is_dead(vd) && vd->vdev_parent && 02240 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 02241 vd->vdev_parent->vdev_child[0] == vd) 02242 vd->vdev_unspare = B_TRUE; 02243 02244 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { 02245 02246 /* XXX - L2ARC 1.0 does not support expansion */ 02247 if (vd->vdev_aux) 02248 return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 02249 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 02250 } 02251 return (spa_vdev_state_exit(spa, vd, 0)); 02252 } 02253 02254 static int 02255 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) 02256 { 02257 vdev_t *vd, *tvd; 02258 int error = 0; 02259 uint64_t generation; 02260 metaslab_group_t *mg; 02261 02262 top: 02263 spa_vdev_state_enter(spa, SCL_ALLOC); 02264 02265 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 02266 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 02267 02268 if (!vd->vdev_ops->vdev_op_leaf) 02269 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 02270 02271 tvd = vd->vdev_top; 02272 mg = tvd->vdev_mg; 02273 generation = spa->spa_config_generation + 1; 02274 02275 /* 02276 * If the device isn't already offline, try to offline it. 02277 */ 02278 if (!vd->vdev_offline) { 02279 /* 02280 * If this device has the only valid copy of some data, 02281 * don't allow it to be offlined. Log devices are always 02282 * expendable. 02283 */ 02284 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 02285 vdev_dtl_required(vd)) 02286 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 02287 02288 /* 02289 * If the top-level is a slog and it has had allocations 02290 * then proceed. We check that the vdev's metaslab group 02291 * is not NULL since it's possible that we may have just 02292 * added this vdev but not yet initialized its metaslabs. 02293 */ 02294 if (tvd->vdev_islog && mg != NULL) { 02295 /* 02296 * Prevent any future allocations. 
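vdev_online() above normalizes request flags into per-vdev booleans with the !!(flags & FLAG) idiom and, for ZFS_ONLINE_EXPAND, walks every ancestor up to (but not including) the root vdev. A small sketch of both idioms, assuming made-up flag values and a toy parent chain rather than the real vdev tree:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Illustrative flag bits; not the real ZFS_ONLINE_* values. */
    #define TOY_ONLINE_CHECKREMOVE  0x1
    #define TOY_ONLINE_EXPAND       0x2

    typedef struct toy_vdev {
            struct toy_vdev *parent;
            int             checkremove;
            int             expanding;
    } toy_vdev_t;

    static void
    toy_online(toy_vdev_t *vd, toy_vdev_t *root, uint64_t flags)
    {
            toy_vdev_t *pvd;

            /* !! collapses any set bit down to exactly 0 or 1. */
            vd->checkremove = !!(flags & TOY_ONLINE_CHECKREMOVE);

            /* Mark every ancestor below the root as expanding. */
            for (pvd = vd; pvd != root; pvd = pvd->parent)
                    pvd->expanding = !!(flags & TOY_ONLINE_EXPAND);
    }

    int
    main(void)
    {
            toy_vdev_t root = { NULL, 0, 0 };
            toy_vdev_t top = { &root, 0, 0 };
            toy_vdev_t leaf = { &top, 0, 0 };

            toy_online(&leaf, &root, TOY_ONLINE_EXPAND);
            printf("leaf=%d top=%d root=%d\n",
                leaf.expanding, top.expanding, root.expanding);
            return (0);
    }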
02297 */ 02298 metaslab_group_passivate(mg); 02299 (void) spa_vdev_state_exit(spa, vd, 0); 02300 02301 error = spa_offline_log(spa); 02302 02303 spa_vdev_state_enter(spa, SCL_ALLOC); 02304 02305 /* 02306 * Check to see if the config has changed. 02307 */ 02308 if (error || generation != spa->spa_config_generation) { 02309 metaslab_group_activate(mg); 02310 if (error) 02311 return (spa_vdev_state_exit(spa, 02312 vd, error)); 02313 (void) spa_vdev_state_exit(spa, vd, 0); 02314 goto top; 02315 } 02316 ASSERT0(tvd->vdev_stat.vs_alloc); 02317 } 02318 02319 /* 02320 * Offline this device and reopen its top-level vdev. 02321 * If the top-level vdev is a log device then just offline 02322 * it. Otherwise, if this action results in the top-level 02323 * vdev becoming unusable, undo it and fail the request. 02324 */ 02325 vd->vdev_offline = B_TRUE; 02326 vdev_reopen(tvd); 02327 02328 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 02329 vdev_is_dead(tvd)) { 02330 vd->vdev_offline = B_FALSE; 02331 vdev_reopen(tvd); 02332 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 02333 } 02334 02335 /* 02336 * Add the device back into the metaslab rotor so that 02337 * once we online the device it's open for business. 02338 */ 02339 if (tvd->vdev_islog && mg != NULL) 02340 metaslab_group_activate(mg); 02341 } 02342 02343 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 02344 02345 return (spa_vdev_state_exit(spa, vd, 0)); 02346 } 02347 02348 int 02349 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 02350 { 02351 int error; 02352 02353 mutex_enter(&spa->spa_vdev_top_lock); 02354 error = vdev_offline_locked(spa, guid, flags); 02355 mutex_exit(&spa->spa_vdev_top_lock); 02356 02357 return (error); 02358 } 02359 02365 void 02366 vdev_clear(spa_t *spa, vdev_t *vd) 02367 { 02368 vdev_t *rvd = spa->spa_root_vdev; 02369 02370 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 02371 02372 if (vd == NULL) 02373 vd = rvd; 02374 02375 vd->vdev_stat.vs_read_errors = 0; 02376 vd->vdev_stat.vs_write_errors = 0; 02377 vd->vdev_stat.vs_checksum_errors = 0; 02378 02379 for (int c = 0; c < vd->vdev_children; c++) 02380 vdev_clear(spa, vd->vdev_child[c]); 02381 02382 /* 02383 * If we're in the FAULTED state or have experienced failed I/O, then 02384 * clear the persistent state and attempt to reopen the device. We 02385 * also mark the vdev config dirty, so that the new faulted state is 02386 * written out to disk. 02387 */ 02388 if (vd->vdev_faulted || vd->vdev_degraded || 02389 !vdev_readable(vd) || !vdev_writeable(vd)) { 02390 02391 /* 02392 * When reopening in reponse to a clear event, it may be due to 02393 * a fmadm repair request. In this case, if the device is 02394 * still broken, we want to still post the ereport again. 02395 */ 02396 vd->vdev_forcefault = B_TRUE; 02397 02398 vd->vdev_faulted = vd->vdev_degraded = 0ULL; 02399 vd->vdev_cant_read = B_FALSE; 02400 vd->vdev_cant_write = B_FALSE; 02401 02402 vdev_reopen(vd == rvd ? rvd : vd->vdev_top); 02403 02404 vd->vdev_forcefault = B_FALSE; 02405 02406 if (vd != rvd && vdev_writeable(vd->vdev_top)) 02407 vdev_state_dirty(vd->vdev_top); 02408 02409 if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 02410 spa_async_request(spa, SPA_ASYNC_RESILVER); 02411 02412 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 02413 } 02414 02415 /* 02416 * When clearing a FMA-diagnosed fault, we always want to 02417 * unspare the device, as we assume that the original spare was 02418 * done in response to the FMA fault. 
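Because vdev_offline_locked() has to drop the vdev state lock while it evacuates a log device, it remembers the expected spa_config_generation and restarts from the top label when the configuration changed in the meantime. Below is a condensed sketch of that optimistic snapshot-and-retry pattern using a plain pthread mutex and counter; the real code also accounts for the generation bump its own lock exit performs, which the sketch glosses over.

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t  cfg_lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t         cfg_generation; /* bumped on every config change */

    /* Stand-in for the slow work that must run without the lock held. */
    static int
    do_unlocked_work(void)
    {
            return (0);
    }

    static int
    offline_with_retry(void)
    {
            uint64_t generation;
            int error;

    top:
            pthread_mutex_lock(&cfg_lock);
            generation = cfg_generation;    /* snapshot before dropping the lock */

            pthread_mutex_unlock(&cfg_lock);
            error = do_unlocked_work();
            pthread_mutex_lock(&cfg_lock);

            /* If the config moved underneath us, undo and start over. */
            if (error != 0 || generation != cfg_generation) {
                    pthread_mutex_unlock(&cfg_lock);
                    if (error != 0)
                            return (error);
                    goto top;
            }

            /* ... the actual offline transition would happen here ... */
            pthread_mutex_unlock(&cfg_lock);
            return (0);
    }

    int
    main(void)
    {
            printf("offline_with_retry() -> %d\n", offline_with_retry());
            return (0);
    }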
02419 */ 02420 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 02421 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 02422 vd->vdev_parent->vdev_child[0] == vd) 02423 vd->vdev_unspare = B_TRUE; 02424 } 02425 02426 boolean_t 02427 vdev_is_dead(vdev_t *vd) 02428 { 02429 /* 02430 * Holes and missing devices are always considered "dead". 02431 * This simplifies the code since we don't have to check for 02432 * these types of devices in the various code paths. 02433 * Instead we rely on the fact that we skip over dead devices 02434 * before issuing I/O to them. 02435 */ 02436 return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || 02437 vd->vdev_ops == &vdev_missing_ops); 02438 } 02439 02440 boolean_t 02441 vdev_readable(vdev_t *vd) 02442 { 02443 return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 02444 } 02445 02446 boolean_t 02447 vdev_writeable(vdev_t *vd) 02448 { 02449 return (!vdev_is_dead(vd) && !vd->vdev_cant_write); 02450 } 02451 02452 boolean_t 02453 vdev_allocatable(vdev_t *vd) 02454 { 02455 uint64_t state = vd->vdev_state; 02456 02457 /* 02458 * We currently allow allocations from vdevs which may be in the 02459 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 02460 * fails to reopen then we'll catch it later when we're holding 02461 * the proper locks. Note that we have to get the vdev state 02462 * in a local variable because although it changes atomically, 02463 * we're asking two separate questions about it. 02464 */ 02465 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 02466 !vd->vdev_cant_write && !vd->vdev_ishole); 02467 } 02468 02469 boolean_t 02470 vdev_accessible(vdev_t *vd, zio_t *zio) 02471 { 02472 ASSERT(zio->io_vd == vd); 02473 02474 if (vdev_is_dead(vd) || vd->vdev_remove_wanted) 02475 return (B_FALSE); 02476 02477 if (zio->io_type == ZIO_TYPE_READ) 02478 return (!vd->vdev_cant_read); 02479 02480 if (zio->io_type == ZIO_TYPE_WRITE) 02481 return (!vd->vdev_cant_write); 02482 02483 return (B_TRUE); 02484 } 02485 02486 /* 02487 * Get statistics for the given vdev. 02488 */ 02489 void 02490 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 02491 { 02492 vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 02493 02494 mutex_enter(&vd->vdev_stat_lock); 02495 bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 02496 vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 02497 vs->vs_state = vd->vdev_state; 02498 vs->vs_rsize = vdev_get_min_asize(vd); 02499 if (vd->vdev_ops->vdev_op_leaf) 02500 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 02501 vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; 02502 mutex_exit(&vd->vdev_stat_lock); 02503 02504 /* 02505 * If we're getting stats on the root vdev, aggregate the I/O counts 02506 * over all top-level vdevs (i.e. the direct children of the root). 
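The vdev_allocatable() comment above explains why vdev_state is copied into a local: two separate questions are asked of a value that can change atomically underneath the caller, so one snapshot keeps the answers mutually consistent. A tiny illustration of the idiom; the enum values here are invented and only their ordering matters:

    #include <stdint.h>
    #include <stdio.h>

    /* Invented state ordering, mirroring the shape of vdev_state_t. */
    enum toy_state { TOY_CLOSED = 1, TOY_CANT_OPEN, TOY_DEGRADED, TOY_HEALTHY };

    static volatile uint64_t shared_state = TOY_HEALTHY;

    static int
    toy_allocatable(void)
    {
            /*
             * Read the shared value exactly once.  Testing shared_state
             * twice could see two different states if another thread
             * changed it between the comparisons.
             */
            uint64_t state = shared_state;

            return (!(state < TOY_DEGRADED && state != TOY_CLOSED));
    }

    int
    main(void)
    {
            shared_state = TOY_CANT_OPEN;
            printf("cant_open allocatable? %d\n", toy_allocatable());
            shared_state = TOY_CLOSED;
            printf("closed allocatable? %d\n", toy_allocatable());
            return (0);
    }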
02507 */ 02508 if (vd == rvd) { 02509 for (int c = 0; c < rvd->vdev_children; c++) { 02510 vdev_t *cvd = rvd->vdev_child[c]; 02511 vdev_stat_t *cvs = &cvd->vdev_stat; 02512 02513 mutex_enter(&vd->vdev_stat_lock); 02514 for (int t = 0; t < ZIO_TYPES; t++) { 02515 vs->vs_ops[t] += cvs->vs_ops[t]; 02516 vs->vs_bytes[t] += cvs->vs_bytes[t]; 02517 } 02518 cvs->vs_scan_removing = cvd->vdev_removing; 02519 mutex_exit(&vd->vdev_stat_lock); 02520 } 02521 } 02522 } 02523 02524 void 02525 vdev_clear_stats(vdev_t *vd) 02526 { 02527 mutex_enter(&vd->vdev_stat_lock); 02528 vd->vdev_stat.vs_space = 0; 02529 vd->vdev_stat.vs_dspace = 0; 02530 vd->vdev_stat.vs_alloc = 0; 02531 mutex_exit(&vd->vdev_stat_lock); 02532 } 02533 02534 void 02535 vdev_scan_stat_init(vdev_t *vd) 02536 { 02537 vdev_stat_t *vs = &vd->vdev_stat; 02538 02539 for (int c = 0; c < vd->vdev_children; c++) 02540 vdev_scan_stat_init(vd->vdev_child[c]); 02541 02542 mutex_enter(&vd->vdev_stat_lock); 02543 vs->vs_scan_processed = 0; 02544 mutex_exit(&vd->vdev_stat_lock); 02545 } 02546 02547 void 02548 vdev_stat_update(zio_t *zio, uint64_t psize) 02549 { 02550 spa_t *spa = zio->io_spa; 02551 vdev_t *rvd = spa->spa_root_vdev; 02552 vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; 02553 vdev_t *pvd; 02554 uint64_t txg = zio->io_txg; 02555 vdev_stat_t *vs = &vd->vdev_stat; 02556 zio_type_t type = zio->io_type; 02557 int flags = zio->io_flags; 02558 02559 /* 02560 * If this i/o is a gang leader, it didn't do any actual work. 02561 */ 02562 if (zio->io_gang_tree) 02563 return; 02564 02565 if (zio->io_error == 0) { 02566 /* 02567 * If this is a root i/o, don't count it -- we've already 02568 * counted the top-level vdevs, and vdev_get_stats() will 02569 * aggregate them when asked. This reduces contention on 02570 * the root vdev_stat_lock and implicitly handles blocks 02571 * that compress away to holes, for which there is no i/o. 02572 * (Holes never create vdev children, so all the counters 02573 * remain zero, which is what we want.) 02574 * 02575 * Note: this only applies to successful i/o (io_error == 0) 02576 * because unlike i/o counts, errors are not additive. 02577 * When reading a ditto block, for example, failure of 02578 * one top-level vdev does not imply a root-level error. 02579 */ 02580 if (vd == rvd) 02581 return; 02582 02583 ASSERT(vd == zio->io_vd); 02584 02585 if (flags & ZIO_FLAG_IO_BYPASS) 02586 return; 02587 02588 mutex_enter(&vd->vdev_stat_lock); 02589 02590 if (flags & ZIO_FLAG_IO_REPAIR) { 02591 if (flags & ZIO_FLAG_SCAN_THREAD) { 02592 dsl_scan_phys_t *scn_phys = 02593 &spa->spa_dsl_pool->dp_scan->scn_phys; 02594 uint64_t *processed = &scn_phys->scn_processed; 02595 02596 /* XXX cleanup? */ 02597 if (vd->vdev_ops->vdev_op_leaf) 02598 atomic_add_64(processed, psize); 02599 vs->vs_scan_processed += psize; 02600 } 02601 02602 if (flags & ZIO_FLAG_SELF_HEAL) 02603 vs->vs_self_healed += psize; 02604 } 02605 02606 vs->vs_ops[type]++; 02607 vs->vs_bytes[type] += psize; 02608 02609 mutex_exit(&vd->vdev_stat_lock); 02610 return; 02611 } 02612 02613 if (flags & ZIO_FLAG_SPECULATIVE) 02614 return; 02615 02616 /* 02617 * If this is an I/O error that is going to be retried, then ignore the 02618 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as 02619 * hard errors, when in reality they can happen for any number of 02620 * innocuous reasons (bus resets, MPxIO link failure, etc). 
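vdev_stat_update() bumps vs_ops[type] and vs_bytes[type] under vdev_stat_lock for each successful I/O, and vdev_get_stats() folds the top-level children into the root's view on demand. The sketch below models that producer/aggregator split with a reduced set of I/O types; note it locks each child while summing, which is a simplification rather than the exact locking discipline used here.

    #include <pthread.h>
    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    #define TOY_IO_TYPES    2       /* read, write; the real code has more */

    typedef struct toy_stat {
            pthread_mutex_t lock;
            uint64_t        ops[TOY_IO_TYPES];
            uint64_t        bytes[TOY_IO_TYPES];
    } toy_stat_t;

    /* Producer side: one successful I/O of 'size' bytes of type 't'. */
    static void
    toy_stat_update(toy_stat_t *st, int t, uint64_t size)
    {
            pthread_mutex_lock(&st->lock);
            st->ops[t]++;
            st->bytes[t] += size;
            pthread_mutex_unlock(&st->lock);
    }

    /* Consumer side: aggregate the children into a caller-supplied copy. */
    static void
    toy_get_root_stats(toy_stat_t *children, int nchildren, toy_stat_t *out)
    {
            memset(out->ops, 0, sizeof (out->ops));
            memset(out->bytes, 0, sizeof (out->bytes));

            for (int c = 0; c < nchildren; c++) {
                    pthread_mutex_lock(&children[c].lock);
                    for (int t = 0; t < TOY_IO_TYPES; t++) {
                            out->ops[t] += children[c].ops[t];
                            out->bytes[t] += children[c].bytes[t];
                    }
                    pthread_mutex_unlock(&children[c].lock);
            }
    }

    int
    main(void)
    {
            toy_stat_t kids[2] = {
                    { PTHREAD_MUTEX_INITIALIZER, { 0 }, { 0 } },
                    { PTHREAD_MUTEX_INITIALIZER, { 0 }, { 0 } },
            };
            toy_stat_t root = { PTHREAD_MUTEX_INITIALIZER, { 0 }, { 0 } };

            toy_stat_update(&kids[0], 0, 4096);
            toy_stat_update(&kids[1], 1, 8192);
            toy_get_root_stats(kids, 2, &root);
            printf("root reads=%ju writes=%ju\n",
                (uintmax_t)root.ops[0], (uintmax_t)root.ops[1]);
            return (0);
    }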
02621 */ 02622 if (zio->io_error == EIO && 02623 !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 02624 return; 02625 02626 /* 02627 * Intent logs writes won't propagate their error to the root 02628 * I/O so don't mark these types of failures as pool-level 02629 * errors. 02630 */ 02631 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 02632 return; 02633 02634 mutex_enter(&vd->vdev_stat_lock); 02635 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { 02636 if (zio->io_error == ECKSUM) 02637 vs->vs_checksum_errors++; 02638 else 02639 vs->vs_read_errors++; 02640 } 02641 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) 02642 vs->vs_write_errors++; 02643 mutex_exit(&vd->vdev_stat_lock); 02644 02645 if (type == ZIO_TYPE_WRITE && txg != 0 && 02646 (!(flags & ZIO_FLAG_IO_REPAIR) || 02647 (flags & ZIO_FLAG_SCAN_THREAD) || 02648 spa->spa_claiming)) { 02649 /* 02650 * This is either a normal write (not a repair), or it's 02651 * a repair induced by the scrub thread, or it's a repair 02652 * made by zil_claim() during spa_load() in the first txg. 02653 * In the normal case, we commit the DTL change in the same 02654 * txg as the block was born. In the scrub-induced repair 02655 * case, we know that scrubs run in first-pass syncing context, 02656 * so we commit the DTL change in spa_syncing_txg(spa). 02657 * In the zil_claim() case, we commit in spa_first_txg(spa). 02658 * 02659 * We currently do not make DTL entries for failed spontaneous 02660 * self-healing writes triggered by normal (non-scrubbing) 02661 * reads, because we have no transactional context in which to 02662 * do so -- and it's not clear that it'd be desirable anyway. 02663 */ 02664 if (vd->vdev_ops->vdev_op_leaf) { 02665 uint64_t commit_txg = txg; 02666 if (flags & ZIO_FLAG_SCAN_THREAD) { 02667 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 02668 ASSERT(spa_sync_pass(spa) == 1); 02669 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); 02670 commit_txg = spa_syncing_txg(spa); 02671 } else if (spa->spa_claiming) { 02672 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 02673 commit_txg = spa_first_txg(spa); 02674 } 02675 ASSERT(commit_txg >= spa_syncing_txg(spa)); 02676 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) 02677 return; 02678 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 02679 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); 02680 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 02681 } 02682 if (vd != rvd) 02683 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); 02684 } 02685 } 02686 02691 void 02692 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, 02693 int64_t space_delta) 02694 { 02695 int64_t dspace_delta = space_delta; 02696 spa_t *spa = vd->vdev_spa; 02697 vdev_t *rvd = spa->spa_root_vdev; 02698 metaslab_group_t *mg = vd->vdev_mg; 02699 metaslab_class_t *mc = mg ? mg->mg_class : NULL; 02700 02701 ASSERT(vd == vd->vdev_top); 02702 02703 /* 02704 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 02705 * factor. We must calculate this here and not at the root vdev 02706 * because the root vdev's psize-to-asize is simply the max of its 02707 * childrens', thus not accurate enough for us. 
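The DTL block above chooses which txg a dirty-time-log change is committed in: the block's birth txg for ordinary writes, the currently syncing txg for scrub-induced repairs, and the pool's first txg for zil_claim()-time repairs. A compact sketch of just that selection, with illustrative flag bits in place of the real ZIO_FLAG_* values:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative flag bits; not the real ZIO_FLAG_* values. */
    #define TOY_IO_REPAIR   0x1
    #define TOY_SCAN_THREAD 0x2

    /*
     * Pick the txg in which a DTL change should be committed:
     *   - normal writes: the txg the block was born in,
     *   - scrub-induced repairs: the currently syncing txg,
     *   - claim-time repairs (pool load): the pool's first txg.
     */
    static uint64_t
    toy_commit_txg(uint64_t birth_txg, int flags, int claiming,
        uint64_t syncing_txg, uint64_t first_txg)
    {
            if (flags & TOY_SCAN_THREAD)
                    return (syncing_txg);
            if (claiming && (flags & TOY_IO_REPAIR))
                    return (first_txg);
            return (birth_txg);
    }

    int
    main(void)
    {
            printf("normal write -> %ju\n",
                (uintmax_t)toy_commit_txg(100, 0, 0, 205, 4));
            printf("scrub repair -> %ju\n",
                (uintmax_t)toy_commit_txg(100,
                TOY_IO_REPAIR | TOY_SCAN_THREAD, 0, 205, 4));
            printf("claim repair -> %ju\n",
                (uintmax_t)toy_commit_txg(100, TOY_IO_REPAIR, 1, 205, 4));
            return (0);
    }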
02708 */ 02709 ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 02710 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); 02711 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 02712 vd->vdev_deflate_ratio; 02713 02714 mutex_enter(&vd->vdev_stat_lock); 02715 vd->vdev_stat.vs_alloc += alloc_delta; 02716 vd->vdev_stat.vs_space += space_delta; 02717 vd->vdev_stat.vs_dspace += dspace_delta; 02718 mutex_exit(&vd->vdev_stat_lock); 02719 02720 if (mc == spa_normal_class(spa)) { 02721 mutex_enter(&rvd->vdev_stat_lock); 02722 rvd->vdev_stat.vs_alloc += alloc_delta; 02723 rvd->vdev_stat.vs_space += space_delta; 02724 rvd->vdev_stat.vs_dspace += dspace_delta; 02725 mutex_exit(&rvd->vdev_stat_lock); 02726 } 02727 02728 if (mc != NULL) { 02729 ASSERT(rvd == vd->vdev_parent); 02730 ASSERT(vd->vdev_ms_count != 0); 02731 02732 metaslab_class_space_update(mc, 02733 alloc_delta, defer_delta, space_delta, dspace_delta); 02734 } 02735 } 02736 02742 void 02743 vdev_config_dirty(vdev_t *vd) 02744 { 02745 spa_t *spa = vd->vdev_spa; 02746 vdev_t *rvd = spa->spa_root_vdev; 02747 int c; 02748 02749 ASSERT(spa_writeable(spa)); 02750 02751 /* 02752 * If this is an aux vdev (as with l2cache and spare devices), then we 02753 * update the vdev config manually and set the sync flag. 02754 */ 02755 if (vd->vdev_aux != NULL) { 02756 spa_aux_vdev_t *sav = vd->vdev_aux; 02757 nvlist_t **aux; 02758 uint_t naux; 02759 02760 for (c = 0; c < sav->sav_count; c++) { 02761 if (sav->sav_vdevs[c] == vd) 02762 break; 02763 } 02764 02765 if (c == sav->sav_count) { 02766 /* 02767 * We're being removed. There's nothing more to do. 02768 */ 02769 ASSERT(sav->sav_sync == B_TRUE); 02770 return; 02771 } 02772 02773 sav->sav_sync = B_TRUE; 02774 02775 if (nvlist_lookup_nvlist_array(sav->sav_config, 02776 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 02777 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 02778 ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); 02779 } 02780 02781 ASSERT(c < naux); 02782 02783 /* 02784 * Setting the nvlist in the middle if the array is a little 02785 * sketchy, but it will work. 02786 */ 02787 nvlist_free(aux[c]); 02788 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); 02789 02790 return; 02791 } 02792 02793 /* 02794 * The dirty list is protected by the SCL_CONFIG lock. The caller 02795 * must either hold SCL_CONFIG as writer, or must be the sync thread 02796 * (which holds SCL_CONFIG as reader). There's only one sync thread, 02797 * so this is sufficient to ensure mutual exclusion. 
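vdev_space_update() converts the raw space delta into deflated space with (space_delta >> SPA_MINBLOCKSHIFT) * vdev_deflate_ratio, applying the inverse of the top-level vdev's psize-to-asize expansion. The arithmetic sketch below uses made-up ratio values; the only property relied on is that a ratio of 1 << SPA_MINBLOCKSHIFT leaves the delta unchanged.

    #include <stdint.h>
    #include <stdio.h>

    #define TOY_MINBLOCKSHIFT       9       /* SPA_MINBLOCKSHIFT: 512-byte units */

    /*
     * Mirror of the delta computation in vdev_space_update(): the delta
     * is expressed in 512-byte units and scaled by the top-level vdev's
     * deflate ratio so that dspace reflects usable (deflated) space.
     */
    static int64_t
    toy_deflate(int64_t space_delta, uint64_t deflate_ratio)
    {
            return ((space_delta >> TOY_MINBLOCKSHIFT) * (int64_t)deflate_ratio);
    }

    int
    main(void)
    {
            uint64_t identity = 1 << TOY_MINBLOCKSHIFT;     /* no expansion */
            uint64_t raidzish = 384;        /* illustrative ~0.75x deflation */
            int64_t raw = 1 << 20;          /* 1 MiB of allocated (asize) bytes */

            printf("identity: %jd -> %jd\n", (intmax_t)raw,
                (intmax_t)toy_deflate(raw, identity));
            printf("deflated: %jd -> %jd\n", (intmax_t)raw,
                (intmax_t)toy_deflate(raw, raidzish));
            return (0);
    }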
02798 */ 02799 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 02800 (dsl_pool_sync_context(spa_get_dsl(spa)) && 02801 spa_config_held(spa, SCL_CONFIG, RW_READER))); 02802 02803 if (vd == rvd) { 02804 for (c = 0; c < rvd->vdev_children; c++) 02805 vdev_config_dirty(rvd->vdev_child[c]); 02806 } else { 02807 ASSERT(vd == vd->vdev_top); 02808 02809 if (!list_link_active(&vd->vdev_config_dirty_node) && 02810 !vd->vdev_ishole) 02811 list_insert_head(&spa->spa_config_dirty_list, vd); 02812 } 02813 } 02814 02815 void 02816 vdev_config_clean(vdev_t *vd) 02817 { 02818 spa_t *spa = vd->vdev_spa; 02819 02820 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 02821 (dsl_pool_sync_context(spa_get_dsl(spa)) && 02822 spa_config_held(spa, SCL_CONFIG, RW_READER))); 02823 02824 ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 02825 list_remove(&spa->spa_config_dirty_list, vd); 02826 } 02827 02834 void 02835 vdev_state_dirty(vdev_t *vd) 02836 { 02837 spa_t *spa = vd->vdev_spa; 02838 02839 ASSERT(spa_writeable(spa)); 02840 ASSERT(vd == vd->vdev_top); 02841 02842 /* 02843 * The state list is protected by the SCL_STATE lock. The caller 02844 * must either hold SCL_STATE as writer, or must be the sync thread 02845 * (which holds SCL_STATE as reader). There's only one sync thread, 02846 * so this is sufficient to ensure mutual exclusion. 02847 */ 02848 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 02849 (dsl_pool_sync_context(spa_get_dsl(spa)) && 02850 spa_config_held(spa, SCL_STATE, RW_READER))); 02851 02852 if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) 02853 list_insert_head(&spa->spa_state_dirty_list, vd); 02854 } 02855 02856 void 02857 vdev_state_clean(vdev_t *vd) 02858 { 02859 spa_t *spa = vd->vdev_spa; 02860 02861 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 02862 (dsl_pool_sync_context(spa_get_dsl(spa)) && 02863 spa_config_held(spa, SCL_STATE, RW_READER))); 02864 02865 ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 02866 list_remove(&spa->spa_state_dirty_list, vd); 02867 } 02868 02872 void 02873 vdev_propagate_state(vdev_t *vd) 02874 { 02875 spa_t *spa = vd->vdev_spa; 02876 vdev_t *rvd = spa->spa_root_vdev; 02877 int degraded = 0, faulted = 0; 02878 int corrupted = 0; 02879 vdev_t *child; 02880 02881 if (vd->vdev_children > 0) { 02882 for (int c = 0; c < vd->vdev_children; c++) { 02883 child = vd->vdev_child[c]; 02884 02885 /* 02886 * Don't factor holes into the decision. 02887 */ 02888 if (child->vdev_ishole) 02889 continue; 02890 02891 if (!vdev_readable(child) || 02892 (!vdev_writeable(child) && spa_writeable(spa))) { 02893 /* 02894 * Root special: if there is a top-level log 02895 * device, treat the root vdev as if it were 02896 * degraded. 02897 */ 02898 if (child->vdev_islog && vd == rvd) 02899 degraded++; 02900 else 02901 faulted++; 02902 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 02903 degraded++; 02904 } 02905 02906 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 02907 corrupted++; 02908 } 02909 02910 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 02911 02912 /* 02913 * Root special: if there is a top-level vdev that cannot be 02914 * opened due to corrupted metadata, then propagate the root 02915 * vdev's aux state as 'corrupt' rather than 'insufficient 02916 * replicas'. 
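vdev_propagate_state() classifies each non-hole child as faulted, degraded, or healthy, hands the counts to the type-specific vdev_op_state_change callback, and then recurses toward the root. The model below keeps the counting pass but substitutes a deliberately naive parent policy, since the real per-type rules (mirror, raidz, root) live in their own vdev ops:

    #include <stdio.h>

    enum toy_state { TOY_FAULTED, TOY_DEGRADED, TOY_HEALTHY };

    typedef struct toy_child {
            int     ishole;
            int     readable;
            int     state;          /* enum toy_state */
    } toy_child_t;

    /*
     * Stand-in for vdev_op_state_change(): a parent that has lost any
     * child is degraded here; the real policy depends on the vdev type
     * and its redundancy.
     */
    static int
    toy_parent_state(int faulted, int degraded)
    {
            if (faulted > 0 || degraded > 0)
                    return (TOY_DEGRADED);
            return (TOY_HEALTHY);
    }

    static int
    toy_propagate(toy_child_t *child, int nchildren)
    {
            int faulted = 0, degraded = 0;

            for (int c = 0; c < nchildren; c++) {
                    if (child[c].ishole)    /* holes never count */
                            continue;
                    if (!child[c].readable)
                            faulted++;
                    else if (child[c].state <= TOY_DEGRADED)
                            degraded++;
            }
            return (toy_parent_state(faulted, degraded));
    }

    int
    main(void)
    {
            toy_child_t kids[3] = {
                    { 0, 1, TOY_HEALTHY },
                    { 0, 0, TOY_FAULTED },
                    { 1, 0, TOY_FAULTED },  /* hole: ignored */
            };

            printf("parent state=%d\n", toy_propagate(kids, 3));
            return (0);
    }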
02917 */ 02918 if (corrupted && vd == rvd && 02919 rvd->vdev_state == VDEV_STATE_CANT_OPEN) 02920 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 02921 VDEV_AUX_CORRUPT_DATA); 02922 } 02923 02924 if (vd->vdev_parent) 02925 vdev_propagate_state(vd->vdev_parent); 02926 } 02927 02936 void 02937 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 02938 { 02939 uint64_t save_state; 02940 spa_t *spa = vd->vdev_spa; 02941 02942 if (state == vd->vdev_state) { 02943 vd->vdev_stat.vs_aux = aux; 02944 return; 02945 } 02946 02947 save_state = vd->vdev_state; 02948 02949 vd->vdev_state = state; 02950 vd->vdev_stat.vs_aux = aux; 02951 02952 /* 02953 * If we are setting the vdev state to anything but an open state, then 02954 * always close the underlying device unless the device has requested 02955 * a delayed close (i.e. we're about to remove or fault the device). 02956 * Otherwise, we keep accessible but invalid devices open forever. 02957 * We don't call vdev_close() itself, because that implies some extra 02958 * checks (offline, etc) that we don't want here. This is limited to 02959 * leaf devices, because otherwise closing the device will affect other 02960 * children. 02961 */ 02962 if (!vd->vdev_delayed_close && vdev_is_dead(vd) && 02963 vd->vdev_ops->vdev_op_leaf) 02964 vd->vdev_ops->vdev_op_close(vd); 02965 02966 /* 02967 * If we have brought this vdev back into service, we need 02968 * to notify fmd so that it can gracefully repair any outstanding 02969 * cases due to a missing device. We do this in all cases, even those 02970 * that probably don't correlate to a repaired fault. This is sure to 02971 * catch all cases, and we let the zfs-retire agent sort it out. If 02972 * this is a transient state it's OK, as the retire agent will 02973 * double-check the state of the vdev before repairing it. 02974 */ 02975 if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && 02976 vd->vdev_prevstate != state) 02977 zfs_post_state_change(spa, vd); 02978 02979 if (vd->vdev_removed && 02980 state == VDEV_STATE_CANT_OPEN && 02981 (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 02982 /* 02983 * If the previous state is set to VDEV_STATE_REMOVED, then this 02984 * device was previously marked removed and someone attempted to 02985 * reopen it. If this failed due to a nonexistent device, then 02986 * keep the device in the REMOVED state. We also let this be if 02987 * it is one of our special test online cases, which is only 02988 * attempting to online the device and shouldn't generate an FMA 02989 * fault. 02990 */ 02991 vd->vdev_state = VDEV_STATE_REMOVED; 02992 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 02993 } else if (state == VDEV_STATE_REMOVED) { 02994 vd->vdev_removed = B_TRUE; 02995 } else if (state == VDEV_STATE_CANT_OPEN) { 02996 /* 02997 * If we fail to open a vdev during an import or recovery, we 02998 * mark it as "not available", which signifies that it was 02999 * never there to begin with. Failure to open such a device 03000 * is not considered an error. 03001 */ 03002 if ((spa_load_state(spa) == SPA_LOAD_IMPORT || 03003 spa_load_state(spa) == SPA_LOAD_RECOVER) && 03004 vd->vdev_ops->vdev_op_leaf) 03005 vd->vdev_not_present = 1; 03006 03007 /* 03008 * Post the appropriate ereport. If the 'prevstate' field is 03009 * set to something other than VDEV_STATE_UNKNOWN, it indicates 03010 * that this is part of a vdev_reopen(). In this case, we don't 03011 * want to post the ereport if the device was already in the 03012 * CANT_OPEN state beforehand. 
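When a CANT_OPEN transition does warrant an ereport, vdev_set_state() maps the aux code to an FMA event class with a plain switch and falls back to a generic class for anything it does not recognize. A minimal sketch of the same mapping shape, using shortened strings in place of the FM_EREPORT_ZFS_* constants and invented aux values:

    #include <stdio.h>

    /* Illustrative aux codes; the real ones are vdev_aux_t values. */
    enum toy_aux {
            TOY_AUX_OPEN_FAILED,
            TOY_AUX_CORRUPT_DATA,
            TOY_AUX_TOO_SMALL,
            TOY_AUX_SOMETHING_ELSE
    };

    static const char *
    toy_ereport_class(int aux)
    {
            switch (aux) {
            case TOY_AUX_OPEN_FAILED:
                    return ("device.open_failed");
            case TOY_AUX_CORRUPT_DATA:
                    return ("device.corrupt_data");
            case TOY_AUX_TOO_SMALL:
                    return ("device.too_small");
            default:
                    return ("device.unknown");
            }
    }

    int
    main(void)
    {
            printf("%s\n", toy_ereport_class(TOY_AUX_CORRUPT_DATA));
            printf("%s\n", toy_ereport_class(TOY_AUX_SOMETHING_ELSE));
            return (0);
    }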
03013 * 03014 * If the 'checkremove' flag is set, then this is an attempt to 03015 * online the device in response to an insertion event. If we 03016 * hit this case, then we have detected an insertion event for a 03017 * faulted or offline device that wasn't in the removed state. 03018 * In this scenario, we don't post an ereport because we are 03019 * about to replace the device, or attempt an online with 03020 * vdev_forcefault, which will generate the fault for us. 03021 */ 03022 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 03023 !vd->vdev_not_present && !vd->vdev_checkremove && 03024 vd != spa->spa_root_vdev) { 03025 const char *class; 03026 03027 switch (aux) { 03028 case VDEV_AUX_OPEN_FAILED: 03029 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 03030 break; 03031 case VDEV_AUX_CORRUPT_DATA: 03032 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 03033 break; 03034 case VDEV_AUX_NO_REPLICAS: 03035 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 03036 break; 03037 case VDEV_AUX_BAD_GUID_SUM: 03038 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 03039 break; 03040 case VDEV_AUX_TOO_SMALL: 03041 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 03042 break; 03043 case VDEV_AUX_BAD_LABEL: 03044 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 03045 break; 03046 default: 03047 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 03048 } 03049 03050 zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 03051 } 03052 03053 /* Erase any notion of persistent removed state */ 03054 vd->vdev_removed = B_FALSE; 03055 } else { 03056 vd->vdev_removed = B_FALSE; 03057 } 03058 03059 if (!isopen && vd->vdev_parent) 03060 vdev_propagate_state(vd->vdev_parent); 03061 } 03062 03076 boolean_t 03077 vdev_is_bootable(vdev_t *vd) 03078 { 03079 #ifdef sun 03080 if (!vd->vdev_ops->vdev_op_leaf) { 03081 char *vdev_type = vd->vdev_ops->vdev_op_type; 03082 03083 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 03084 vd->vdev_children > 1) { 03085 return (B_FALSE); 03086 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 03087 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 03088 return (B_FALSE); 03089 } 03090 } else if (vd->vdev_wholedisk == 1) { 03091 return (B_FALSE); 03092 } 03093 03094 for (int c = 0; c < vd->vdev_children; c++) { 03095 if (!vdev_is_bootable(vd->vdev_child[c])) 03096 return (B_FALSE); 03097 } 03098 #endif /* sun */ 03099 return (B_TRUE); 03100 } 03101 03108 void 03109 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 03110 { 03111 spa_t *spa = nvd->vdev_spa; 03112 03113 ASSERT(nvd->vdev_top->vdev_islog); 03114 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 03115 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 03116 03117 for (int c = 0; c < nvd->vdev_children; c++) 03118 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 03119 03120 if (nvd->vdev_ops->vdev_op_leaf) { 03121 /* 03122 * Restore the persistent vdev state 03123 */ 03124 nvd->vdev_offline = ovd->vdev_offline; 03125 nvd->vdev_faulted = ovd->vdev_faulted; 03126 nvd->vdev_degraded = ovd->vdev_degraded; 03127 nvd->vdev_removed = ovd->vdev_removed; 03128 } 03129 } 03130 03136 boolean_t 03137 vdev_log_state_valid(vdev_t *vd) 03138 { 03139 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 03140 !vd->vdev_removed) 03141 return (B_TRUE); 03142 03143 for (int c = 0; c < vd->vdev_children; c++) 03144 if (vdev_log_state_valid(vd->vdev_child[c])) 03145 return (B_TRUE); 03146 03147 return (B_FALSE); 03148 } 03149 03153 void 03154 vdev_expand(vdev_t *vd, uint64_t txg) 03155 { 03156 ASSERT(vd->vdev_top == vd); 03157 
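vdev_log_state_valid() and vdev_is_bootable() above are both the standard recursive walk over vdev_child[]: a leaf answers for itself and an interior vdev derives its answer from its children. A self-contained version of the any-child-valid variant over a toy tree (the node layout is invented):

    #include <stdio.h>

    typedef struct toy_node {
            int                     leaf;
            int                     valid;  /* only meaningful for leaves */
            int                     nchildren;
            struct toy_node         **child;
    } toy_node_t;

    /* Any-leaf-valid walk, shaped like vdev_log_state_valid(). */
    static int
    toy_state_valid(const toy_node_t *n)
    {
            if (n->leaf)
                    return (n->valid);

            for (int c = 0; c < n->nchildren; c++)
                    if (toy_state_valid(n->child[c]))
                            return (1);

            return (0);
    }

    int
    main(void)
    {
            toy_node_t a = { 1, 0, 0, NULL };
            toy_node_t b = { 1, 1, 0, NULL };
            toy_node_t *kids[] = { &a, &b };
            toy_node_t top = { 0, 0, 2, kids };

            printf("log state valid: %d\n", toy_state_valid(&top));
            return (0);
    }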
ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 03158 03159 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 03160 VERIFY(vdev_metaslab_init(vd, txg) == 0); 03161 vdev_config_dirty(vd); 03162 } 03163 } 03164 03168 void 03169 vdev_split(vdev_t *vd) 03170 { 03171 vdev_t *cvd, *pvd = vd->vdev_parent; 03172 03173 vdev_remove_child(pvd, vd); 03174 vdev_compact_children(pvd); 03175 03176 cvd = pvd->vdev_child[0]; 03177 if (pvd->vdev_children == 1) { 03178 vdev_remove_parent(cvd); 03179 cvd->vdev_splitting = B_TRUE; 03180 } 03181 vdev_propagate_state(cvd); 03182 }
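vdev_expand() only initializes new metaslabs when the grown asize spans more metaslab-sized chunks than the vdev already has, i.e. when (asize >> ms_shift) > ms_count. A short arithmetic sketch of that check with made-up sizes:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Returns nonzero when a vdev that has grown to 'asize' bytes now
     * spans more metaslabs (each 1 << ms_shift bytes) than the
     * 'ms_count' it already has -- the condition under which
     * vdev_expand() calls vdev_metaslab_init() and dirties the config.
     */
    static int
    toy_needs_expand(uint64_t asize, uint64_t ms_shift, uint64_t ms_count)
    {
            return ((asize >> ms_shift) > ms_count);
    }

    int
    main(void)
    {
            uint64_t ms_shift = 30;         /* 1 GiB metaslabs (made up) */
            uint64_t old_count = 100;

            /* Grown from ~100 GiB to 200 GiB: 200 slots > 100 allocated. */
            printf("grow to 200 GiB: %d\n",
                toy_needs_expand(200ULL << 30, ms_shift, old_count));
            /* Grown by less than one metaslab: nothing new to initialize. */
            printf("grow to 100.5 GiB: %d\n",
                toy_needs_expand((100ULL << 30) + (1ULL << 29),
                ms_shift, old_count));
            return (0);
    }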