FreeBSD ZFS
The Zettabyte File System

vdev.c

00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 
00022 /*
00023  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
00024  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
00025  * Copyright (c) 2012 by Delphix. All rights reserved.
00026  */
00027 
00028 #include <sys/zfs_context.h>
00029 #include <sys/fm/fs/zfs.h>
00030 #include <sys/spa.h>
00031 #include <sys/spa_impl.h>
00032 #include <sys/dmu.h>
00033 #include <sys/dmu_tx.h>
00034 #include <sys/vdev_impl.h>
00035 #include <sys/uberblock_impl.h>
00036 #include <sys/metaslab.h>
00037 #include <sys/metaslab_impl.h>
00038 #include <sys/space_map.h>
00039 #include <sys/zio.h>
00040 #include <sys/zap.h>
00041 #include <sys/fs/zfs.h>
00042 #include <sys/arc.h>
00043 #include <sys/zil.h>
00044 #include <sys/dsl_scan.h>
00045 #include <sys/trim_map.h>
00046 
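00047 /*
00048  * Virtual device management.
00049  *
00050  * The routines in this file allocate, open, validate, and tear down the
00051  * vdev tree.  (Descriptive summary; not part of the upstream source.)
00052  */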
00052 SYSCTL_DECL(_vfs_zfs);
00053 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
00054 
00055 static vdev_ops_t *vdev_ops_table[] = {
00056         &vdev_root_ops,
00057         &vdev_raidz_ops,
00058         &vdev_mirror_ops,
00059         &vdev_replacing_ops,
00060         &vdev_spare_ops,
00061 #ifdef _KERNEL
00062         &vdev_geom_ops,
00063 #else
00064         &vdev_disk_ops,
00065 #endif
00066         &vdev_file_ops,
00067         &vdev_missing_ops,
00068         &vdev_hole_ops,
00069         NULL
00070 };
00071 
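00072 /*
00073  * Given a vdev type, return the appropriate ops vector.
00074  */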
00075 static vdev_ops_t *
00076 vdev_getops(const char *type)
00077 {
00078         vdev_ops_t *ops, **opspp;
00079 
00080         for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
00081                 if (strcmp(ops->vdev_op_type, type) == 0)
00082                         break;
00083 
00084         return (ops);
00085 }
00086 
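00087 /*
00088  * Default asize function: return the MAX of psize with the asize of
00089  * all children.  This is what's used by anything other than RAID-Z.
00090  */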
00093 uint64_t
00094 vdev_default_asize(vdev_t *vd, uint64_t psize)
00095 {
00096         uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
00097         uint64_t csize;
00098 
00099         for (int c = 0; c < vd->vdev_children; c++) {
00100                 csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
00101                 asize = MAX(asize, csize);
00102         }
00103 
00104         return (asize);
00105 }
00106 
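00107 /*
00108  * Get the minimum allocatable size.  We define the allocatable size as
00109  * the vdev's asize rounded to the nearest metaslab.  This allows us to
00110  * replace or attach devices which don't have the same physical size,
00111  * even if they are slightly larger or smaller.
00112  */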
00113 uint64_t
00114 vdev_get_min_asize(vdev_t *vd)
00115 {
00116         vdev_t *pvd = vd->vdev_parent;
00117 
00118         /*
00119          * If our parent is NULL (inactive spare or cache) or is the root,
00120          * just return our own asize.
00121          */
00122         if (pvd == NULL)
00123                 return (vd->vdev_asize);
00124 
00125         /*
00126          * The top-level vdev just returns the allocatable size rounded
00127          * to the nearest metaslab.
00128          */
00129         if (vd == vd->vdev_top)
00130                 return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
00131 
00132         /*
00133          * The allocatable space for a raidz vdev is N * sizeof(smallest child),
00134          * so each child must provide at least 1/Nth of its asize.
00135          */
00136         if (pvd->vdev_ops == &vdev_raidz_ops)
00137                 return (pvd->vdev_min_asize / pvd->vdev_children);
00138 
00139         return (pvd->vdev_min_asize);
00140 }
00141 
00142 void
00143 vdev_set_min_asize(vdev_t *vd)
00144 {
00145         vd->vdev_min_asize = vdev_get_min_asize(vd);
00146 
00147         for (int c = 0; c < vd->vdev_children; c++)
00148                 vdev_set_min_asize(vd->vdev_child[c]);
00149 }
00150 
00151 vdev_t *
00152 vdev_lookup_top(spa_t *spa, uint64_t vdev)
00153 {
00154         vdev_t *rvd = spa->spa_root_vdev;
00155 
00156         ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
00157 
00158         if (vdev < rvd->vdev_children) {
00159                 ASSERT(rvd->vdev_child[vdev] != NULL);
00160                 return (rvd->vdev_child[vdev]);
00161         }
00162 
00163         return (NULL);
00164 }
00165 
00166 vdev_t *
00167 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
00168 {
00169         vdev_t *mvd;
00170 
00171         if (vd->vdev_guid == guid)
00172                 return (vd);
00173 
00174         for (int c = 0; c < vd->vdev_children; c++)
00175                 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
00176                     NULL)
00177                         return (mvd);
00178 
00179         return (NULL);
00180 }
00181 
00182 void
00183 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
00184 {
00185         size_t oldsize, newsize;
00186         uint64_t id = cvd->vdev_id;
00187         vdev_t **newchild;
00188 
00189         ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
00190         ASSERT(cvd->vdev_parent == NULL);
00191 
00192         cvd->vdev_parent = pvd;
00193 
00194         if (pvd == NULL)
00195                 return;
00196 
00197         ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
00198 
00199         oldsize = pvd->vdev_children * sizeof (vdev_t *);
00200         pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
00201         newsize = pvd->vdev_children * sizeof (vdev_t *);
00202 
00203         newchild = kmem_zalloc(newsize, KM_SLEEP);
00204         if (pvd->vdev_child != NULL) {
00205                 bcopy(pvd->vdev_child, newchild, oldsize);
00206                 kmem_free(pvd->vdev_child, oldsize);
00207         }
00208 
00209         pvd->vdev_child = newchild;
00210         pvd->vdev_child[id] = cvd;
00211 
00212         cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
00213         ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
00214 
00215         /*
00216          * Walk up all ancestors to update guid sum.
00217          */
00218         for (; pvd != NULL; pvd = pvd->vdev_parent)
00219                 pvd->vdev_guid_sum += cvd->vdev_guid_sum;
00220 }
00221 
00222 void
00223 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
00224 {
00225         int c;
00226         uint_t id = cvd->vdev_id;
00227 
00228         ASSERT(cvd->vdev_parent == pvd);
00229 
00230         if (pvd == NULL)
00231                 return;
00232 
00233         ASSERT(id < pvd->vdev_children);
00234         ASSERT(pvd->vdev_child[id] == cvd);
00235 
00236         pvd->vdev_child[id] = NULL;
00237         cvd->vdev_parent = NULL;
00238 
00239         for (c = 0; c < pvd->vdev_children; c++)
00240                 if (pvd->vdev_child[c])
00241                         break;
00242 
00243         if (c == pvd->vdev_children) {
00244                 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
00245                 pvd->vdev_child = NULL;
00246                 pvd->vdev_children = 0;
00247         }
00248 
00249         /*
00250          * Walk up all ancestors to update guid sum.
00251          */
00252         for (; pvd != NULL; pvd = pvd->vdev_parent)
00253                 pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
00254 }
00255 
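00256 /*
00257  * Remove any holes in the child array.
00258  */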
00259 void
00260 vdev_compact_children(vdev_t *pvd)
00261 {
00262         vdev_t **newchild, *cvd;
00263         int oldc = pvd->vdev_children;
00264         int newc;
00265 
00266         ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
00267 
00268         for (int c = newc = 0; c < oldc; c++)
00269                 if (pvd->vdev_child[c])
00270                         newc++;
00271 
00272         newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
00273 
00274         for (int c = newc = 0; c < oldc; c++) {
00275                 if ((cvd = pvd->vdev_child[c]) != NULL) {
00276                         newchild[newc] = cvd;
00277                         cvd->vdev_id = newc++;
00278                 }
00279         }
00280 
00281         kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
00282         pvd->vdev_child = newchild;
00283         pvd->vdev_children = newc;
00284 }
00285 
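00286 /*
00287  * Allocate and minimally initialize a vdev_t.
00288  */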
00289 vdev_t *
00290 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
00291 {
00292         vdev_t *vd;
00293 
00294         vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
00295 
00296         if (spa->spa_root_vdev == NULL) {
00297                 ASSERT(ops == &vdev_root_ops);
00298                 spa->spa_root_vdev = vd;
00299                 spa->spa_load_guid = spa_generate_guid(NULL);
00300         }
00301 
00302         if (guid == 0 && ops != &vdev_hole_ops) {
00303                 if (spa->spa_root_vdev == vd) {
00304                         /*
00305                          * The root vdev's guid will also be the pool guid,
00306                          * which must be unique among all pools.
00307                          */
00308                         guid = spa_generate_guid(NULL);
00309                 } else {
00310                         /*
00311                          * Any other vdev's guid must be unique within the pool.
00312                          */
00313                         guid = spa_generate_guid(spa);
00314                 }
00315                 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
00316         }
00317 
00318         vd->vdev_spa = spa;
00319         vd->vdev_id = id;
00320         vd->vdev_guid = guid;
00321         vd->vdev_guid_sum = guid;
00322         vd->vdev_ops = ops;
00323         vd->vdev_state = VDEV_STATE_CLOSED;
00324         vd->vdev_ishole = (ops == &vdev_hole_ops);
00325 
00326         mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
00327         mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
00328         mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
00329         for (int t = 0; t < DTL_TYPES; t++) {
00330                 space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
00331                     &vd->vdev_dtl_lock);
00332         }
00333         txg_list_create(&vd->vdev_ms_list,
00334             offsetof(struct metaslab, ms_txg_node));
00335         txg_list_create(&vd->vdev_dtl_list,
00336             offsetof(struct vdev, vdev_dtl_node));
00337         vd->vdev_stat.vs_timestamp = gethrtime();
00338         vdev_queue_init(vd);
00339         vdev_cache_init(vd);
00340 
00341         return (vd);
00342 }
00343 
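00344 /*
00345  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
00346  * creating a new vdev or loading an existing one - the behavior is slightly
00347  * different for each case.
00348  */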
00349 int
00350 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
00351     int alloctype)
00352 {
00353         vdev_ops_t *ops;
00354         char *type;
00355         uint64_t guid = 0, islog, nparity;
00356         vdev_t *vd;
00357 
00358         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
00359 
00360         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
00361                 return (EINVAL);
00362 
00363         if ((ops = vdev_getops(type)) == NULL)
00364                 return (EINVAL);
00365 
00366         /*
00367          * If this is a load, get the vdev guid from the nvlist.
00368          * Otherwise, vdev_alloc_common() will generate one for us.
00369          */
00370         if (alloctype == VDEV_ALLOC_LOAD) {
00371                 uint64_t label_id;
00372 
00373                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
00374                     label_id != id)
00375                         return (EINVAL);
00376 
00377                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
00378                         return (EINVAL);
00379         } else if (alloctype == VDEV_ALLOC_SPARE) {
00380                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
00381                         return (EINVAL);
00382         } else if (alloctype == VDEV_ALLOC_L2CACHE) {
00383                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
00384                         return (EINVAL);
00385         } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
00386                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
00387                         return (EINVAL);
00388         }
00389 
00390         /*
00391          * The first allocated vdev must be of type 'root'.
00392          */
00393         if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
00394                 return (EINVAL);
00395 
00396         /*
00397          * Determine whether we're a log vdev.
00398          */
00399         islog = 0;
00400         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
00401         if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
00402                 return (ENOTSUP);
00403 
00404         if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
00405                 return (ENOTSUP);
00406 
00407         /*
00408          * Set the nparity property for RAID-Z vdevs.
00409          */
00410         nparity = -1ULL;
00411         if (ops == &vdev_raidz_ops) {
00412                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
00413                     &nparity) == 0) {
00414                         if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
00415                                 return (EINVAL);
00416                         /*
00417                          * Previous versions could only support 1 or 2 parity
00418                          * devices.
00419                          */
00420                         if (nparity > 1 &&
00421                             spa_version(spa) < SPA_VERSION_RAIDZ2)
00422                                 return (ENOTSUP);
00423                         if (nparity > 2 &&
00424                             spa_version(spa) < SPA_VERSION_RAIDZ3)
00425                                 return (ENOTSUP);
00426                 } else {
00427                         /*
00428                          * We require the parity to be specified for SPAs that
00429                          * support multiple parity levels.
00430                          */
00431                         if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
00432                                 return (EINVAL);
00433                         /*
00434                          * Otherwise, we default to 1 parity device for RAID-Z.
00435                          */
00436                         nparity = 1;
00437                 }
00438         } else {
00439                 nparity = 0;
00440         }
00441         ASSERT(nparity != -1ULL);
00442 
00443         vd = vdev_alloc_common(spa, id, guid, ops);
00444 
00445         vd->vdev_islog = islog;
00446         vd->vdev_nparity = nparity;
00447 
00448         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
00449                 vd->vdev_path = spa_strdup(vd->vdev_path);
00450         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
00451                 vd->vdev_devid = spa_strdup(vd->vdev_devid);
00452         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
00453             &vd->vdev_physpath) == 0)
00454                 vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
00455         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
00456                 vd->vdev_fru = spa_strdup(vd->vdev_fru);
00457 
00458         /*
00459          * Set the whole_disk property.  If it's not specified, leave the value
00460          * as -1.
00461          */
00462         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
00463             &vd->vdev_wholedisk) != 0)
00464                 vd->vdev_wholedisk = -1ULL;
00465 
00466         /*
00467          * Look for the 'not present' flag.  This will only be set if the device
00468          * was not present at the time of import.
00469          */
00470         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
00471             &vd->vdev_not_present);
00472 
00473         /*
00474          * Get the alignment requirement.
00475          */
00476         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
00477 
00478         /*
00479          * Retrieve the vdev creation time.
00480          */
00481         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
00482             &vd->vdev_crtxg);
00483 
00484         /*
00485          * If we're a top-level vdev, try to load the allocation parameters.
00486          */
00487         if (parent && !parent->vdev_parent &&
00488             (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
00489                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
00490                     &vd->vdev_ms_array);
00491                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
00492                     &vd->vdev_ms_shift);
00493                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
00494                     &vd->vdev_asize);
00495                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
00496                     &vd->vdev_removing);
00497         }
00498 
00499         if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
00500                 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
00501                     alloctype == VDEV_ALLOC_ADD ||
00502                     alloctype == VDEV_ALLOC_SPLIT ||
00503                     alloctype == VDEV_ALLOC_ROOTPOOL);
00504                 vd->vdev_mg = metaslab_group_create(islog ?
00505                     spa_log_class(spa) : spa_normal_class(spa), vd);
00506         }
00507 
00508         /*
00509          * If we're a leaf vdev, try to load the DTL object and other state.
00510          */
00511         if (vd->vdev_ops->vdev_op_leaf &&
00512             (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
00513             alloctype == VDEV_ALLOC_ROOTPOOL)) {
00514                 if (alloctype == VDEV_ALLOC_LOAD) {
00515                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
00516                             &vd->vdev_dtl_smo.smo_object);
00517                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
00518                             &vd->vdev_unspare);
00519                 }
00520 
00521                 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
00522                         uint64_t spare = 0;
00523 
00524                         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
00525                             &spare) == 0 && spare)
00526                                 spa_spare_add(vd);
00527                 }
00528 
00529                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
00530                     &vd->vdev_offline);
00531 
00532                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
00533                     &vd->vdev_resilvering);
00534 
00535                 /*
00536                  * When importing a pool, we want to ignore the persistent fault
00537                  * state, as the diagnosis made on another system may not be
00538                  * valid in the current context.  Local vdevs will
00539                  * remain in the faulted state.
00540                  */
00541                 if (spa_load_state(spa) == SPA_LOAD_OPEN) {
00542                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
00543                             &vd->vdev_faulted);
00544                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
00545                             &vd->vdev_degraded);
00546                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
00547                             &vd->vdev_removed);
00548 
00549                         if (vd->vdev_faulted || vd->vdev_degraded) {
00550                                 char *aux;
00551 
00552                                 vd->vdev_label_aux =
00553                                     VDEV_AUX_ERR_EXCEEDED;
00554                                 if (nvlist_lookup_string(nv,
00555                                     ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
00556                                     strcmp(aux, "external") == 0)
00557                                         vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
00558                         }
00559                 }
00560         }
00561 
00562         /*
00563          * Add ourselves to the parent's list of children.
00564          */
00565         vdev_add_child(parent, vd);
00566 
00567         *vdp = vd;
00568 
00569         return (0);
00570 }
00571 
00572 void
00573 vdev_free(vdev_t *vd)
00574 {
00575         spa_t *spa = vd->vdev_spa;
00576 
00577         /*
00578          * vdev_free() implies closing the vdev first.  This is simpler than
00579          * trying to ensure complicated semantics for all callers.
00580          */
00581         vdev_close(vd);
00582 
00583         ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
00584         ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
00585 
00586         /*
00587          * Free all children.
00588          */
00589         for (int c = 0; c < vd->vdev_children; c++)
00590                 vdev_free(vd->vdev_child[c]);
00591 
00592         ASSERT(vd->vdev_child == NULL);
00593         ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
00594 
00595         /*
00596          * Discard allocation state.
00597          */
00598         if (vd->vdev_mg != NULL) {
00599                 vdev_metaslab_fini(vd);
00600                 metaslab_group_destroy(vd->vdev_mg);
00601         }
00602 
00603         ASSERT0(vd->vdev_stat.vs_space);
00604         ASSERT0(vd->vdev_stat.vs_dspace);
00605         ASSERT0(vd->vdev_stat.vs_alloc);
00606 
00607         /*
00608          * Remove this vdev from its parent's child list.
00609          */
00610         vdev_remove_child(vd->vdev_parent, vd);
00611 
00612         ASSERT(vd->vdev_parent == NULL);
00613 
00614         /*
00615          * Clean up vdev structure.
00616          */
00617         vdev_queue_fini(vd);
00618         vdev_cache_fini(vd);
00619 
00620         if (vd->vdev_path)
00621                 spa_strfree(vd->vdev_path);
00622         if (vd->vdev_devid)
00623                 spa_strfree(vd->vdev_devid);
00624         if (vd->vdev_physpath)
00625                 spa_strfree(vd->vdev_physpath);
00626         if (vd->vdev_fru)
00627                 spa_strfree(vd->vdev_fru);
00628 
00629         if (vd->vdev_isspare)
00630                 spa_spare_remove(vd);
00631         if (vd->vdev_isl2cache)
00632                 spa_l2cache_remove(vd);
00633 
00634         txg_list_destroy(&vd->vdev_ms_list);
00635         txg_list_destroy(&vd->vdev_dtl_list);
00636 
00637         mutex_enter(&vd->vdev_dtl_lock);
00638         for (int t = 0; t < DTL_TYPES; t++) {
00639                 space_map_unload(&vd->vdev_dtl[t]);
00640                 space_map_destroy(&vd->vdev_dtl[t]);
00641         }
00642         mutex_exit(&vd->vdev_dtl_lock);
00643 
00644         mutex_destroy(&vd->vdev_dtl_lock);
00645         mutex_destroy(&vd->vdev_stat_lock);
00646         mutex_destroy(&vd->vdev_probe_lock);
00647 
00648         if (vd == spa->spa_root_vdev)
00649                 spa->spa_root_vdev = NULL;
00650 
00651         kmem_free(vd, sizeof (vdev_t));
00652 }
00653 
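00654 /*
00655  * Transfer top-level vdev state from svd to tvd.
00656  */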
00657 static void
00658 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
00659 {
00660         spa_t *spa = svd->vdev_spa;
00661         metaslab_t *msp;
00662         vdev_t *vd;
00663         int t;
00664 
00665         ASSERT(tvd == tvd->vdev_top);
00666 
00667         tvd->vdev_ms_array = svd->vdev_ms_array;
00668         tvd->vdev_ms_shift = svd->vdev_ms_shift;
00669         tvd->vdev_ms_count = svd->vdev_ms_count;
00670 
00671         svd->vdev_ms_array = 0;
00672         svd->vdev_ms_shift = 0;
00673         svd->vdev_ms_count = 0;
00674 
00675         if (tvd->vdev_mg)
00676                 ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
00677         tvd->vdev_mg = svd->vdev_mg;
00678         tvd->vdev_ms = svd->vdev_ms;
00679 
00680         svd->vdev_mg = NULL;
00681         svd->vdev_ms = NULL;
00682 
00683         if (tvd->vdev_mg != NULL)
00684                 tvd->vdev_mg->mg_vd = tvd;
00685 
00686         tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
00687         tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
00688         tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
00689 
00690         svd->vdev_stat.vs_alloc = 0;
00691         svd->vdev_stat.vs_space = 0;
00692         svd->vdev_stat.vs_dspace = 0;
00693 
00694         for (t = 0; t < TXG_SIZE; t++) {
00695                 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
00696                         (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
00697                 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
00698                         (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
00699                 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
00700                         (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
00701         }
00702 
00703         if (list_link_active(&svd->vdev_config_dirty_node)) {
00704                 vdev_config_clean(svd);
00705                 vdev_config_dirty(tvd);
00706         }
00707 
00708         if (list_link_active(&svd->vdev_state_dirty_node)) {
00709                 vdev_state_clean(svd);
00710                 vdev_state_dirty(tvd);
00711         }
00712 
00713         tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
00714         svd->vdev_deflate_ratio = 0;
00715 
00716         tvd->vdev_islog = svd->vdev_islog;
00717         svd->vdev_islog = 0;
00718 }
00719 
00720 static void
00721 vdev_top_update(vdev_t *tvd, vdev_t *vd)
00722 {
00723         if (vd == NULL)
00724                 return;
00725 
00726         vd->vdev_top = tvd;
00727 
00728         for (int c = 0; c < vd->vdev_children; c++)
00729                 vdev_top_update(tvd, vd->vdev_child[c]);
00730 }
00731 
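00732 /*
00733  * Add a mirror/replacing vdev above an existing vdev.
00734  */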
00735 vdev_t *
00736 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
00737 {
00738         spa_t *spa = cvd->vdev_spa;
00739         vdev_t *pvd = cvd->vdev_parent;
00740         vdev_t *mvd;
00741 
00742         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
00743 
00744         mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
00745 
00746         mvd->vdev_asize = cvd->vdev_asize;
00747         mvd->vdev_min_asize = cvd->vdev_min_asize;
00748         mvd->vdev_max_asize = cvd->vdev_max_asize;
00749         mvd->vdev_ashift = cvd->vdev_ashift;
00750         mvd->vdev_state = cvd->vdev_state;
00751         mvd->vdev_crtxg = cvd->vdev_crtxg;
00752 
00753         vdev_remove_child(pvd, cvd);
00754         vdev_add_child(pvd, mvd);
00755         cvd->vdev_id = mvd->vdev_children;
00756         vdev_add_child(mvd, cvd);
00757         vdev_top_update(cvd->vdev_top, cvd->vdev_top);
00758 
00759         if (mvd == mvd->vdev_top)
00760                 vdev_top_transfer(cvd, mvd);
00761 
00762         return (mvd);
00763 }
00764 
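00765 /*
00766  * Remove a 1-way mirror/replacing vdev from the tree.
00767  */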
00768 void
00769 vdev_remove_parent(vdev_t *cvd)
00770 {
00771         vdev_t *mvd = cvd->vdev_parent;
00772         vdev_t *pvd = mvd->vdev_parent;
00773 
00774         ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
00775 
00776         ASSERT(mvd->vdev_children == 1);
00777         ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
00778             mvd->vdev_ops == &vdev_replacing_ops ||
00779             mvd->vdev_ops == &vdev_spare_ops);
00780         cvd->vdev_ashift = mvd->vdev_ashift;
00781 
00782         vdev_remove_child(mvd, cvd);
00783         vdev_remove_child(pvd, mvd);
00784 
00785         /*
00786          * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
00787          * Otherwise, we could have detached an offline device, and when we
00788          * go to import the pool we'll think we have two top-level vdevs,
00789          * instead of a different version of the same top-level vdev.
00790          */
00791         if (mvd->vdev_top == mvd) {
00792                 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
00793                 cvd->vdev_orig_guid = cvd->vdev_guid;
00794                 cvd->vdev_guid += guid_delta;
00795                 cvd->vdev_guid_sum += guid_delta;
00796         }
00797         cvd->vdev_id = mvd->vdev_id;
00798         vdev_add_child(pvd, cvd);
00799         vdev_top_update(cvd->vdev_top, cvd->vdev_top);
00800 
00801         if (cvd == cvd->vdev_top)
00802                 vdev_top_transfer(mvd, cvd);
00803 
00804         ASSERT(mvd->vdev_children == 0);
00805         vdev_free(mvd);
00806 }
00807 
00808 int
00809 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
00810 {
00811         spa_t *spa = vd->vdev_spa;
00812         objset_t *mos = spa->spa_meta_objset;
00813         uint64_t m;
00814         uint64_t oldc = vd->vdev_ms_count;
00815         uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
00816         metaslab_t **mspp;
00817         int error;
00818 
00819         ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
00820 
00821         /*
00822          * This vdev is not being allocated from yet or is a hole.
00823          */
00824         if (vd->vdev_ms_shift == 0)
00825                 return (0);
00826 
00827         ASSERT(!vd->vdev_ishole);
00828 
00829         /*
00830          * Compute the raidz-deflation ratio.  Note, we hard-code
00831          * in 128k (1 << 17) because it is the current "typical" blocksize.
00832          * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
00833          * or we will inconsistently account for existing bp's.
00834          */
00835         vd->vdev_deflate_ratio = (1 << 17) /
00836             (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
00837 
00838         ASSERT(oldc <= newc);
00839 
00840         mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
00841 
00842         if (oldc != 0) {
00843                 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
00844                 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
00845         }
00846 
00847         vd->vdev_ms = mspp;
00848         vd->vdev_ms_count = newc;
00849 
00850         for (m = oldc; m < newc; m++) {
00851                 space_map_obj_t smo = { 0, 0, 0 };
00852                 if (txg == 0) {
00853                         uint64_t object = 0;
00854                         error = dmu_read(mos, vd->vdev_ms_array,
00855                             m * sizeof (uint64_t), sizeof (uint64_t), &object,
00856                             DMU_READ_PREFETCH);
00857                         if (error)
00858                                 return (error);
00859                         if (object != 0) {
00860                                 dmu_buf_t *db;
00861                                 error = dmu_bonus_hold(mos, object, FTAG, &db);
00862                                 if (error)
00863                                         return (error);
00864                                 ASSERT3U(db->db_size, >=, sizeof (smo));
00865                                 bcopy(db->db_data, &smo, sizeof (smo));
00866                                 ASSERT3U(smo.smo_object, ==, object);
00867                                 dmu_buf_rele(db, FTAG);
00868                         }
00869                 }
00870                 vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
00871                     m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
00872         }
00873 
00874         if (txg == 0)
00875                 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
00876 
00877         /*
00878          * If the vdev is being removed we don't activate
00879          * the metaslabs since we want to ensure that no new
00880          * allocations are performed on this device.
00881          */
00882         if (oldc == 0 && !vd->vdev_removing)
00883                 metaslab_group_activate(vd->vdev_mg);
00884 
00885         if (txg == 0)
00886                 spa_config_exit(spa, SCL_ALLOC, FTAG);
00887 
00888         return (0);
00889 }
00890 
00891 void
00892 vdev_metaslab_fini(vdev_t *vd)
00893 {
00894         uint64_t m;
00895         uint64_t count = vd->vdev_ms_count;
00896 
00897         if (vd->vdev_ms != NULL) {
00898                 metaslab_group_passivate(vd->vdev_mg);
00899                 for (m = 0; m < count; m++)
00900                         if (vd->vdev_ms[m] != NULL)
00901                                 metaslab_fini(vd->vdev_ms[m]);
00902                 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
00903                 vd->vdev_ms = NULL;
00904         }
00905 }
00906 
00907 typedef struct vdev_probe_stats {
00908         boolean_t       vps_readable;
00909         boolean_t       vps_writeable;
00910         int             vps_flags;
00911 } vdev_probe_stats_t;
00912 
00913 static void
00914 vdev_probe_done(zio_t *zio)
00915 {
00916         spa_t *spa = zio->io_spa;
00917         vdev_t *vd = zio->io_vd;
00918         vdev_probe_stats_t *vps = zio->io_private;
00919 
00920         ASSERT(vd->vdev_probe_zio != NULL);
00921 
00922         if (zio->io_type == ZIO_TYPE_READ) {
00923                 if (zio->io_error == 0)
00924                         vps->vps_readable = 1;
00925                 if (zio->io_error == 0 && spa_writeable(spa)) {
00926                         zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
00927                             zio->io_offset, zio->io_size, zio->io_data,
00928                             ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
00929                             ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
00930                 } else {
00931                         zio_buf_free(zio->io_data, zio->io_size);
00932                 }
00933         } else if (zio->io_type == ZIO_TYPE_WRITE) {
00934                 if (zio->io_error == 0)
00935                         vps->vps_writeable = 1;
00936                 zio_buf_free(zio->io_data, zio->io_size);
00937         } else if (zio->io_type == ZIO_TYPE_NULL) {
00938                 zio_t *pio;
00939 
00940                 vd->vdev_cant_read |= !vps->vps_readable;
00941                 vd->vdev_cant_write |= !vps->vps_writeable;
00942 
00943                 if (vdev_readable(vd) &&
00944                     (vdev_writeable(vd) || !spa_writeable(spa))) {
00945                         zio->io_error = 0;
00946                 } else {
00947                         ASSERT(zio->io_error != 0);
00948                         zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
00949                             spa, vd, NULL, 0, 0);
00950                         zio->io_error = ENXIO;
00951                 }
00952 
00953                 mutex_enter(&vd->vdev_probe_lock);
00954                 ASSERT(vd->vdev_probe_zio == zio);
00955                 vd->vdev_probe_zio = NULL;
00956                 mutex_exit(&vd->vdev_probe_lock);
00957 
00958                 while ((pio = zio_walk_parents(zio)) != NULL)
00959                         if (!vdev_accessible(vd, pio))
00960                                 pio->io_error = ENXIO;
00961 
00962                 kmem_free(vps, sizeof (*vps));
00963         }
00964 }
00965 
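00966 /*
00967  * Determine whether this device is accessible by reading and writing
00968  * to several known locations: the pad regions of each vdev label
00969  * but the first (which we leave alone in case it contains a VTOC).
00970  */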
00973 zio_t *
00974 vdev_probe(vdev_t *vd, zio_t *zio)
00975 {
00976         spa_t *spa = vd->vdev_spa;
00977         vdev_probe_stats_t *vps = NULL;
00978         zio_t *pio;
00979 
00980         ASSERT(vd->vdev_ops->vdev_op_leaf);
00981 
00982         /*
00983          * Don't probe the probe.
00984          */
00985         if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
00986                 return (NULL);
00987 
00988         /*
00989          * To prevent 'probe storms' when a device fails, we create
00990          * just one probe i/o at a time.  All zios that want to probe
00991          * this vdev will become parents of the probe io.
00992          */
00993         mutex_enter(&vd->vdev_probe_lock);
00994 
00995         if ((pio = vd->vdev_probe_zio) == NULL) {
00996                 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
00997 
00998                 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
00999                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
01000                     ZIO_FLAG_TRYHARD;
01001 
01002                 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
01003                         /*
01004                          * vdev_cant_read and vdev_cant_write can only
01005                          * transition from TRUE to FALSE when we have the
01006                          * SCL_ZIO lock as writer; otherwise they can only
01007                          * transition from FALSE to TRUE.  This ensures that
01008                          * any zio looking at these values can assume that
01009                          * failures persist for the life of the I/O.  That's
01010                          * important because when a device has intermittent
01011                          * connectivity problems, we want to ensure that
01012                          * they're ascribed to the device (ENXIO) and not
01013                          * the zio (EIO).
01014                          *
01015                          * Since we hold SCL_ZIO as writer here, clear both
01016                          * values so the probe can reevaluate from first
01017                          * principles.
01018                          */
01019                         vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
01020                         vd->vdev_cant_read = B_FALSE;
01021                         vd->vdev_cant_write = B_FALSE;
01022                 }
01023 
01024                 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
01025                     vdev_probe_done, vps,
01026                     vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
01027 
01028                 /*
01029                  * We can't change the vdev state in this context, so we
01030                  * kick off an async task to do it on our behalf.
01031                  */
01032                 if (zio != NULL) {
01033                         vd->vdev_probe_wanted = B_TRUE;
01034                         spa_async_request(spa, SPA_ASYNC_PROBE);
01035                 }
01036         }
01037 
01038         if (zio != NULL)
01039                 zio_add_child(zio, pio);
01040 
01041         mutex_exit(&vd->vdev_probe_lock);
01042 
01043         if (vps == NULL) {
01044                 ASSERT(zio != NULL);
01045                 return (NULL);
01046         }
01047 
01048         for (int l = 1; l < VDEV_LABELS; l++) {
01049                 zio_nowait(zio_read_phys(pio, vd,
01050                     vdev_label_offset(vd->vdev_psize, l,
01051                     offsetof(vdev_label_t, vl_pad2)),
01052                     VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
01053                     ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
01054                     ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
01055         }
01056 
01057         if (zio == NULL)
01058                 return (pio);
01059 
01060         zio_nowait(pio);
01061         return (NULL);
01062 }
01063 
01064 static void
01065 vdev_open_child(void *arg)
01066 {
01067         vdev_t *vd = arg;
01068 
01069         vd->vdev_open_thread = curthread;
01070         vd->vdev_open_error = vdev_open(vd);
01071         vd->vdev_open_thread = NULL;
01072 }
01073 
01074 boolean_t
01075 vdev_uses_zvols(vdev_t *vd)
01076 {
01077         if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
01078             strlen(ZVOL_DIR)) == 0)
01079                 return (B_TRUE);
01080         for (int c = 0; c < vd->vdev_children; c++)
01081                 if (vdev_uses_zvols(vd->vdev_child[c]))
01082                         return (B_TRUE);
01083         return (B_FALSE);
01084 }
01085 
01086 void
01087 vdev_open_children(vdev_t *vd)
01088 {
01089         taskq_t *tq;
01090         int children = vd->vdev_children;
01091 
01092         /*
01093          * In order to handle pools on top of zvols, do the opens in a
01094          * single thread so that the same thread holds the
01095          * spa_namespace_lock.
01096          */
01097         if (B_TRUE || vdev_uses_zvols(vd)) {
01098                 for (int c = 0; c < children; c++)
01099                         vd->vdev_child[c]->vdev_open_error =
01100                             vdev_open(vd->vdev_child[c]);
01101                 return;
01102         }
01103         tq = taskq_create("vdev_open", children, minclsyspri,
01104             children, children, TASKQ_PREPOPULATE);
01105 
01106         for (int c = 0; c < children; c++)
01107                 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
01108                     TQ_SLEEP) != 0);
01109 
01110         taskq_destroy(tq);
01111 }
01112 
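01113 /*
01114  * Prepare a virtual device for access.
01115  */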
01116 int
01117 vdev_open(vdev_t *vd)
01118 {
01119         spa_t *spa = vd->vdev_spa;
01120         int error;
01121         uint64_t osize = 0;
01122         uint64_t max_osize = 0;
01123         uint64_t asize, max_asize, psize;
01124         uint64_t ashift = 0;
01125 
01126         ASSERT(vd->vdev_open_thread == curthread ||
01127             spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
01128         ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
01129             vd->vdev_state == VDEV_STATE_CANT_OPEN ||
01130             vd->vdev_state == VDEV_STATE_OFFLINE);
01131 
01132         vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
01133         vd->vdev_cant_read = B_FALSE;
01134         vd->vdev_cant_write = B_FALSE;
01135         vd->vdev_min_asize = vdev_get_min_asize(vd);
01136 
01137         /*
01138          * If this vdev is not removed, check its fault status.  If it's
01139          * faulted, bail out of the open.
01140          */
01141         if (!vd->vdev_removed && vd->vdev_faulted) {
01142                 ASSERT(vd->vdev_children == 0);
01143                 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
01144                     vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
01145                 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
01146                     vd->vdev_label_aux);
01147                 return (ENXIO);
01148         } else if (vd->vdev_offline) {
01149                 ASSERT(vd->vdev_children == 0);
01150                 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
01151                 return (ENXIO);
01152         }
01153 
01154         error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
01155 
01156         /*
01157          * Reset the vdev_reopening flag so that we actually close
01158          * the vdev on error.
01159          */
01160         vd->vdev_reopening = B_FALSE;
01161         if (zio_injection_enabled && error == 0)
01162                 error = zio_handle_device_injection(vd, NULL, ENXIO);
01163 
01164         if (error) {
01165                 if (vd->vdev_removed &&
01166                     vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
01167                         vd->vdev_removed = B_FALSE;
01168 
01169                 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
01170                     vd->vdev_stat.vs_aux);
01171                 return (error);
01172         }
01173 
01174         vd->vdev_removed = B_FALSE;
01175 
01176         /*
01177          * Recheck the faulted flag now that we have confirmed that
01178          * the vdev is accessible.  If we're faulted, bail.
01179          */
01180         if (vd->vdev_faulted) {
01181                 ASSERT(vd->vdev_children == 0);
01182                 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
01183                     vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
01184                 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
01185                     vd->vdev_label_aux);
01186                 return (ENXIO);
01187         }
01188 
01189         if (vd->vdev_degraded) {
01190                 ASSERT(vd->vdev_children == 0);
01191                 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
01192                     VDEV_AUX_ERR_EXCEEDED);
01193         } else {
01194                 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
01195         }
01196 
01197         /*
01198          * For hole or missing vdevs we just return success.
01199          */
01200         if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
01201                 return (0);
01202 
01203         if (vd->vdev_ops->vdev_op_leaf) {
01204                 vd->vdev_notrim = B_FALSE;
01205                 trim_map_create(vd);
01206         }
01207 
01208         for (int c = 0; c < vd->vdev_children; c++) {
01209                 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
01210                         vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
01211                             VDEV_AUX_NONE);
01212                         break;
01213                 }
01214         }
01215 
01216         osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
01217         max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
01218 
01219         if (vd->vdev_children == 0) {
01220                 if (osize < SPA_MINDEVSIZE) {
01221                         vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
01222                             VDEV_AUX_TOO_SMALL);
01223                         return (EOVERFLOW);
01224                 }
01225                 psize = osize;
01226                 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
01227                 max_asize = max_osize - (VDEV_LABEL_START_SIZE +
01228                     VDEV_LABEL_END_SIZE);
01229         } else {
01230                 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
01231                     (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
01232                         vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
01233                             VDEV_AUX_TOO_SMALL);
01234                         return (EOVERFLOW);
01235                 }
01236                 psize = 0;
01237                 asize = osize;
01238                 max_asize = max_osize;
01239         }
01240 
01241         vd->vdev_psize = psize;
01242 
01243         /*
01244          * Make sure the allocatable size hasn't shrunk.
01245          */
01246         if (asize < vd->vdev_min_asize) {
01247                 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
01248                     VDEV_AUX_BAD_LABEL);
01249                 return (EINVAL);
01250         }
01251 
01252         if (vd->vdev_asize == 0) {
01253                 /*
01254                  * This is the first-ever open, so use the computed values.
01255                  * For testing purposes, a higher ashift can be requested.
01256                  */
01257                 vd->vdev_asize = asize;
01258                 vd->vdev_max_asize = max_asize;
01259                 vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
01260         } else {
01261                 /*
01262                  * Make sure the alignment requirement hasn't increased.
01263                  */
01264                 if (ashift > vd->vdev_top->vdev_ashift) {
01265                         vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
01266                             VDEV_AUX_BAD_LABEL);
01267                         return (EINVAL);
01268                 }
01269                 vd->vdev_max_asize = max_asize;
01270         }
01271 
01272         /*
01273          * If all children are healthy and the asize has increased,
01274          * then we've experienced dynamic LUN growth.  If automatic
01275          * expansion is enabled then use the additional space.
01276          */
01277         if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
01278             (vd->vdev_expanding || spa->spa_autoexpand))
01279                 vd->vdev_asize = asize;
01280 
01281         vdev_set_min_asize(vd);
01282 
01283         /*
01284          * Ensure we can issue some IO before declaring the
01285          * vdev open for business.
01286          */
01287         if (vd->vdev_ops->vdev_op_leaf &&
01288             (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
01289                 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
01290                     VDEV_AUX_ERR_EXCEEDED);
01291                 return (error);
01292         }
01293 
01294         /*
01295          * If a leaf vdev has a DTL, and seems healthy, then kick off a
01296          * resilver.  But don't do this if we are doing a reopen for a scrub,
01297          * since this would just restart the scrub we are already doing.
01298          */
01299         if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
01300             vdev_resilver_needed(vd, NULL, NULL))
01301                 spa_async_request(spa, SPA_ASYNC_RESILVER);
01302 
01303         return (0);
01304 }
01305 
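01306 /*
01307  * Called once the vdevs are all opened, this routine validates the label
01308  * contents.  This needs to be done before vdev_load() so that we don't
01309  * inadvertently do repair I/Os to the wrong device.
01310  *
01311  * If 'strict' is false ignore the spa guid check.  This is necessary because
01312  * if the machine crashed during a re-guid the new guid might have been
01313  * written to all of the vdev labels, but not the cached config.  The strict
01314  * check will make sure we pick up the newer guid.
01315  *
01316  * This function will only return failure if one of the vdevs indicates that
01317  * it has since been destroyed or exported.  This is only possible if
01318  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
01319  * will be updated but the function will return 0.
01320  */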
01323 int
01324 vdev_validate(vdev_t *vd, boolean_t strict)
01325 {
01326         spa_t *spa = vd->vdev_spa;
01327         nvlist_t *label;
01328         uint64_t guid = 0, top_guid;
01329         uint64_t state;
01330 
01331         for (int c = 0; c < vd->vdev_children; c++)
01332                 if (vdev_validate(vd->vdev_child[c], strict) != 0)
01333                         return (EBADF);
01334 
01335         /*
01336          * If the device has already failed, or was marked offline, don't do
01337          * any further validation.  Otherwise, label I/O will fail and we will
01338          * overwrite the previous state.
01339          */
01340         if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
01341                 uint64_t aux_guid = 0;
01342                 nvlist_t *nvl;
01343                 uint64_t txg = strict ? spa->spa_config_txg : -1ULL;
01344 
01345                 if ((label = vdev_label_read_config(vd, txg)) == NULL) {
01346                         vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
01347                             VDEV_AUX_BAD_LABEL);
01348                         return (0);
01349                 }
01350 
01351                 /*
01352                  * Determine if this vdev has been split off into another
01353                  * pool.  If so, then refuse to open it.
01354                  */
01355                 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
01356                     &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
01357                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
01358                             VDEV_AUX_SPLIT_POOL);
01359                         nvlist_free(label);
01360                         return (0);
01361                 }
01362 
01363                 if (strict && (nvlist_lookup_uint64(label,
01364                     ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
01365                     guid != spa_guid(spa))) {
01366                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
01367                             VDEV_AUX_CORRUPT_DATA);
01368                         nvlist_free(label);
01369                         return (0);
01370                 }
01371 
01372                 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
01373                     != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
01374                     &aux_guid) != 0)
01375                         aux_guid = 0;
01376 
01377                 /*
01378                  * If this vdev just became a top-level vdev because its
01379                  * sibling was detached, it will have adopted the parent's
01380                  * vdev guid -- but the label may or may not be on disk yet.
01381                  * Fortunately, either version of the label will have the
01382                  * same top guid, so if we're a top-level vdev, we can
01383                  * safely compare to that instead.
01384                  *
01385                  * If we split this vdev off instead, then we also check the
01386                  * original pool's guid.  We don't want to consider the vdev
01387                  * corrupt if it is partway through a split operation.
01388                  */
01389                 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
01390                     &guid) != 0 ||
01391                     nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
01392                     &top_guid) != 0 ||
01393                     ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
01394                     (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
01395                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
01396                             VDEV_AUX_CORRUPT_DATA);
01397                         nvlist_free(label);
01398                         return (0);
01399                 }
01400 
01401                 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
01402                     &state) != 0) {
01403                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
01404                             VDEV_AUX_CORRUPT_DATA);
01405                         nvlist_free(label);
01406                         return (0);
01407                 }
01408 
01409                 nvlist_free(label);
01410 
01411                 /*
01412                  * If this is a verbatim import, no need to check the
01413                  * state of the pool.
01414                  */
01415                 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
01416                     spa_load_state(spa) == SPA_LOAD_OPEN &&
01417                     state != POOL_STATE_ACTIVE)
01418                         return (EBADF);
01419 
01420                 /*
01421                  * If we were able to open and validate a vdev that was
01422                  * previously marked permanently unavailable, clear that state
01423                  * now.
01424                  */
01425                 if (vd->vdev_not_present)
01426                         vd->vdev_not_present = 0;
01427         }
01428 
01429         return (0);
01430 }
01431 
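01432 /*
01433  * Close a virtual device.
01434  */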
01435 void
01436 vdev_close(vdev_t *vd)
01437 {
01438         spa_t *spa = vd->vdev_spa;
01439         vdev_t *pvd = vd->vdev_parent;
01440 
01441         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
01442 
01443         /*
01444          * If our parent is reopening, then we are as well, unless we are
01445          * going offline.
01446          */
01447         if (pvd != NULL && pvd->vdev_reopening)
01448                 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
01449 
01450         vd->vdev_ops->vdev_op_close(vd);
01451 
01452         vdev_cache_purge(vd);
01453 
01454         if (vd->vdev_ops->vdev_op_leaf)
01455                 trim_map_destroy(vd);
01456 
01457         /*
01458          * We record the previous state before we close it, so that if we are
01459          * doing a reopen(), we don't generate FMA ereports if we notice that
01460          * it's still faulted.
01461          */
01462         vd->vdev_prevstate = vd->vdev_state;
01463 
01464         if (vd->vdev_offline)
01465                 vd->vdev_state = VDEV_STATE_OFFLINE;
01466         else
01467                 vd->vdev_state = VDEV_STATE_CLOSED;
01468         vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
01469 }
01470 
01471 void
01472 vdev_hold(vdev_t *vd)
01473 {
01474         spa_t *spa = vd->vdev_spa;
01475 
01476         ASSERT(spa_is_root(spa));
01477         if (spa->spa_state == POOL_STATE_UNINITIALIZED)
01478                 return;
01479 
01480         for (int c = 0; c < vd->vdev_children; c++)
01481                 vdev_hold(vd->vdev_child[c]);
01482 
01483         if (vd->vdev_ops->vdev_op_leaf)
01484                 vd->vdev_ops->vdev_op_hold(vd);
01485 }
01486 
01487 void
01488 vdev_rele(vdev_t *vd)
01489 {
01490         spa_t *spa = vd->vdev_spa;
01491 
01492         ASSERT(spa_is_root(spa));
01493         for (int c = 0; c < vd->vdev_children; c++)
01494                 vdev_rele(vd->vdev_child[c]);
01495 
01496         if (vd->vdev_ops->vdev_op_leaf)
01497                 vd->vdev_ops->vdev_op_rele(vd);
01498 }
01499 
01506 void
01507 vdev_reopen(vdev_t *vd)
01508 {
01509         spa_t *spa = vd->vdev_spa;
01510 
01511         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
01512 
01513         /* set the reopening flag unless we're taking the vdev offline */
01514         vd->vdev_reopening = !vd->vdev_offline;
01515         vdev_close(vd);
01516         (void) vdev_open(vd);
01517 
01518         /*
01519          * Call vdev_validate() here to make sure we have the same device.
01520          * Otherwise, a device with an invalid label could be successfully
01521          * opened in response to vdev_reopen().
01522          */
01523         if (vd->vdev_aux) {
01524                 (void) vdev_validate_aux(vd);
01525                 if (vdev_readable(vd) && vdev_writeable(vd) &&
01526                     vd->vdev_aux == &spa->spa_l2cache &&
01527                     !l2arc_vdev_present(vd))
01528                         l2arc_add_vdev(spa, vd);
01529         } else {
01530                 (void) vdev_validate(vd, spa_last_synced_txg(spa));
01531         }
01532 
01533         /*
01534          * Reassess parent vdev's health.
01535          */
01536         vdev_propagate_state(vd);
01537 }
01538 
01539 int
01540 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
01541 {
01542         int error;
01543 
01544         /*
01545          * Normally, partial opens (e.g. of a mirror) are allowed.
01546          * For a create, however, we want to fail the request if
01547          * there are any components we can't open.
01548          */
01549         error = vdev_open(vd);
01550 
01551         if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
01552                 vdev_close(vd);
01553                 return (error ? error : ENXIO);
01554         }
01555 
01556         /*
01557          * Recursively initialize all labels.
01558          */
01559         if ((error = vdev_label_init(vd, txg, isreplacing ?
01560             VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
01561                 vdev_close(vd);
01562                 return (error);
01563         }
01564 
01565         return (0);
01566 }
01567 
01568 void
01569 vdev_metaslab_set_size(vdev_t *vd)
01570 {
01571         /*
01572          * Aim for roughly 200 metaslabs per vdev.
01573          */
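        /*
         * For example, a 2 TB vdev divided by 200 gives roughly 10 GB,
         * whose highbit() is 34, so each metaslab spans 2^34 bytes (16 GB)
         * and the vdev gets about 128 metaslabs; the MAX() below only
         * matters for very small vdevs, keeping a metaslab from being
         * smaller than a single maximum-sized block.
         */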
01574         vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
01575         vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
01576 }
01577 
01578 void
01579 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
01580 {
01581         ASSERT(vd == vd->vdev_top);
01582         ASSERT(!vd->vdev_ishole);
01583         ASSERT(ISP2(flags));
01584         ASSERT(spa_writeable(vd->vdev_spa));
01585 
01586         if (flags & VDD_METASLAB)
01587                 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
01588 
01589         if (flags & VDD_DTL)
01590                 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
01591 
01592         (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
01593 }
01594 
01595 /*
01596  * DTLs.
01597  *
01598  * A vdev's DTL (dirty time log) is the set of transaction groups for which
01599  * the vdev has less than perfect replication.  There are four kinds of DTL:
01600  *
01601  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
01602  *
01603  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
01604  *
01605  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
01606  *      scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
01607  *      txgs that was scrubbed.
01608  *
01609  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
01610  *      persistent errors or just some device being offline.
01611  *      Unlike the other three, the DTL_OUTAGE map is not generally
01612  *      maintained; it's only computed when needed, typically to
01613  *      determine whether a device can be detached.
01614  *
01615  * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
01616  * either has the data or it doesn't.
01617  *
01618  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
01619  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
01620  * if any child is less than fully replicated, then so is its parent.
01621  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
01622  * comprising only those txgs which appear in more than 'maxfaults' children;
01623  * those are the txgs we don't have enough replication to read.  For example,
01624  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
01625  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
01626  * two child DTL_MISSING maps.
01627  *
01628  * It should be clear from the above that to compute the DTLs and outage maps
01629  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
01630  * Therefore, that is all we keep on disk.  When loading the pool, or after
01631  * a configuration change, we generate all other DTLs from first principles.
01632  */
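/*
 * Concretely, vdev_dtl_reassess() below expresses these rules as a
 * per-DTL-type reference-count threshold (minref): DTL_PARTIAL keeps any
 * txg that appears in at least one child, while DTL_MISSING requires the
 * txg to appear in the outage maps of vdev_nparity + 1 children for RAID-Z
 * (three for raidz2) or of every child for mirror-like vdevs.
 */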
01633 void
01634 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
01635 {
01636         space_map_t *sm = &vd->vdev_dtl[t];
01637 
01638         ASSERT(t < DTL_TYPES);
01639         ASSERT(vd != vd->vdev_spa->spa_root_vdev);
01640         ASSERT(spa_writeable(vd->vdev_spa));
01641 
01642         mutex_enter(sm->sm_lock);
01643         if (!space_map_contains(sm, txg, size))
01644                 space_map_add(sm, txg, size);
01645         mutex_exit(sm->sm_lock);
01646 }
01647 
01648 boolean_t
01649 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
01650 {
01651         space_map_t *sm = &vd->vdev_dtl[t];
01652         boolean_t dirty = B_FALSE;
01653 
01654         ASSERT(t < DTL_TYPES);
01655         ASSERT(vd != vd->vdev_spa->spa_root_vdev);
01656 
01657         mutex_enter(sm->sm_lock);
01658         if (sm->sm_space != 0)
01659                 dirty = space_map_contains(sm, txg, size);
01660         mutex_exit(sm->sm_lock);
01661 
01662         return (dirty);
01663 }
01664 
01665 boolean_t
01666 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
01667 {
01668         space_map_t *sm = &vd->vdev_dtl[t];
01669         boolean_t empty;
01670 
01671         mutex_enter(sm->sm_lock);
01672         empty = (sm->sm_space == 0);
01673         mutex_exit(sm->sm_lock);
01674 
01675         return (empty);
01676 }
01677 
01681 void
01682 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
01683 {
01684         spa_t *spa = vd->vdev_spa;
01685         avl_tree_t reftree;
01686         int minref;
01687 
01688         ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
01689 
01690         for (int c = 0; c < vd->vdev_children; c++)
01691                 vdev_dtl_reassess(vd->vdev_child[c], txg,
01692                     scrub_txg, scrub_done);
01693 
01694         if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
01695                 return;
01696 
01697         if (vd->vdev_ops->vdev_op_leaf) {
01698                 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
01699 
01700                 mutex_enter(&vd->vdev_dtl_lock);
01701                 if (scrub_txg != 0 &&
01702                     (spa->spa_scrub_started ||
01703                     (scn && scn->scn_phys.scn_errors == 0))) {
01704                         /*
01705                          * We completed a scrub up to scrub_txg.  If we
01706                          * did it without rebooting, then the scrub dtl
01707                          * will be valid, so excise the old region and
01708                          * fold in the scrub dtl.  Otherwise, leave the
01709                          * dtl as-is if there was an error.
01710                          *
01711                          * There's a little trick here: to excise the beginning
01712                          * of the DTL_MISSING map, we put it into a reference
01713                          * tree and then add a segment with refcnt -1 that
01714                          * covers the range [0, scrub_txg).  This means
01715                          * that each txg in that range has refcnt -1 or 0.
01716                          * We then add DTL_SCRUB with a refcnt of 2, so that
01717                          * entries in the range [0, scrub_txg) will have a
01718                          * positive refcnt -- either 1 or 2.  We then convert
01719                          * the reference tree into the new DTL_MISSING map.
01720                          */
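                        /*
                         * Working through the refcounts: a txg below
                         * scrub_txg that was missing but has been repaired
                         * ends up at 1 - 1 = 0 and is excised; one the
                         * scrub could not repair ends up at 1 - 1 + 2 = 2
                         * and is kept; a txg at or above scrub_txg keeps
                         * its refcnt of 1 and survives unchanged.
                         */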
01721                         space_map_ref_create(&reftree);
01722                         space_map_ref_add_map(&reftree,
01723                             &vd->vdev_dtl[DTL_MISSING], 1);
01724                         space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
01725                         space_map_ref_add_map(&reftree,
01726                             &vd->vdev_dtl[DTL_SCRUB], 2);
01727                         space_map_ref_generate_map(&reftree,
01728                             &vd->vdev_dtl[DTL_MISSING], 1);
01729                         space_map_ref_destroy(&reftree);
01730                 }
01731                 space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
01732                 space_map_walk(&vd->vdev_dtl[DTL_MISSING],
01733                     space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
01734                 if (scrub_done)
01735                         space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
01736                 space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
01737                 if (!vdev_readable(vd))
01738                         space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
01739                 else
01740                         space_map_walk(&vd->vdev_dtl[DTL_MISSING],
01741                             space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
01742                 mutex_exit(&vd->vdev_dtl_lock);
01743 
01744                 if (txg != 0)
01745                         vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
01746                 return;
01747         }
01748 
01749         mutex_enter(&vd->vdev_dtl_lock);
01750         for (int t = 0; t < DTL_TYPES; t++) {
01751                 /* account for child's outage in parent's missing map */
01752                 int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
01753                 if (t == DTL_SCRUB)
01754                         continue;                       /* leaf vdevs only */
01755                 if (t == DTL_PARTIAL)
01756                         minref = 1;                     /* i.e. non-zero */
01757                 else if (vd->vdev_nparity != 0)
01758                         minref = vd->vdev_nparity + 1;  /* RAID-Z */
01759                 else
01760                         minref = vd->vdev_children;     /* any kind of mirror */
01761                 space_map_ref_create(&reftree);
01762                 for (int c = 0; c < vd->vdev_children; c++) {
01763                         vdev_t *cvd = vd->vdev_child[c];
01764                         mutex_enter(&cvd->vdev_dtl_lock);
01765                         space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
01766                         mutex_exit(&cvd->vdev_dtl_lock);
01767                 }
01768                 space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
01769                 space_map_ref_destroy(&reftree);
01770         }
01771         mutex_exit(&vd->vdev_dtl_lock);
01772 }
01773 
01774 static int
01775 vdev_dtl_load(vdev_t *vd)
01776 {
01777         spa_t *spa = vd->vdev_spa;
01778         space_map_obj_t *smo = &vd->vdev_dtl_smo;
01779         objset_t *mos = spa->spa_meta_objset;
01780         dmu_buf_t *db;
01781         int error;
01782 
01783         ASSERT(vd->vdev_children == 0);
01784 
01785         if (smo->smo_object == 0)
01786                 return (0);
01787 
01788         ASSERT(!vd->vdev_ishole);
01789 
01790         if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
01791                 return (error);
01792 
01793         ASSERT3U(db->db_size, >=, sizeof (*smo));
01794         bcopy(db->db_data, smo, sizeof (*smo));
01795         dmu_buf_rele(db, FTAG);
01796 
01797         mutex_enter(&vd->vdev_dtl_lock);
01798         error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
01799             NULL, SM_ALLOC, smo, mos);
01800         mutex_exit(&vd->vdev_dtl_lock);
01801 
01802         return (error);
01803 }
01804 
01805 void
01806 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
01807 {
01808         spa_t *spa = vd->vdev_spa;
01809         space_map_obj_t *smo = &vd->vdev_dtl_smo;
01810         space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
01811         objset_t *mos = spa->spa_meta_objset;
01812         space_map_t smsync;
01813         kmutex_t smlock;
01814         dmu_buf_t *db;
01815         dmu_tx_t *tx;
01816 
01817         ASSERT(!vd->vdev_ishole);
01818 
01819         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
01820 
01821         if (vd->vdev_detached) {
01822                 if (smo->smo_object != 0) {
01823                         int err = dmu_object_free(mos, smo->smo_object, tx);
01824                         ASSERT0(err);
01825                         smo->smo_object = 0;
01826                 }
01827                 dmu_tx_commit(tx);
01828                 return;
01829         }
01830 
01831         if (smo->smo_object == 0) {
01832                 ASSERT(smo->smo_objsize == 0);
01833                 ASSERT(smo->smo_alloc == 0);
01834                 smo->smo_object = dmu_object_alloc(mos,
01835                     DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
01836                     DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
01837                 ASSERT(smo->smo_object != 0);
01838                 vdev_config_dirty(vd->vdev_top);
01839         }
01840 
01841         mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
01842 
01843         space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
01844             &smlock);
01845 
01846         mutex_enter(&smlock);
01847 
01848         mutex_enter(&vd->vdev_dtl_lock);
01849         space_map_walk(sm, space_map_add, &smsync);
01850         mutex_exit(&vd->vdev_dtl_lock);
01851 
01852         space_map_truncate(smo, mos, tx);
01853         space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
01854 
01855         space_map_destroy(&smsync);
01856 
01857         mutex_exit(&smlock);
01858         mutex_destroy(&smlock);
01859 
01860         VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
01861         dmu_buf_will_dirty(db, tx);
01862         ASSERT3U(db->db_size, >=, sizeof (*smo));
01863         bcopy(smo, db->db_data, sizeof (*smo));
01864         dmu_buf_rele(db, FTAG);
01865 
01866         dmu_tx_commit(tx);
01867 }
01868 
01873 boolean_t
01874 vdev_dtl_required(vdev_t *vd)
01875 {
01876         spa_t *spa = vd->vdev_spa;
01877         vdev_t *tvd = vd->vdev_top;
01878         uint8_t cant_read = vd->vdev_cant_read;
01879         boolean_t required;
01880 
01881         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
01882 
01883         if (vd == spa->spa_root_vdev || vd == tvd)
01884                 return (B_TRUE);
01885 
01886         /*
01887          * Temporarily mark the device as unreadable, and then determine
01888          * whether this results in any DTL outages in the top-level vdev.
01889          * If not, we can safely offline/detach/remove the device.
01890          */
01891         vd->vdev_cant_read = B_TRUE;
01892         vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
01893         required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
01894         vd->vdev_cant_read = cant_read;
01895         vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
01896 
01897         if (!required && zio_injection_enabled)
01898                 required = !!zio_handle_device_injection(vd, NULL, ECHILD);
01899 
01900         return (required);
01901 }
01902 
01906 boolean_t
01907 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
01908 {
01909         boolean_t needed = B_FALSE;
01910         uint64_t thismin = UINT64_MAX;
01911         uint64_t thismax = 0;
01912 
01913         if (vd->vdev_children == 0) {
01914                 mutex_enter(&vd->vdev_dtl_lock);
01915                 if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
01916                     vdev_writeable(vd)) {
01917                         space_seg_t *ss;
01918 
01919                         ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
01920                         thismin = ss->ss_start - 1;
01921                         ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
01922                         thismax = ss->ss_end;
01923                         needed = B_TRUE;
01924                 }
01925                 mutex_exit(&vd->vdev_dtl_lock);
01926         } else {
01927                 for (int c = 0; c < vd->vdev_children; c++) {
01928                         vdev_t *cvd = vd->vdev_child[c];
01929                         uint64_t cmin, cmax;
01930 
01931                         if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
01932                                 thismin = MIN(thismin, cmin);
01933                                 thismax = MAX(thismax, cmax);
01934                                 needed = B_TRUE;
01935                         }
01936                 }
01937         }
01938 
01939         if (needed && minp) {
01940                 *minp = thismin;
01941                 *maxp = thismax;
01942         }
01943         return (needed);
01944 }
01945 
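/*
 * Load on-disk state for this vdev and all of its children: metaslabs for
 * top-level vdevs and the DTL for leaves.  Any failure marks the vdev
 * CANT_OPEN with VDEV_AUX_CORRUPT_DATA.
 */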
01946 void
01947 vdev_load(vdev_t *vd)
01948 {
01949         /*
01950          * Recursively load all children.
01951          */
01952         for (int c = 0; c < vd->vdev_children; c++)
01953                 vdev_load(vd->vdev_child[c]);
01954 
01955         /*
01956          * If this is a top-level vdev, initialize its metaslabs.
01957          */
01958         if (vd == vd->vdev_top && !vd->vdev_ishole &&
01959             (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
01960             vdev_metaslab_init(vd, 0) != 0))
01961                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
01962                     VDEV_AUX_CORRUPT_DATA);
01963 
01964         /*
01965          * If this is a leaf vdev, load its DTL.
01966          */
01967         if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
01968                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
01969                     VDEV_AUX_CORRUPT_DATA);
01970 }
01971 
01979 int
01980 vdev_validate_aux(vdev_t *vd)
01981 {
01982         nvlist_t *label;
01983         uint64_t guid, version;
01984         uint64_t state;
01985 
01986         if (!vdev_readable(vd))
01987                 return (0);
01988 
01989         if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
01990                 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
01991                     VDEV_AUX_CORRUPT_DATA);
01992                 return (-1);
01993         }
01994 
01995         if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
01996             !SPA_VERSION_IS_SUPPORTED(version) ||
01997             nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
01998             guid != vd->vdev_guid ||
01999             nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
02000                 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
02001                     VDEV_AUX_CORRUPT_DATA);
02002                 nvlist_free(label);
02003                 return (-1);
02004         }
02005 
02006         /*
02007          * We don't actually check the pool state here.  If it's in fact in
02008          * use by another pool, we update this fact on the fly when requested.
02009          */
02010         nvlist_free(label);
02011         return (0);
02012 }
02013 
02014 void
02015 vdev_remove(vdev_t *vd, uint64_t txg)
02016 {
02017         spa_t *spa = vd->vdev_spa;
02018         objset_t *mos = spa->spa_meta_objset;
02019         dmu_tx_t *tx;
02020 
02021         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
02022 
02023         if (vd->vdev_dtl_smo.smo_object) {
02024                 ASSERT0(vd->vdev_dtl_smo.smo_alloc);
02025                 (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
02026                 vd->vdev_dtl_smo.smo_object = 0;
02027         }
02028 
02029         if (vd->vdev_ms != NULL) {
02030                 for (int m = 0; m < vd->vdev_ms_count; m++) {
02031                         metaslab_t *msp = vd->vdev_ms[m];
02032 
02033                         if (msp == NULL || msp->ms_smo.smo_object == 0)
02034                                 continue;
02035 
02036                         ASSERT0(msp->ms_smo.smo_alloc);
02037                         (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
02038                         msp->ms_smo.smo_object = 0;
02039                 }
02040         }
02041 
02042         if (vd->vdev_ms_array) {
02043                 (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
02044                 vd->vdev_ms_array = 0;
02045                 vd->vdev_ms_shift = 0;
02046         }
02047         dmu_tx_commit(tx);
02048 }
02049 
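/*
 * Finish syncing for this txg: run metaslab_sync_done() on every metaslab
 * on this vdev's TXG_CLEAN(txg) list and, if anything was processed,
 * reassess the metaslab group.
 */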
02050 void
02051 vdev_sync_done(vdev_t *vd, uint64_t txg)
02052 {
02053         metaslab_t *msp;
02054         boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
02055 
02056         ASSERT(!vd->vdev_ishole);
02057 
02058         while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL)
02059                 metaslab_sync_done(msp, txg);
02060 
02061         if (reassess)
02062                 metaslab_sync_reassess(vd->vdev_mg);
02063 }
02064 
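/*
 * Sync this vdev's dirty state for the given txg: create the metaslab
 * array object on first use, tear down the metadata of an empty vdev that
 * is being removed, sync each dirty metaslab and leaf DTL, and requeue the
 * vdev so vdev_sync_done() runs when the txg completes.
 */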
02065 void
02066 vdev_sync(vdev_t *vd, uint64_t txg)
02067 {
02068         spa_t *spa = vd->vdev_spa;
02069         vdev_t *lvd;
02070         metaslab_t *msp;
02071         dmu_tx_t *tx;
02072 
02073         ASSERT(!vd->vdev_ishole);
02074 
02075         if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
02076                 ASSERT(vd == vd->vdev_top);
02077                 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
02078                 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
02079                     DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
02080                 ASSERT(vd->vdev_ms_array != 0);
02081                 vdev_config_dirty(vd);
02082                 dmu_tx_commit(tx);
02083         }
02084 
02085         /*
02086          * Remove the metadata associated with this vdev once it's empty.
02087          */
02088         if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
02089                 vdev_remove(vd, txg);
02090 
02091         while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
02092                 metaslab_sync(msp, txg);
02093                 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
02094         }
02095 
02096         while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
02097                 vdev_dtl_sync(lvd, txg);
02098 
02099         (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
02100 }
02101 
02102 uint64_t
02103 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
02104 {
02105         return (vd->vdev_ops->vdev_op_asize(vd, psize));
02106 }
02107 
02112 int
02113 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
02114 {
02115         vdev_t *vd, *tvd;
02116 
02117         spa_vdev_state_enter(spa, SCL_NONE);
02118 
02119         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
02120                 return (spa_vdev_state_exit(spa, NULL, ENODEV));
02121 
02122         if (!vd->vdev_ops->vdev_op_leaf)
02123                 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
02124 
02125         tvd = vd->vdev_top;
02126 
02127         /*
02128          * We don't directly use the aux state here, but if we do a
02129          * vdev_reopen(), we need this value to be present to remember why we
02130          * were faulted.
02131          */
02132         vd->vdev_label_aux = aux;
02133 
02134         /*
02135          * Faulted state takes precedence over degraded.
02136          */
02137         vd->vdev_delayed_close = B_FALSE;
02138         vd->vdev_faulted = 1ULL;
02139         vd->vdev_degraded = 0ULL;
02140         vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
02141 
02142         /*
02143          * If this device has the only valid copy of the data, then
02144          * back off and simply mark the vdev as degraded instead.
02145          */
02146         if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
02147                 vd->vdev_degraded = 1ULL;
02148                 vd->vdev_faulted = 0ULL;
02149 
02150                 /*
02151                  * If we reopen the device and it's not dead, only then do we
02152                  * mark it degraded.
02153                  */
02154                 vdev_reopen(tvd);
02155 
02156                 if (vdev_readable(vd))
02157                         vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
02158         }
02159 
02160         return (spa_vdev_state_exit(spa, vd, 0));
02161 }
02162 
02168 int
02169 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
02170 {
02171         vdev_t *vd;
02172 
02173         spa_vdev_state_enter(spa, SCL_NONE);
02174 
02175         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
02176                 return (spa_vdev_state_exit(spa, NULL, ENODEV));
02177 
02178         if (!vd->vdev_ops->vdev_op_leaf)
02179                 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
02180 
02181         /*
02182          * If the vdev is already faulted, then don't do anything.
02183          */
02184         if (vd->vdev_faulted || vd->vdev_degraded)
02185                 return (spa_vdev_state_exit(spa, NULL, 0));
02186 
02187         vd->vdev_degraded = 1ULL;
02188         if (!vdev_is_dead(vd))
02189                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
02190                     aux);
02191 
02192         return (spa_vdev_state_exit(spa, vd, 0));
02193 }
02194 
02203 int
02204 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
02205 {
02206         vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
02207 
02208         spa_vdev_state_enter(spa, SCL_NONE);
02209 
02210         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
02211                 return (spa_vdev_state_exit(spa, NULL, ENODEV));
02212 
02213         if (!vd->vdev_ops->vdev_op_leaf)
02214                 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
02215 
02216         tvd = vd->vdev_top;
02217         vd->vdev_offline = B_FALSE;
02218         vd->vdev_tmpoffline = B_FALSE;
02219         vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
02220         vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
02221 
02222         /* XXX - L2ARC 1.0 does not support expansion */
02223         if (!vd->vdev_aux) {
02224                 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
02225                         pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
02226         }
02227 
02228         vdev_reopen(tvd);
02229         vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
02230 
02231         if (!vd->vdev_aux) {
02232                 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
02233                         pvd->vdev_expanding = B_FALSE;
02234         }
02235 
02236         if (newstate)
02237                 *newstate = vd->vdev_state;
02238         if ((flags & ZFS_ONLINE_UNSPARE) &&
02239             !vdev_is_dead(vd) && vd->vdev_parent &&
02240             vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
02241             vd->vdev_parent->vdev_child[0] == vd)
02242                 vd->vdev_unspare = B_TRUE;
02243 
02244         if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
02245 
02246                 /* XXX - L2ARC 1.0 does not support expansion */
02247                 if (vd->vdev_aux)
02248                         return (spa_vdev_state_exit(spa, vd, ENOTSUP));
02249                 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
02250         }
02251         return (spa_vdev_state_exit(spa, vd, 0));
02252 }
02253 
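/*
 * Take the leaf vdev with the given guid offline.  A device holding the
 * only valid copy of some data cannot be offlined; for a log top-level
 * vdev the log is evacuated first, retrying if the configuration changes
 * underneath us.
 */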
02254 static int
02255 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
02256 {
02257         vdev_t *vd, *tvd;
02258         int error = 0;
02259         uint64_t generation;
02260         metaslab_group_t *mg;
02261 
02262 top:
02263         spa_vdev_state_enter(spa, SCL_ALLOC);
02264 
02265         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
02266                 return (spa_vdev_state_exit(spa, NULL, ENODEV));
02267 
02268         if (!vd->vdev_ops->vdev_op_leaf)
02269                 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
02270 
02271         tvd = vd->vdev_top;
02272         mg = tvd->vdev_mg;
02273         generation = spa->spa_config_generation + 1;
02274 
02275         /*
02276          * If the device isn't already offline, try to offline it.
02277          */
02278         if (!vd->vdev_offline) {
02279                 /*
02280                  * If this device has the only valid copy of some data,
02281                  * don't allow it to be offlined. Log devices are always
02282                  * expendable.
02283                  */
02284                 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
02285                     vdev_dtl_required(vd))
02286                         return (spa_vdev_state_exit(spa, NULL, EBUSY));
02287 
02288                  * If the top-level is a slog and it has had allocations,
02289                  * evacuate the log before offlining it.  We check that the
02290                  * vdev's metaslab group is not NULL since we may have just
02291                  * added this vdev but not yet initialized its metaslabs.
02292                  * added this vdev but not yet initialized its metaslabs.
02293                  */
02294                 if (tvd->vdev_islog && mg != NULL) {
02295                         /*
02296                          * Prevent any future allocations.
02297                          */
02298                         metaslab_group_passivate(mg);
02299                         (void) spa_vdev_state_exit(spa, vd, 0);
02300 
02301                         error = spa_offline_log(spa);
02302 
02303                         spa_vdev_state_enter(spa, SCL_ALLOC);
02304 
02305                         /*
02306                          * Check to see if the config has changed.
02307                          */
02308                         if (error || generation != spa->spa_config_generation) {
02309                                 metaslab_group_activate(mg);
02310                                 if (error)
02311                                         return (spa_vdev_state_exit(spa,
02312                                             vd, error));
02313                                 (void) spa_vdev_state_exit(spa, vd, 0);
02314                                 goto top;
02315                         }
02316                         ASSERT0(tvd->vdev_stat.vs_alloc);
02317                 }
02318 
02319                 /*
02320                  * Offline this device and reopen its top-level vdev.
02321                  * If the top-level vdev is a log device then just offline
02322                  * it. Otherwise, if this action results in the top-level
02323                  * vdev becoming unusable, undo it and fail the request.
02324                  */
02325                 vd->vdev_offline = B_TRUE;
02326                 vdev_reopen(tvd);
02327 
02328                 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
02329                     vdev_is_dead(tvd)) {
02330                         vd->vdev_offline = B_FALSE;
02331                         vdev_reopen(tvd);
02332                         return (spa_vdev_state_exit(spa, NULL, EBUSY));
02333                 }
02334 
02335                 /*
02336                  * Add the device back into the metaslab rotor so that
02337                  * once we online the device it's open for business.
02338                  */
02339                 if (tvd->vdev_islog && mg != NULL)
02340                         metaslab_group_activate(mg);
02341         }
02342 
02343         vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
02344 
02345         return (spa_vdev_state_exit(spa, vd, 0));
02346 }
02347 
02348 int
02349 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
02350 {
02351         int error;
02352 
02353         mutex_enter(&spa->spa_vdev_top_lock);
02354         error = vdev_offline_locked(spa, guid, flags);
02355         mutex_exit(&spa->spa_vdev_top_lock);
02356 
02357         return (error);
02358 }
02359 
02365 void
02366 vdev_clear(spa_t *spa, vdev_t *vd)
02367 {
02368         vdev_t *rvd = spa->spa_root_vdev;
02369 
02370         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
02371 
02372         if (vd == NULL)
02373                 vd = rvd;
02374 
02375         vd->vdev_stat.vs_read_errors = 0;
02376         vd->vdev_stat.vs_write_errors = 0;
02377         vd->vdev_stat.vs_checksum_errors = 0;
02378 
02379         for (int c = 0; c < vd->vdev_children; c++)
02380                 vdev_clear(spa, vd->vdev_child[c]);
02381 
02382         /*
02383          * If we're in the FAULTED state or have experienced failed I/O, then
02384          * clear the persistent state and attempt to reopen the device.  We
02385          * also mark the vdev config dirty, so that the new faulted state is
02386          * written out to disk.
02387          */
02388         if (vd->vdev_faulted || vd->vdev_degraded ||
02389             !vdev_readable(vd) || !vdev_writeable(vd)) {
02390 
02391                 /*
02392                  * When reopening in response to a clear event, it may be due to
02393                  * a fmadm repair request.  In this case, if the device is
02394                  * still broken, we still want to post the ereport again.
02395                  */
02396                 vd->vdev_forcefault = B_TRUE;
02397 
02398                 vd->vdev_faulted = vd->vdev_degraded = 0ULL;
02399                 vd->vdev_cant_read = B_FALSE;
02400                 vd->vdev_cant_write = B_FALSE;
02401 
02402                 vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
02403 
02404                 vd->vdev_forcefault = B_FALSE;
02405 
02406                 if (vd != rvd && vdev_writeable(vd->vdev_top))
02407                         vdev_state_dirty(vd->vdev_top);
02408 
02409                 if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
02410                         spa_async_request(spa, SPA_ASYNC_RESILVER);
02411 
02412                 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
02413         }
02414 
02415         /*
02416          * When clearing a FMA-diagnosed fault, we always want to
02417          * unspare the device, as we assume that the original spare was
02418          * done in response to the FMA fault.
02419          */
02420         if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
02421             vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
02422             vd->vdev_parent->vdev_child[0] == vd)
02423                 vd->vdev_unspare = B_TRUE;
02424 }
02425 
02426 boolean_t
02427 vdev_is_dead(vdev_t *vd)
02428 {
02429         /*
02430          * Holes and missing devices are always considered "dead".
02431          * This simplifies the code since we don't have to check for
02432          * these types of devices in the various code paths.
02433          * Instead we rely on the fact that we skip over dead devices
02434          * before issuing I/O to them.
02435          */
02436         return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
02437             vd->vdev_ops == &vdev_missing_ops);
02438 }
02439 
02440 boolean_t
02441 vdev_readable(vdev_t *vd)
02442 {
02443         return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
02444 }
02445 
02446 boolean_t
02447 vdev_writeable(vdev_t *vd)
02448 {
02449         return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
02450 }
02451 
02452 boolean_t
02453 vdev_allocatable(vdev_t *vd)
02454 {
02455         uint64_t state = vd->vdev_state;
02456 
02457         /*
02458          * We currently allow allocations from vdevs which may be in the
02459          * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
02460          * fails to reopen then we'll catch it later when we're holding
02461          * the proper locks.  Note that we have to get the vdev state
02462          * in a local variable because although it changes atomically,
02463          * we're asking two separate questions about it.
02464          */
02465         return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
02466             !vd->vdev_cant_write && !vd->vdev_ishole);
02467 }
02468 
02469 boolean_t
02470 vdev_accessible(vdev_t *vd, zio_t *zio)
02471 {
02472         ASSERT(zio->io_vd == vd);
02473 
02474         if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
02475                 return (B_FALSE);
02476 
02477         if (zio->io_type == ZIO_TYPE_READ)
02478                 return (!vd->vdev_cant_read);
02479 
02480         if (zio->io_type == ZIO_TYPE_WRITE)
02481                 return (!vd->vdev_cant_write);
02482 
02483         return (B_TRUE);
02484 }
02485 
02486 /*
02487  * Get statistics for the given vdev.
02488  */
02489 void
02490 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
02491 {
02492         vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
02493 
02494         mutex_enter(&vd->vdev_stat_lock);
02495         bcopy(&vd->vdev_stat, vs, sizeof (*vs));
02496         vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
02497         vs->vs_state = vd->vdev_state;
02498         vs->vs_rsize = vdev_get_min_asize(vd);
02499         if (vd->vdev_ops->vdev_op_leaf)
02500                 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
02501         vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
02502         mutex_exit(&vd->vdev_stat_lock);
02503 
02504         /*
02505          * If we're getting stats on the root vdev, aggregate the I/O counts
02506          * over all top-level vdevs (i.e. the direct children of the root).
02507          */
02508         if (vd == rvd) {
02509                 for (int c = 0; c < rvd->vdev_children; c++) {
02510                         vdev_t *cvd = rvd->vdev_child[c];
02511                         vdev_stat_t *cvs = &cvd->vdev_stat;
02512 
02513                         mutex_enter(&vd->vdev_stat_lock);
02514                         for (int t = 0; t < ZIO_TYPES; t++) {
02515                                 vs->vs_ops[t] += cvs->vs_ops[t];
02516                                 vs->vs_bytes[t] += cvs->vs_bytes[t];
02517                         }
02518                         cvs->vs_scan_removing = cvd->vdev_removing;
02519                         mutex_exit(&vd->vdev_stat_lock);
02520                 }
02521         }
02522 }
02523 
02524 void
02525 vdev_clear_stats(vdev_t *vd)
02526 {
02527         mutex_enter(&vd->vdev_stat_lock);
02528         vd->vdev_stat.vs_space = 0;
02529         vd->vdev_stat.vs_dspace = 0;
02530         vd->vdev_stat.vs_alloc = 0;
02531         mutex_exit(&vd->vdev_stat_lock);
02532 }
02533 
02534 void
02535 vdev_scan_stat_init(vdev_t *vd)
02536 {
02537         vdev_stat_t *vs = &vd->vdev_stat;
02538 
02539         for (int c = 0; c < vd->vdev_children; c++)
02540                 vdev_scan_stat_init(vd->vdev_child[c]);
02541 
02542         mutex_enter(&vd->vdev_stat_lock);
02543         vs->vs_scan_processed = 0;
02544         mutex_exit(&vd->vdev_stat_lock);
02545 }
02546 
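/*
 * Account a completed zio against the vdev's statistics: successful I/O
 * updates the per-type ops/bytes counters (plus scan and self-heal bytes
 * for repair writes), while failures bump the read, write, or checksum
 * error counts and, for writes, dirty the DTL so the damage gets
 * resilvered.
 */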
02547 void
02548 vdev_stat_update(zio_t *zio, uint64_t psize)
02549 {
02550         spa_t *spa = zio->io_spa;
02551         vdev_t *rvd = spa->spa_root_vdev;
02552         vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
02553         vdev_t *pvd;
02554         uint64_t txg = zio->io_txg;
02555         vdev_stat_t *vs = &vd->vdev_stat;
02556         zio_type_t type = zio->io_type;
02557         int flags = zio->io_flags;
02558 
02559         /*
02560          * If this i/o is a gang leader, it didn't do any actual work.
02561          */
02562         if (zio->io_gang_tree)
02563                 return;
02564 
02565         if (zio->io_error == 0) {
02566                 /*
02567                  * If this is a root i/o, don't count it -- we've already
02568                  * counted the top-level vdevs, and vdev_get_stats() will
02569                  * aggregate them when asked.  This reduces contention on
02570                  * the root vdev_stat_lock and implicitly handles blocks
02571                  * that compress away to holes, for which there is no i/o.
02572                  * (Holes never create vdev children, so all the counters
02573                  * remain zero, which is what we want.)
02574                  *
02575                  * Note: this only applies to successful i/o (io_error == 0)
02576                  * because unlike i/o counts, errors are not additive.
02577                  * When reading a ditto block, for example, failure of
02578                  * one top-level vdev does not imply a root-level error.
02579                  */
02580                 if (vd == rvd)
02581                         return;
02582 
02583                 ASSERT(vd == zio->io_vd);
02584 
02585                 if (flags & ZIO_FLAG_IO_BYPASS)
02586                         return;
02587 
02588                 mutex_enter(&vd->vdev_stat_lock);
02589 
02590                 if (flags & ZIO_FLAG_IO_REPAIR) {
02591                         if (flags & ZIO_FLAG_SCAN_THREAD) {
02592                                 dsl_scan_phys_t *scn_phys =
02593                                     &spa->spa_dsl_pool->dp_scan->scn_phys;
02594                                 uint64_t *processed = &scn_phys->scn_processed;
02595 
02596                                 /* XXX cleanup? */
02597                                 if (vd->vdev_ops->vdev_op_leaf)
02598                                         atomic_add_64(processed, psize);
02599                                 vs->vs_scan_processed += psize;
02600                         }
02601 
02602                         if (flags & ZIO_FLAG_SELF_HEAL)
02603                                 vs->vs_self_healed += psize;
02604                 }
02605 
02606                 vs->vs_ops[type]++;
02607                 vs->vs_bytes[type] += psize;
02608 
02609                 mutex_exit(&vd->vdev_stat_lock);
02610                 return;
02611         }
02612 
02613         if (flags & ZIO_FLAG_SPECULATIVE)
02614                 return;
02615 
02616         /*
02617          * If this is an I/O error that is going to be retried, then ignore the
02618          * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
02619          * hard errors, when in reality they can happen for any number of
02620          * innocuous reasons (bus resets, MPxIO link failure, etc).
02621          */
02622         if (zio->io_error == EIO &&
02623             !(zio->io_flags & ZIO_FLAG_IO_RETRY))
02624                 return;
02625 
02626         /*
02627          * Intent log writes won't propagate their error to the root
02628          * I/O, so don't mark these types of failures as pool-level
02629          * errors.
02630          */
02631         if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
02632                 return;
02633 
02634         mutex_enter(&vd->vdev_stat_lock);
02635         if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
02636                 if (zio->io_error == ECKSUM)
02637                         vs->vs_checksum_errors++;
02638                 else
02639                         vs->vs_read_errors++;
02640         }
02641         if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
02642                 vs->vs_write_errors++;
02643         mutex_exit(&vd->vdev_stat_lock);
02644 
02645         if (type == ZIO_TYPE_WRITE && txg != 0 &&
02646             (!(flags & ZIO_FLAG_IO_REPAIR) ||
02647             (flags & ZIO_FLAG_SCAN_THREAD) ||
02648             spa->spa_claiming)) {
02649                 /*
02650                  * This is either a normal write (not a repair), or it's
02651                  * a repair induced by the scrub thread, or it's a repair
02652                  * made by zil_claim() during spa_load() in the first txg.
02653                  * In the normal case, we commit the DTL change in the same
02654                  * txg as the block was born.  In the scrub-induced repair
02655                  * case, we know that scrubs run in first-pass syncing context,
02656                  * so we commit the DTL change in spa_syncing_txg(spa).
02657                  * In the zil_claim() case, we commit in spa_first_txg(spa).
02658                  *
02659                  * We currently do not make DTL entries for failed spontaneous
02660                  * self-healing writes triggered by normal (non-scrubbing)
02661                  * reads, because we have no transactional context in which to
02662                  * do so -- and it's not clear that it'd be desirable anyway.
02663                  */
02664                 if (vd->vdev_ops->vdev_op_leaf) {
02665                         uint64_t commit_txg = txg;
02666                         if (flags & ZIO_FLAG_SCAN_THREAD) {
02667                                 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
02668                                 ASSERT(spa_sync_pass(spa) == 1);
02669                                 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
02670                                 commit_txg = spa_syncing_txg(spa);
02671                         } else if (spa->spa_claiming) {
02672                                 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
02673                                 commit_txg = spa_first_txg(spa);
02674                         }
02675                         ASSERT(commit_txg >= spa_syncing_txg(spa));
02676                         if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
02677                                 return;
02678                         for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
02679                                 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
02680                         vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
02681                 }
02682                 if (vd != rvd)
02683                         vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
02684         }
02685 }
02686 
02691 void
02692 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
02693     int64_t space_delta)
02694 {
02695         int64_t dspace_delta = space_delta;
02696         spa_t *spa = vd->vdev_spa;
02697         vdev_t *rvd = spa->spa_root_vdev;
02698         metaslab_group_t *mg = vd->vdev_mg;
02699         metaslab_class_t *mc = mg ? mg->mg_class : NULL;
02700 
02701         ASSERT(vd == vd->vdev_top);
02702 
02703         /*
02704          * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
02705          * factor.  We must calculate this here and not at the root vdev
02706          * because the root vdev's psize-to-asize is simply the max of its
02707          * children's, thus not accurate enough for us.
02708          */
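        /*
         * For example, a single-parity RAID-Z top-level vdev with three
         * children turns a 128K block into roughly 192K of allocated
         * space, so its deflate ratio is about two thirds of a plain
         * disk's and a 192K space_delta is credited back to vs_dspace as
         * roughly 128K of usable space.
         */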
02709         ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
02710         ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
02711         dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
02712             vd->vdev_deflate_ratio;
02713 
02714         mutex_enter(&vd->vdev_stat_lock);
02715         vd->vdev_stat.vs_alloc += alloc_delta;
02716         vd->vdev_stat.vs_space += space_delta;
02717         vd->vdev_stat.vs_dspace += dspace_delta;
02718         mutex_exit(&vd->vdev_stat_lock);
02719 
02720         if (mc == spa_normal_class(spa)) {
02721                 mutex_enter(&rvd->vdev_stat_lock);
02722                 rvd->vdev_stat.vs_alloc += alloc_delta;
02723                 rvd->vdev_stat.vs_space += space_delta;
02724                 rvd->vdev_stat.vs_dspace += dspace_delta;
02725                 mutex_exit(&rvd->vdev_stat_lock);
02726         }
02727 
02728         if (mc != NULL) {
02729                 ASSERT(rvd == vd->vdev_parent);
02730                 ASSERT(vd->vdev_ms_count != 0);
02731 
02732                 metaslab_class_space_update(mc,
02733                     alloc_delta, defer_delta, space_delta, dspace_delta);
02734         }
02735 }
02736 
02742 void
02743 vdev_config_dirty(vdev_t *vd)
02744 {
02745         spa_t *spa = vd->vdev_spa;
02746         vdev_t *rvd = spa->spa_root_vdev;
02747         int c;
02748 
02749         ASSERT(spa_writeable(spa));
02750 
02751         /*
02752          * If this is an aux vdev (as with l2cache and spare devices), then we
02753          * update the vdev config manually and set the sync flag.
02754          */
02755         if (vd->vdev_aux != NULL) {
02756                 spa_aux_vdev_t *sav = vd->vdev_aux;
02757                 nvlist_t **aux;
02758                 uint_t naux;
02759 
02760                 for (c = 0; c < sav->sav_count; c++) {
02761                         if (sav->sav_vdevs[c] == vd)
02762                                 break;
02763                 }
02764 
02765                 if (c == sav->sav_count) {
02766                         /*
02767                          * We're being removed.  There's nothing more to do.
02768                          */
02769                         ASSERT(sav->sav_sync == B_TRUE);
02770                         return;
02771                 }
02772 
02773                 sav->sav_sync = B_TRUE;
02774 
02775                 if (nvlist_lookup_nvlist_array(sav->sav_config,
02776                     ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
02777                         VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
02778                             ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
02779                 }
02780 
02781                 ASSERT(c < naux);
02782 
02783                 /*
02784                  * Setting the nvlist in the middle of the array is a little
02785                  * sketchy, but it will work.
02786                  */
02787                 nvlist_free(aux[c]);
02788                 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
02789 
02790                 return;
02791         }
02792 
02793         /*
02794          * The dirty list is protected by the SCL_CONFIG lock.  The caller
02795          * must either hold SCL_CONFIG as writer, or must be the sync thread
02796          * (which holds SCL_CONFIG as reader).  There's only one sync thread,
02797          * so this is sufficient to ensure mutual exclusion.
02798          */
02799         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
02800             (dsl_pool_sync_context(spa_get_dsl(spa)) &&
02801             spa_config_held(spa, SCL_CONFIG, RW_READER)));
02802 
02803         if (vd == rvd) {
02804                 for (c = 0; c < rvd->vdev_children; c++)
02805                         vdev_config_dirty(rvd->vdev_child[c]);
02806         } else {
02807                 ASSERT(vd == vd->vdev_top);
02808 
02809                 if (!list_link_active(&vd->vdev_config_dirty_node) &&
02810                     !vd->vdev_ishole)
02811                         list_insert_head(&spa->spa_config_dirty_list, vd);
02812         }
02813 }
02814 
02815 void
02816 vdev_config_clean(vdev_t *vd)
02817 {
02818         spa_t *spa = vd->vdev_spa;
02819 
02820         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
02821             (dsl_pool_sync_context(spa_get_dsl(spa)) &&
02822             spa_config_held(spa, SCL_CONFIG, RW_READER)));
02823 
02824         ASSERT(list_link_active(&vd->vdev_config_dirty_node));
02825         list_remove(&spa->spa_config_dirty_list, vd);
02826 }
02827 
02834 void
02835 vdev_state_dirty(vdev_t *vd)
02836 {
02837         spa_t *spa = vd->vdev_spa;
02838 
02839         ASSERT(spa_writeable(spa));
02840         ASSERT(vd == vd->vdev_top);
02841 
02842         /*
02843          * The state list is protected by the SCL_STATE lock.  The caller
02844          * must either hold SCL_STATE as writer, or must be the sync thread
02845          * (which holds SCL_STATE as reader).  There's only one sync thread,
02846          * so this is sufficient to ensure mutual exclusion.
02847          */
02848         ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
02849             (dsl_pool_sync_context(spa_get_dsl(spa)) &&
02850             spa_config_held(spa, SCL_STATE, RW_READER)));
02851 
02852         if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
02853                 list_insert_head(&spa->spa_state_dirty_list, vd);
02854 }
02855 
02856 void
02857 vdev_state_clean(vdev_t *vd)
02858 {
02859         spa_t *spa = vd->vdev_spa;
02860 
02861         ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
02862             (dsl_pool_sync_context(spa_get_dsl(spa)) &&
02863             spa_config_held(spa, SCL_STATE, RW_READER)));
02864 
02865         ASSERT(list_link_active(&vd->vdev_state_dirty_node));
02866         list_remove(&spa->spa_state_dirty_list, vd);
02867 }
02868 
02872 void
02873 vdev_propagate_state(vdev_t *vd)
02874 {
02875         spa_t *spa = vd->vdev_spa;
02876         vdev_t *rvd = spa->spa_root_vdev;
02877         int degraded = 0, faulted = 0;
02878         int corrupted = 0;
02879         vdev_t *child;
02880 
02881         if (vd->vdev_children > 0) {
02882                 for (int c = 0; c < vd->vdev_children; c++) {
02883                         child = vd->vdev_child[c];
02884 
02885                         /*
02886                          * Don't factor holes into the decision.
02887                          */
02888                         if (child->vdev_ishole)
02889                                 continue;
02890 
02891                         if (!vdev_readable(child) ||
02892                             (!vdev_writeable(child) && spa_writeable(spa))) {
02893                                 /*
02894                                  * Root special: if there is a top-level log
02895                                  * device, treat the root vdev as if it were
02896                                  * degraded.
02897                                  */
02898                                 if (child->vdev_islog && vd == rvd)
02899                                         degraded++;
02900                                 else
02901                                         faulted++;
02902                         } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
02903                                 degraded++;
02904                         }
02905 
02906                         if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
02907                                 corrupted++;
02908                 }
02909 
02910                 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
02911 
02912                 /*
02913                  * Root special: if there is a top-level vdev that cannot be
02914                  * opened due to corrupted metadata, then propagate the root
02915                  * vdev's aux state as 'corrupt' rather than 'insufficient
02916                  * replicas'.
02917                  */
02918                 if (corrupted && vd == rvd &&
02919                     rvd->vdev_state == VDEV_STATE_CANT_OPEN)
02920                         vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
02921                             VDEV_AUX_CORRUPT_DATA);
02922         }
02923 
02924         if (vd->vdev_parent)
02925                 vdev_propagate_state(vd->vdev_parent);
02926 }
02927 
02936 void
02937 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
02938 {
02939         uint64_t save_state;
02940         spa_t *spa = vd->vdev_spa;
02941 
02942         if (state == vd->vdev_state) {
02943                 vd->vdev_stat.vs_aux = aux;
02944                 return;
02945         }
02946 
02947         save_state = vd->vdev_state;
02948 
02949         vd->vdev_state = state;
02950         vd->vdev_stat.vs_aux = aux;
02951 
02952         /*
02953          * If we are setting the vdev state to anything but an open state, then
02954          * always close the underlying device unless the device has requested
02955          * a delayed close (i.e. we're about to remove or fault the device).
02956          * Otherwise, we keep accessible but invalid devices open forever.
02957          * We don't call vdev_close() itself, because that implies some extra
02958          * checks (offline, etc) that we don't want here.  This is limited to
02959          * leaf devices, because otherwise closing the device will affect other
02960          * children.
02961          */
02962         if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
02963             vd->vdev_ops->vdev_op_leaf)
02964                 vd->vdev_ops->vdev_op_close(vd);
02965 
02966         /*
02967          * If we have brought this vdev back into service, we need
02968          * to notify fmd so that it can gracefully repair any outstanding
02969          * cases due to a missing device.  We do this in all cases, even those
02970          * that probably don't correlate to a repaired fault.  This is sure to
02971          * catch all cases, and we let the zfs-retire agent sort it out.  If
02972          * this is a transient state it's OK, as the retire agent will
02973          * double-check the state of the vdev before repairing it.
02974          */
02975         if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
02976             vd->vdev_prevstate != state)
02977                 zfs_post_state_change(spa, vd);
02978 
02979         if (vd->vdev_removed &&
02980             state == VDEV_STATE_CANT_OPEN &&
02981             (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
02982                 /*
02983                  * If the previous state is set to VDEV_STATE_REMOVED, then this
02984                  * device was previously marked removed and someone attempted to
02985                  * reopen it.  If this failed due to a nonexistent device, then
02986                  * keep the device in the REMOVED state.  We also leave the state
02987                  * alone if this is one of our special test online cases, which
02988                  * only attempt to online the device and shouldn't generate an
02989                  * FMA fault.
02990                  */
02991                 vd->vdev_state = VDEV_STATE_REMOVED;
02992                 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
02993         } else if (state == VDEV_STATE_REMOVED) {
02994                 vd->vdev_removed = B_TRUE;
02995         } else if (state == VDEV_STATE_CANT_OPEN) {
02996                 /*
02997                  * If we fail to open a vdev during an import or recovery, we
02998                  * mark it as "not available", which signifies that it was
02999                  * never there to begin with.  Failure to open such a device
03000                  * is not considered an error.
03001                  */
03002                 if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
03003                     spa_load_state(spa) == SPA_LOAD_RECOVER) &&
03004                     vd->vdev_ops->vdev_op_leaf)
03005                         vd->vdev_not_present = 1;
03006 
03007                 /*
03008                  * Post the appropriate ereport.  If the 'prevstate' field is
03009                  * set to something other than VDEV_STATE_UNKNOWN, it indicates
03010                  * that this is part of a vdev_reopen().  In this case, we don't
03011                  * want to post the ereport if the device was already in the
03012                  * CANT_OPEN state beforehand.
03013                  *
03014                  * If the 'checkremove' flag is set, then this is an attempt to
03015                  * online the device in response to an insertion event.  If we
03016                  * hit this case, then we have detected an insertion event for a
03017                  * faulted or offline device that wasn't in the removed state.
03018                  * In this scenario, we don't post an ereport because we are
03019                  * about to replace the device, or attempt an online with
03020                  * vdev_forcefault, which will generate the fault for us.
03021                  */
03022                 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
03023                     !vd->vdev_not_present && !vd->vdev_checkremove &&
03024                     vd != spa->spa_root_vdev) {
03025                         const char *class;
03026 
03027                         switch (aux) {
03028                         case VDEV_AUX_OPEN_FAILED:
03029                                 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
03030                                 break;
03031                         case VDEV_AUX_CORRUPT_DATA:
03032                                 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
03033                                 break;
03034                         case VDEV_AUX_NO_REPLICAS:
03035                                 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
03036                                 break;
03037                         case VDEV_AUX_BAD_GUID_SUM:
03038                                 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
03039                                 break;
03040                         case VDEV_AUX_TOO_SMALL:
03041                                 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
03042                                 break;
03043                         case VDEV_AUX_BAD_LABEL:
03044                                 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
03045                                 break;
03046                         default:
03047                                 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
03048                         }
03049 
03050                         zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
03051                 }
03052 
03053                 /* Erase any notion of persistent removed state */
03054                 vd->vdev_removed = B_FALSE;
03055         } else {
03056                 vd->vdev_removed = B_FALSE;
03057         }
03058 
03059         if (!isopen && vd->vdev_parent)
03060                 vdev_propagate_state(vd->vdev_parent);
03061 }
03062 
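/*
 * Illustrative sketch (not part of vdev.c): the typical shape of a
 * vdev_set_state() caller in an open path.  The helper name and error
 * handling are hypothetical; isopen is B_TRUE so that parent state is
 * not propagated while sibling children are still being opened.
 */
static void
example_record_open_outcome(vdev_t *vd, int error)
{
        if (error != 0) {
                /* Remember why the device could not be opened. */
                vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_OPEN_FAILED);
        } else {
                vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY,
                    VDEV_AUX_NONE);
        }
}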
03076 boolean_t
03077 vdev_is_bootable(vdev_t *vd)
03078 {
03079 #ifdef sun
03080         if (!vd->vdev_ops->vdev_op_leaf) {
03081                 char *vdev_type = vd->vdev_ops->vdev_op_type;
03082 
03083                 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
03084                     vd->vdev_children > 1) {
03085                         return (B_FALSE);
03086                 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
03087                     strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
03088                         return (B_FALSE);
03089                 }
03090         } else if (vd->vdev_wholedisk == 1) {
03091                 return (B_FALSE);
03092         }
03093 
03094         for (int c = 0; c < vd->vdev_children; c++) {
03095                 if (!vdev_is_bootable(vd->vdev_child[c]))
03096                         return (B_FALSE);
03097         }
03098 #endif  /* sun */
03099         return (B_TRUE);
03100 }
03101 
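/*
 * Illustrative sketch (not part of vdev.c): how a pool-reconfiguration
 * path might consult vdev_is_bootable() before committing a change to a
 * bootable pool.  The real checks live in spa.c; the helper below and
 * the caller-supplied candidate root vdev are hypothetical.
 */
static int
example_check_bootable(spa_t *spa, vdev_t *candidate_rvd)
{
        if (spa->spa_bootfs != 0 && !vdev_is_bootable(candidate_rvd))
                return (ENOTSUP);
        return (0);
}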
03108 void
03109 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
03110 {
03111         spa_t *spa = nvd->vdev_spa;
03112 
03113         ASSERT(nvd->vdev_top->vdev_islog);
03114         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
03115         ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
03116 
03117         for (int c = 0; c < nvd->vdev_children; c++)
03118                 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
03119 
03120         if (nvd->vdev_ops->vdev_op_leaf) {
03121                 /*
03122                  * Restore the persistent vdev state
03123                  */
03124                 nvd->vdev_offline = ovd->vdev_offline;
03125                 nvd->vdev_faulted = ovd->vdev_faulted;
03126                 nvd->vdev_degraded = ovd->vdev_degraded;
03127                 nvd->vdev_removed = ovd->vdev_removed;
03128         }
03129 }
03130 
03136 boolean_t
03137 vdev_log_state_valid(vdev_t *vd)
03138 {
03139         if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
03140             !vd->vdev_removed)
03141                 return (B_TRUE);
03142 
03143         for (int c = 0; c < vd->vdev_children; c++)
03144                 if (vdev_log_state_valid(vd->vdev_child[c]))
03145                         return (B_TRUE);
03146 
03147         return (B_FALSE);
03148 }
03149 
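/*
 * Illustrative sketch (not part of vdev.c): combining the two helpers
 * above.  After the persistent offline/faulted/degraded/removed flags
 * are copied from a previously loaded log top-level vdev onto a freshly
 * created one, a hypothetical caller can ask whether any leaf underneath
 * is still usable.  Assumes the caller holds SCL_STATE_ALL as writer, as
 * the ASSERTs in vdev_load_log_state() require.
 */
static boolean_t
example_restore_log_vdev(vdev_t *nvd, vdev_t *ovd)
{
        vdev_load_log_state(nvd, ovd);
        return (vdev_log_state_valid(nvd));
}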
03153 void
03154 vdev_expand(vdev_t *vd, uint64_t txg)
03155 {
03156         ASSERT(vd->vdev_top == vd);
03157         ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
03158 
03159         if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
03160                 VERIFY(vdev_metaslab_init(vd, txg) == 0);
03161                 vdev_config_dirty(vd);
03162         }
03163 }
03164 
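/*
 * Worked example (illustrative, not part of vdev.c) for the expansion
 * test above, assuming vdev_ms_shift == 34 (16 GiB metaslabs): a device
 * grown from 1 TiB to 2 TiB has (vdev_asize >> 34) == 128, which exceeds
 * the existing vdev_ms_count of 64, so vdev_metaslab_init() allocates
 * metaslabs for the newly visible space and the config is dirtied so the
 * change is written out in this txg.
 */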
03168 void
03169 vdev_split(vdev_t *vd)
03170 {
03171         vdev_t *cvd, *pvd = vd->vdev_parent;
03172 
03173         vdev_remove_child(pvd, vd);
03174         vdev_compact_children(pvd);
03175 
03176         cvd = pvd->vdev_child[0];
03177         if (pvd->vdev_children == 1) {
03178                 vdev_remove_parent(cvd);
03179                 cvd->vdev_splitting = B_TRUE;
03180         }
03181         vdev_propagate_state(cvd);
03182 }
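/*
 * Illustrative sketch (not part of vdev.c) of the transformation
 * vdev_split() performs when one half of a two-way mirror is detached
 * for a pool split:
 *
 *      root                          root
 *        mirror         ==>            disk A   (flagged vdev_splitting
 *          disk A                                once the one-child
 *          disk B  <- vd removed                 mirror is collapsed)
 *
 * vd is unlinked from its parent, the remaining children are compacted,
 * and if only one child is left the now-redundant mirror layer is
 * removed before state is re-propagated from the survivor.
 */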