FreeBSD ZFS
The Zettabyte File System

spa.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef  _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif  /* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
int check_hostid = 1;

/*
 * The interval, in seconds, at which failed configuration cache file
 * writes are retried.
 */
int zfs_ccw_retry_interval = 300;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");
TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
    &zfs_ccw_retry_interval, 0,
    "Configuration cache file write, retry after failure, interval (seconds)");

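/*
 * Usage sketch (illustrative, not part of the original file): both knobs
 * are loader tunables as well as writable sysctls, e.g.
 *
 *	# /boot/loader.conf
 *	vfs.zfs.check_hostid="0"
 *
 *	# at run time
 *	sysctl vfs.zfs.ccw_retry_interval=60
 */
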
typedef enum zti_modes {
        zti_mode_fixed,                 /* value is # of threads (min 1) */
        zti_mode_online_percent,        /* value is % of online CPUs */
        zti_mode_batch,                 /* cpu-intensive; value is ignored */
        zti_mode_null,                  /* don't create a taskq */
        zti_nmodes
} zti_modes_t;

#define ZTI_FIX(n)      { zti_mode_fixed, (n) }
#define ZTI_PCT(n)      { zti_mode_online_percent, (n) }
#define ZTI_BATCH       { zti_mode_batch, 0 }
#define ZTI_NULL        { zti_mode_null, 0 }

#define ZTI_ONE         ZTI_FIX(1)

typedef struct zio_taskq_info {
        enum zti_modes zti_mode;
        uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
        "issue", "issue_high", "intr", "intr_high"
};

/*
 * Per-(zio type, taskq type) thread-count policy, consumed by
 * spa_create_zio_taskqs() when the pool's I/O taskqs are created.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
        /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },     /* NULL */
        { ZTI_FIX(8),   ZTI_NULL,       ZTI_BATCH,      ZTI_NULL },     /* READ */
        { ZTI_BATCH,    ZTI_FIX(5),     ZTI_FIX(8),     ZTI_FIX(5) },   /* WRITE */
        { ZTI_FIX(100), ZTI_NULL,       ZTI_ONE,        ZTI_NULL },     /* FREE */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },     /* CLAIM */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },     /* IOCTL */
};
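
/*
 * Reading the table above (sketch): a ZIO_TYPE_WRITE issue taskq is
 * created in batch mode and sized by zio_taskq_batch_pct, while its
 * intr taskq gets a fixed eight threads.  A ZTI_NULL entry means no
 * "_high" taskq is created for that slot, and dispatch falls back to
 * the corresponding normal-priority taskq.
 */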

static dsl_syncfunc_t spa_sync_version;
static dsl_syncfunc_t spa_sync_props;
static dsl_checkfunc_t spa_change_guid_check;
static dsl_syncfunc_t spa_change_guid_sync;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t          zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t            zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
#endif
uint_t          zio_taskq_basedc = 80;          /* base duty cycle */

boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define TRYIMPORT_NAME  "$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
        const char *propname = zpool_prop_to_name(prop);
        nvlist_t *propval;

        VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

        if (strval != NULL)
                VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
        else
                VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

        VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
        nvlist_free(propval);
}
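
/*
 * Shape of the nvlist built above (sketch, shown for a string value):
 *
 *	"<propname>" -> {
 *		ZPROP_SOURCE -> <src>
 *		ZPROP_VALUE  -> "<strval>"
 *	}
 */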

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
        vdev_t *rvd = spa->spa_root_vdev;
        dsl_pool_t *pool = spa->spa_dsl_pool;
        uint64_t size;
        uint64_t alloc;
        uint64_t space;
        uint64_t cap, version;
        zprop_source_t src = ZPROP_SRC_NONE;
        spa_config_dirent_t *dp;

        ASSERT(MUTEX_HELD(&spa->spa_props_lock));

        if (rvd != NULL) {
                alloc = metaslab_class_get_alloc(spa_normal_class(spa));
                size = metaslab_class_get_space(spa_normal_class(spa));
                spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
                    size - alloc, src);

                space = 0;
                for (int c = 0; c < rvd->vdev_children; c++) {
                        vdev_t *tvd = rvd->vdev_child[c];
                        space += tvd->vdev_max_asize - tvd->vdev_asize;
                }
                spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
                    src);

                spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
                    (spa_mode(spa) == FREAD), src);

                cap = (size == 0) ? 0 : (alloc * 100 / size);
                spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

                spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
                    ddt_get_pool_dedup_ratio(spa), src);

                spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
                    rvd->vdev_state, src);

                version = spa_version(spa);
                if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
                        src = ZPROP_SRC_DEFAULT;
                else
                        src = ZPROP_SRC_LOCAL;
                spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
        }

        if (pool != NULL) {
                dsl_dir_t *freedir = pool->dp_free_dir;

                /*
                 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
                 * when opening pools before this version, freedir will be NULL.
                 */
                if (freedir != NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
                            freedir->dd_phys->dd_used_bytes, src);
                } else {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
                            NULL, 0, src);
                }
        }

        spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

        if (spa->spa_comment != NULL) {
                spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
                    0, ZPROP_SRC_LOCAL);
        }

        if (spa->spa_root != NULL)
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
                    0, ZPROP_SRC_LOCAL);

        if ((dp = list_head(&spa->spa_config_list)) != NULL) {
                if (dp->scd_path == NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
                            "none", 0, ZPROP_SRC_LOCAL);
                } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
                            dp->scd_path, 0, ZPROP_SRC_LOCAL);
                }
        }
}
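
/*
 * Note: everything added above is synthesized from in-core state (the
 * vdev tree, metaslab classes, and DSL pool); persistent properties
 * stored in the MOS are merged in separately by spa_prop_get() below.
 */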

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
        objset_t *mos = spa->spa_meta_objset;
        zap_cursor_t zc;
        zap_attribute_t za;
        int err;

        VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

        mutex_enter(&spa->spa_props_lock);

        /*
         * Get properties from the spa config.
         */
        spa_prop_get_config(spa, nvp);

        /* If no pool property object, no more props to get. */
        if (mos == NULL || spa->spa_pool_props_object == 0) {
                mutex_exit(&spa->spa_props_lock);
                return (0);
        }

        /*
         * Get properties from the MOS pool property object.
         */
        for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
            (err = zap_cursor_retrieve(&zc, &za)) == 0;
            zap_cursor_advance(&zc)) {
                uint64_t intval = 0;
                char *strval = NULL;
                zprop_source_t src = ZPROP_SRC_DEFAULT;
                zpool_prop_t prop;

                if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
                        continue;

                switch (za.za_integer_length) {
                case 8:
                        /* integer property */
                        if (za.za_first_integer !=
                            zpool_prop_default_numeric(prop))
                                src = ZPROP_SRC_LOCAL;

                        if (prop == ZPOOL_PROP_BOOTFS) {
                                dsl_pool_t *dp;
                                dsl_dataset_t *ds = NULL;

                                dp = spa_get_dsl(spa);
                                rw_enter(&dp->dp_config_rwlock, RW_READER);
                                if (err = dsl_dataset_hold_obj(dp,
                                    za.za_first_integer, FTAG, &ds)) {
                                        rw_exit(&dp->dp_config_rwlock);
                                        break;
                                }

                                strval = kmem_alloc(
                                    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
                                    KM_SLEEP);
                                dsl_dataset_name(ds, strval);
                                dsl_dataset_rele(ds, FTAG);
                                rw_exit(&dp->dp_config_rwlock);
                        } else {
                                strval = NULL;
                                intval = za.za_first_integer;
                        }

                        spa_prop_add_list(*nvp, prop, strval, intval, src);

                        if (strval != NULL)
                                kmem_free(strval,
                                    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

                        break;

                case 1:
                        /* string property */
                        strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
                        err = zap_lookup(mos, spa->spa_pool_props_object,
                            za.za_name, 1, za.za_num_integers, strval);
                        if (err) {
                                kmem_free(strval, za.za_num_integers);
                                break;
                        }
                        spa_prop_add_list(*nvp, prop, strval, 0, src);
                        kmem_free(strval, za.za_num_integers);
                        break;

                default:
                        break;
                }
        }
        zap_cursor_fini(&zc);
        mutex_exit(&spa->spa_props_lock);
out:
        if (err && err != ENOENT) {
                nvlist_free(*nvp);
                *nvp = NULL;
                return (err);
        }

        return (0);
}
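
/*
 * Usage note (sketch): this is the backend for "zpool get", reached via
 * the pool-properties ioctl.  The caller owns the returned *nvp and is
 * expected to free it with nvlist_free().
 */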

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
        nvpair_t *elem;
        int error = 0, reset_bootfs = 0;
        uint64_t objnum;
        boolean_t has_feature = B_FALSE;

        elem = NULL;
        while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
                uint64_t intval;
                char *strval, *slash, *check, *fname;
                const char *propname = nvpair_name(elem);
                zpool_prop_t prop = zpool_name_to_prop(propname);

                switch (prop) {
                case ZPROP_INVAL:
                        if (!zpool_prop_feature(propname)) {
                                error = EINVAL;
                                break;
                        }

                        /*
                         * Sanitize the input.
                         */
                        if (nvpair_type(elem) != DATA_TYPE_UINT64) {
                                error = EINVAL;
                                break;
                        }

                        if (nvpair_value_uint64(elem, &intval) != 0) {
                                error = EINVAL;
                                break;
                        }

                        if (intval != 0) {
                                error = EINVAL;
                                break;
                        }

                        fname = strchr(propname, '@') + 1;
                        if (zfeature_lookup_name(fname, NULL) != 0) {
                                error = EINVAL;
                                break;
                        }

                        has_feature = B_TRUE;
                        break;

                case ZPOOL_PROP_VERSION:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error &&
                            (intval < spa_version(spa) ||
                            intval > SPA_VERSION_BEFORE_FEATURES ||
                            has_feature))
                                error = EINVAL;
                        break;

                case ZPOOL_PROP_DELEGATION:
                case ZPOOL_PROP_AUTOREPLACE:
                case ZPOOL_PROP_LISTSNAPS:
                case ZPOOL_PROP_AUTOEXPAND:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error && intval > 1)
                                error = EINVAL;
                        break;

                case ZPOOL_PROP_BOOTFS:
                        /*
                         * If the pool version is less than SPA_VERSION_BOOTFS,
                         * or the pool is still being created (version == 0),
                         * the bootfs property cannot be set.
                         */
                        if (spa_version(spa) < SPA_VERSION_BOOTFS) {
                                error = ENOTSUP;
                                break;
                        }

                        /*
                         * Make sure the vdev config is bootable
                         */
                        if (!vdev_is_bootable(spa->spa_root_vdev)) {
                                error = ENOTSUP;
                                break;
                        }

                        reset_bootfs = 1;

                        error = nvpair_value_string(elem, &strval);

                        if (!error) {
                                objset_t *os;
                                uint64_t compress;

                                if (strval == NULL || strval[0] == '\0') {
                                        objnum = zpool_prop_default_numeric(
                                            ZPOOL_PROP_BOOTFS);
                                        break;
                                }

                                if (error = dmu_objset_hold(strval, FTAG, &os))
                                        break;

                                /* Must be ZPL and not gzip compressed. */

                                if (dmu_objset_type(os) != DMU_OST_ZFS) {
                                        error = ENOTSUP;
                                } else if ((error = dsl_prop_get_integer(strval,
                                    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
                                    &compress, NULL)) == 0 &&
                                    !BOOTFS_COMPRESS_VALID(compress)) {
                                        error = ENOTSUP;
                                } else {
                                        objnum = dmu_objset_id(os);
                                }
                                dmu_objset_rele(os, FTAG);
                        }
                        break;

                case ZPOOL_PROP_FAILUREMODE:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
                            intval > ZIO_FAILURE_MODE_PANIC))
                                error = EINVAL;

                        /*
                         * This is a special case which only occurs when
                         * the pool has completely failed. This allows
                         * the user to change the in-core failmode property
                         * without syncing it out to disk (I/Os might
                         * currently be blocked). We do this by returning
                         * EIO to the caller (spa_prop_set) to trick it
                         * into thinking we encountered a property validation
                         * error.
                         */
                        if (!error && spa_suspended(spa)) {
                                spa->spa_failmode = intval;
                                error = EIO;
                        }
                        break;

                case ZPOOL_PROP_CACHEFILE:
                        if ((error = nvpair_value_string(elem, &strval)) != 0)
                                break;

                        if (strval[0] == '\0')
                                break;

                        if (strcmp(strval, "none") == 0)
                                break;

                        if (strval[0] != '/') {
                                error = EINVAL;
                                break;
                        }

                        slash = strrchr(strval, '/');
                        ASSERT(slash != NULL);

                        if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
                            strcmp(slash, "/..") == 0)
                                error = EINVAL;
                        break;

                case ZPOOL_PROP_COMMENT:
                        if ((error = nvpair_value_string(elem, &strval)) != 0)
                                break;
                        for (check = strval; *check != '\0'; check++) {
                                /*
                                 * The kernel doesn't have an easy isprint()
                                 * check.  For this kernel check, we merely
                                 * check ASCII apart from DEL.  Fix this if
                                 * there is an easy-to-use kernel isprint().
                                 */
                                if (*check >= 0x7f) {
                                        error = EINVAL;
                                        break;
                                }
                        }
                        if (strlen(strval) > ZPROP_MAX_COMMENT)
                                error = E2BIG;
                        break;

                case ZPOOL_PROP_DEDUPDITTO:
                        if (spa_version(spa) < SPA_VERSION_DEDUP)
                                error = ENOTSUP;
                        else
                                error = nvpair_value_uint64(elem, &intval);
                        if (error == 0 &&
                            intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
                                error = EINVAL;
                        break;
                }

                if (error)
                        break;
        }

        if (!error && reset_bootfs) {
                error = nvlist_remove(props,
                    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

                if (!error) {
                        error = nvlist_add_uint64(props,
                            zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
                }
        }

        return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
        char *cachefile;
        spa_config_dirent_t *dp;

        if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
            &cachefile) != 0)
                return;

        dp = kmem_alloc(sizeof (spa_config_dirent_t),
            KM_SLEEP);

        if (cachefile[0] == '\0')
                dp->scd_path = spa_strdup(spa_config_path);
        else if (strcmp(cachefile, "none") == 0)
                dp->scd_path = NULL;
        else
                dp->scd_path = spa_strdup(cachefile);

        list_insert_head(&spa->spa_config_list, dp);
        if (need_sync)
                spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
        int error;
        nvpair_t *elem = NULL;
        boolean_t need_sync = B_FALSE;

        if ((error = spa_prop_validate(spa, nvp)) != 0)
                return (error);

        while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
                zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

                if (prop == ZPOOL_PROP_CACHEFILE ||
                    prop == ZPOOL_PROP_ALTROOT ||
                    prop == ZPOOL_PROP_READONLY)
                        continue;

                if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
                        uint64_t ver;

                        if (prop == ZPOOL_PROP_VERSION) {
                                VERIFY(nvpair_value_uint64(elem, &ver) == 0);
                        } else {
                                ASSERT(zpool_prop_feature(nvpair_name(elem)));
                                ver = SPA_VERSION_FEATURES;
                                need_sync = B_TRUE;
                        }

                        /* Save time if the version is already set. */
                        if (ver == spa_version(spa))
                                continue;

                        /*
                         * In addition to the pool directory object, we might
                         * create the pool properties object, the features for
                         * read object, the features for write object, or the
                         * feature descriptions object.
                         */
                        error = dsl_sync_task_do(spa_get_dsl(spa), NULL,
                            spa_sync_version, spa, &ver, 6);
                        if (error)
                                return (error);
                        continue;
                }

                need_sync = B_TRUE;
                break;
        }

        if (need_sync) {
                return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
                    spa, nvp, 6));
        }

        return (0);
}
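
/*
 * Example flow (sketch): "zpool set comment=hello tank" passes
 * validation above, is neither a cachefile/altroot/readonly nor a
 * version/feature property, so need_sync is set and the nvlist is
 * written out through the spa_sync_props sync task.
 */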

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
        if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
                VERIFY(zap_remove(spa->spa_meta_objset,
                    spa->spa_pool_props_object,
                    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
                spa->spa_bootfs = 0;
        }
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        spa_t *spa = arg1;
        uint64_t *newguid = arg2;
        vdev_t *rvd = spa->spa_root_vdev;
        uint64_t vdev_state;

        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
        vdev_state = rvd->vdev_state;
        spa_config_exit(spa, SCL_STATE, FTAG);

        if (vdev_state != VDEV_STATE_HEALTHY)
                return (ENXIO);

        ASSERT3U(spa_guid(spa), !=, *newguid);

        return (0);
}

static void
spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
        spa_t *spa = arg1;
        uint64_t *newguid = arg2;
        uint64_t oldguid;
        vdev_t *rvd = spa->spa_root_vdev;

        oldguid = spa_guid(spa);

        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
        rvd->vdev_guid = *newguid;
        rvd->vdev_guid_sum += (*newguid - oldguid);
        vdev_config_dirty(rvd);
        spa_config_exit(spa, SCL_STATE, FTAG);

#ifdef __FreeBSD__
        /*
         * TODO: until recent illumos logging changes are merged
         *       log reguid as pool property change
         */
        spa_history_log_internal(LOG_POOL_PROPSET, spa, tx,
            "guid change old=%llu new=%llu", oldguid, *newguid);
#else
        spa_history_log_internal(spa, "guid change", tx, "old=%lld new=%lld",
            oldguid, *newguid);
#endif
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
        int error;
        uint64_t guid;

        mutex_enter(&spa_namespace_lock);
        guid = spa_generate_guid(NULL);

        error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check,
            spa_change_guid_sync, spa, &guid, 5);

        if (error == 0) {
                spa_config_sync(spa, B_FALSE, B_TRUE);
                spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
        }

        mutex_exit(&spa_namespace_lock);

        return (error);
}
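
/*
 * Usage note (sketch): spa_change_guid() is the in-kernel implementation
 * behind "zpool reguid <pool>".
 */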

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
        spa_error_entry_t *sa = (spa_error_entry_t *)a;
        spa_error_entry_t *sb = (spa_error_entry_t *)b;
        int ret;

        ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
            sizeof (zbookmark_t));

        if (ret < 0)
                return (-1);
        else if (ret > 0)
                return (1);
        else
                return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
        ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

        bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
        bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
}

static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
        uint_t flags = TASKQ_PREPOPULATE;
        boolean_t batch = B_FALSE;

        switch (mode) {
        case zti_mode_null:
                return (NULL);          /* no taskq needed */

        case zti_mode_fixed:
                ASSERT3U(value, >=, 1);
                value = MAX(value, 1);
                break;

        case zti_mode_batch:
                batch = B_TRUE;
                flags |= TASKQ_THREADS_CPU_PCT;
                value = zio_taskq_batch_pct;
                break;

        case zti_mode_online_percent:
                flags |= TASKQ_THREADS_CPU_PCT;
                break;

        default:
                panic("unrecognized mode for %s taskq (%u:%u) in "
                    "spa_activate()",
                    name, mode, value);
                break;
        }

#ifdef SYSDC
        if (zio_taskq_sysdc && spa->spa_proc != &p0) {
                if (batch)
                        flags |= TASKQ_DC_BATCH;

                return (taskq_create_sysdc(name, value, 50, INT_MAX,
                    spa->spa_proc, zio_taskq_basedc, flags));
        }
#endif
        return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
            spa->spa_proc, flags));
}
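
/*
 * Example (sketch): a ZTI_FIX(8) table entry arrives here as
 * (zti_mode_fixed, 8) and yields an eight-thread taskq, while ZTI_BATCH
 * sizes the taskq as a percentage of CPUs (zio_taskq_batch_pct) via
 * TASKQ_THREADS_CPU_PCT.
 */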

static void
spa_create_zio_taskqs(spa_t *spa)
{
        for (int t = 0; t < ZIO_TYPES; t++) {
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
                        const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
                        enum zti_modes mode = ztip->zti_mode;
                        uint_t value = ztip->zti_value;
                        char name[32];

                        (void) snprintf(name, sizeof (name),
                            "%s_%s", zio_type_name[t], zio_taskq_types[q]);

                        spa->spa_zio_taskq[t][q] =
                            spa_taskq_create(spa, name, mode, value);
                }
        }
}
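
/*
 * Example (sketch): with zio_type_name[] entries such as "zio_write",
 * the ZIO_TYPE_WRITE/ZIO_TASKQ_ISSUE taskq created above is named
 * "zio_write_issue".
 */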

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
        callb_cpr_t cprinfo;

        spa_t *spa = arg;
        user_t *pu = PTOU(curproc);

        CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
            spa->spa_name);

        ASSERT(curproc != &p0);
        (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
            "zpool-%s", spa->spa_name);
        (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
        /* bind this thread to the requested psrset */
        if (zio_taskq_psrset_bind != PS_NONE) {
                pool_lock();
                mutex_enter(&cpu_lock);
                mutex_enter(&pidlock);
                mutex_enter(&curproc->p_lock);

                if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
                    0, NULL, NULL) == 0)  {
                        curthread->t_bind_pset = zio_taskq_psrset_bind;
                } else {
                        cmn_err(CE_WARN,
                            "Couldn't bind process for zfs pool \"%s\" to "
                            "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
                }

                mutex_exit(&curproc->p_lock);
                mutex_exit(&pidlock);
                mutex_exit(&cpu_lock);
                pool_unlock();
        }
#endif

#ifdef SYSDC
        if (zio_taskq_sysdc) {
                sysdc_thread_enter(curthread, 100, 0);
        }
#endif

        spa->spa_proc = curproc;
        spa->spa_did = curthread->t_did;

        spa_create_zio_taskqs(spa);

        mutex_enter(&spa->spa_proc_lock);
        ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

        spa->spa_proc_state = SPA_PROC_ACTIVE;
        cv_broadcast(&spa->spa_proc_cv);

        CALLB_CPR_SAFE_BEGIN(&cprinfo);
        while (spa->spa_proc_state == SPA_PROC_ACTIVE)
                cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
        CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

        ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
        spa->spa_proc_state = SPA_PROC_GONE;
        spa->spa_proc = &p0;
        cv_broadcast(&spa->spa_proc_cv);
        CALLB_CPR_EXIT(&cprinfo);       /* drops spa_proc_lock */

        mutex_enter(&curproc->p_lock);
        lwp_exit();
}
#endif  /* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

        spa->spa_state = POOL_STATE_ACTIVE;
        spa->spa_mode = mode;

        spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
        spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

        /* Try to create a covering process */
        mutex_enter(&spa->spa_proc_lock);
        ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
        ASSERT(spa->spa_proc == &p0);
        spa->spa_did = 0;

#ifdef SPA_PROCESS
        /* Only create a process if we're going to be around a while. */
        if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
                if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
                    NULL, 0) == 0) {
                        spa->spa_proc_state = SPA_PROC_CREATED;
                        while (spa->spa_proc_state == SPA_PROC_CREATED) {
                                cv_wait(&spa->spa_proc_cv,
                                    &spa->spa_proc_lock);
                        }
                        ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
                        ASSERT(spa->spa_proc != &p0);
                        ASSERT(spa->spa_did != 0);
                } else {
#ifdef _KERNEL
                        cmn_err(CE_WARN,
                            "Couldn't create process for zfs pool \"%s\"\n",
                            spa->spa_name);
#endif
                }
        }
#endif  /* SPA_PROCESS */
        mutex_exit(&spa->spa_proc_lock);

        /* If we didn't create a process, we need to create our taskqs. */
        ASSERT(spa->spa_proc == &p0);
        if (spa->spa_proc == &p0) {
                spa_create_zio_taskqs(spa);
        }

        /*
         * Start TRIM thread.
         */
        trim_thread_create(spa);

        list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_config_dirty_node));
        list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_state_dirty_node));

        txg_list_create(&spa->spa_vdev_txg_list,
            offsetof(struct vdev, vdev_txg_node));

        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
        ASSERT(spa->spa_sync_on == B_FALSE);
        ASSERT(spa->spa_dsl_pool == NULL);
        ASSERT(spa->spa_root_vdev == NULL);
        ASSERT(spa->spa_async_zio_root == NULL);
        ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

        /*
         * Stop TRIM thread in case spa_unload() wasn't called directly
         * before spa_deactivate().
         */
        trim_thread_destroy(spa);

        txg_list_destroy(&spa->spa_vdev_txg_list);

        list_destroy(&spa->spa_config_dirty_list);
        list_destroy(&spa->spa_state_dirty_list);

        for (int t = 0; t < ZIO_TYPES; t++) {
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
                        if (spa->spa_zio_taskq[t][q] != NULL)
                                taskq_destroy(spa->spa_zio_taskq[t][q]);
                        spa->spa_zio_taskq[t][q] = NULL;
                }
        }

        metaslab_class_destroy(spa->spa_normal_class);
        spa->spa_normal_class = NULL;

        metaslab_class_destroy(spa->spa_log_class);
        spa->spa_log_class = NULL;

        /*
         * If this was part of an import or the open otherwise failed, we may
         * still have errors left in the queues.  Empty them just in case.
         */
        spa_errlog_drain(spa);

        avl_destroy(&spa->spa_errlist_scrub);
        avl_destroy(&spa->spa_errlist_last);

        spa->spa_state = POOL_STATE_UNINITIALIZED;

        mutex_enter(&spa->spa_proc_lock);
        if (spa->spa_proc_state != SPA_PROC_NONE) {
                ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
                spa->spa_proc_state = SPA_PROC_DEACTIVATE;
                cv_broadcast(&spa->spa_proc_cv);
                while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
                        ASSERT(spa->spa_proc != &p0);
                        cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
                }
                ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
                spa->spa_proc_state = SPA_PROC_NONE;
        }
        ASSERT(spa->spa_proc == &p0);
        mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
        /*
         * We want to make sure spa_thread() has actually exited the ZFS
         * module, so that the module can't be unloaded out from underneath
         * it.
         */
        if (spa->spa_did != 0) {
                thread_join(spa->spa_did);
                spa->spa_did = 0;
        }
#endif  /* SPA_PROCESS */
}

/*
 * Verify a pool configuration and construct the vdev tree appropriately.
 * This recurses over any ZPOOL_CONFIG_CHILDREN entries, creating every
 * vdev in the given layout.  On success *vdp is set to the root of the
 * tree; on failure it is set to NULL and an error is returned.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
        nvlist_t **child;
        uint_t children;
        int error;

        if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
                return (error);

        if ((*vdp)->vdev_ops->vdev_op_leaf)
                return (0);

        error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
            &child, &children);

        if (error == ENOENT)
                return (0);

        if (error) {
                vdev_free(*vdp);
                *vdp = NULL;
                return (EINVAL);
        }

        for (int c = 0; c < children; c++) {
                vdev_t *vd;
                if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
                    atype)) != 0) {
                        vdev_free(*vdp);
                        *vdp = NULL;
                        return (error);
                }
        }

        ASSERT(*vdp != NULL);

        return (0);
}
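
/*
 * Example (sketch): for a config describing
 *
 *	root
 *	  mirror-0
 *	    disk (da0)
 *	    disk (da1)
 *
 * spa_config_parse() allocates the root and mirror interior vdevs and
 * recurses once per ZPOOL_CONFIG_CHILDREN entry to build the leaves.
 */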

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
        int i;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        /*
         * Stop TRIM thread.
         */
        trim_thread_destroy(spa);

        /*
         * Stop async tasks.
         */
        spa_async_suspend(spa);

        /*
         * Stop syncing.
         */
        if (spa->spa_sync_on) {
                txg_sync_stop(spa->spa_dsl_pool);
                spa->spa_sync_on = B_FALSE;
        }

        /*
         * Wait for any outstanding async I/O to complete.
         */
        if (spa->spa_async_zio_root != NULL) {
                (void) zio_wait(spa->spa_async_zio_root);
                spa->spa_async_zio_root = NULL;
        }

        bpobj_close(&spa->spa_deferred_bpobj);

        /*
         * Close the dsl pool.
         */
        if (spa->spa_dsl_pool) {
                dsl_pool_close(spa->spa_dsl_pool);
                spa->spa_dsl_pool = NULL;
                spa->spa_meta_objset = NULL;
        }

        ddt_unload(spa);

        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

        /*
         * Drop and purge level 2 cache
         */
        spa_l2cache_drop(spa);

        /*
         * Close all vdevs.
         */
        if (spa->spa_root_vdev)
                vdev_free(spa->spa_root_vdev);
        ASSERT(spa->spa_root_vdev == NULL);

        for (i = 0; i < spa->spa_spares.sav_count; i++)
                vdev_free(spa->spa_spares.sav_vdevs[i]);
        if (spa->spa_spares.sav_vdevs) {
                kmem_free(spa->spa_spares.sav_vdevs,
                    spa->spa_spares.sav_count * sizeof (void *));
                spa->spa_spares.sav_vdevs = NULL;
        }
        if (spa->spa_spares.sav_config) {
                nvlist_free(spa->spa_spares.sav_config);
                spa->spa_spares.sav_config = NULL;
        }
        spa->spa_spares.sav_count = 0;

        for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
                vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
                vdev_free(spa->spa_l2cache.sav_vdevs[i]);
        }
        if (spa->spa_l2cache.sav_vdevs) {
                kmem_free(spa->spa_l2cache.sav_vdevs,
                    spa->spa_l2cache.sav_count * sizeof (void *));
                spa->spa_l2cache.sav_vdevs = NULL;
        }
        if (spa->spa_l2cache.sav_config) {
                nvlist_free(spa->spa_l2cache.sav_config);
                spa->spa_l2cache.sav_config = NULL;
        }
        spa->spa_l2cache.sav_count = 0;

        spa->spa_async_suspended = 0;

        if (spa->spa_comment != NULL) {
                spa_strfree(spa->spa_comment);
                spa->spa_comment = NULL;
        }

        spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
01278         nvlist_t **spares;
01279         uint_t nspares;
01280         int i;
01281         vdev_t *vd, *tvd;
01282 
01283         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
01284 
01285         /*
01286          * First, close and free any existing spare vdevs.
01287          */
01288         for (i = 0; i < spa->spa_spares.sav_count; i++) {
01289                 vd = spa->spa_spares.sav_vdevs[i];
01290 
01291                 /* Undo the call to spa_activate() below */
01292                 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
01293                     B_FALSE)) != NULL && tvd->vdev_isspare)
01294                         spa_spare_remove(tvd);
01295                 vdev_close(vd);
01296                 vdev_free(vd);
01297         }
01298 
01299         if (spa->spa_spares.sav_vdevs)
01300                 kmem_free(spa->spa_spares.sav_vdevs,
01301                     spa->spa_spares.sav_count * sizeof (void *));
01302 
01303         if (spa->spa_spares.sav_config == NULL)
01304                 nspares = 0;
01305         else
01306                 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
01307                     ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
01308 
01309         spa->spa_spares.sav_count = (int)nspares;
01310         spa->spa_spares.sav_vdevs = NULL;
01311 
01312         if (nspares == 0)
01313                 return;
01314 
01315         /*
01316          * Construct the array of vdevs, opening them to get status in the
01317          * process.   For each spare, there is potentially two different vdev_t
01318          * structures associated with it: one in the list of spares (used only
01319          * for basic validation purposes) and one in the active vdev
01320          * configuration (if it's spared in).  During this phase we open and
01321          * validate each vdev on the spare list.  If the vdev also exists in the
01322          * active configuration, then we also mark this vdev as an active spare.
01323          */
01324         spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
01325             KM_SLEEP);
01326         for (i = 0; i < spa->spa_spares.sav_count; i++) {
01327                 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
01328                     VDEV_ALLOC_SPARE) == 0);
01329                 ASSERT(vd != NULL);
01330 
01331                 spa->spa_spares.sav_vdevs[i] = vd;
01332 
01333                 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
01334                     B_FALSE)) != NULL) {
01335                         if (!tvd->vdev_isspare)
01336                                 spa_spare_add(tvd);
01337 
01338                         /*
01339                          * We only mark the spare active if we were successfully
01340                          * able to load the vdev.  Otherwise, importing a pool
01341                          * with a bad active spare would result in strange
01342                          * behavior, because multiple pool would think the spare
01343                          * is actively in use.
01344                          *
01345                          * There is a vulnerability here to an equally bizarre
01346                          * circumstance, where a dead active spare is later
01347                          * brought back to life (onlined or otherwise).  Given
01348                          * the rarity of this scenario, and the extra complexity
01349                          * it adds, we ignore the possibility.
01350                          */
01351                         if (!vdev_is_dead(tvd))
01352                                 spa_spare_activate(tvd);
01353                 }
01354 
01355                 vd->vdev_top = vd;
01356                 vd->vdev_aux = &spa->spa_spares;
01357 
01358                 if (vdev_open(vd) != 0)
01359                         continue;
01360 
01361                 if (vdev_validate_aux(vd) == 0)
01362                         spa_spare_add(vd);
01363         }
01364 
01365         /*
01366          * Recompute the stashed list of spares, with status information
01367          * this time.
01368          */
01369         VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
01370             DATA_TYPE_NVLIST_ARRAY) == 0);
01371 
01372         spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
01373             KM_SLEEP);
01374         for (i = 0; i < spa->spa_spares.sav_count; i++)
01375                 spares[i] = vdev_config_generate(spa,
01376                     spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
01377         VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
01378             ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
01379         for (i = 0; i < spa->spa_spares.sav_count; i++)
01380                 nvlist_free(spares[i]);
01381         kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
01382 }

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
01395         nvlist_t **l2cache;
01396         uint_t nl2cache;
01397         int i, j, oldnvdevs;
01398         uint64_t guid;
01399         vdev_t *vd, **oldvdevs, **newvdevs;
01400         spa_aux_vdev_t *sav = &spa->spa_l2cache;
01401 
01402         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
01403 
01404         if (sav->sav_config != NULL) {
01405                 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
01406                     ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
01407                 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
01408         } else {
01409                 nl2cache = 0;
01410         }
01411 
01412         oldvdevs = sav->sav_vdevs;
01413         oldnvdevs = sav->sav_count;
01414         sav->sav_vdevs = NULL;
01415         sav->sav_count = 0;
01416 
01417         /*
01418          * Process new nvlist of vdevs.
01419          */
01420         for (i = 0; i < nl2cache; i++) {
01421                 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
01422                     &guid) == 0);
01423 
01424                 newvdevs[i] = NULL;
01425                 for (j = 0; j < oldnvdevs; j++) {
01426                         vd = oldvdevs[j];
01427                         if (vd != NULL && guid == vd->vdev_guid) {
01428                                 /*
01429                                  * Retain previous vdev for add/remove ops.
01430                                  */
01431                                 newvdevs[i] = vd;
01432                                 oldvdevs[j] = NULL;
01433                                 break;
01434                         }
01435                 }
01436 
01437                 if (newvdevs[i] == NULL) {
01438                         /*
01439                          * Create new vdev
01440                          */
01441                         VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
01442                             VDEV_ALLOC_L2CACHE) == 0);
01443                         ASSERT(vd != NULL);
01444                         newvdevs[i] = vd;
01445 
01446                         /*
01447                          * Commit this vdev as an l2cache device,
01448                          * even if it fails to open.
01449                          */
01450                         spa_l2cache_add(vd);
01451 
01452                         vd->vdev_top = vd;
01453                         vd->vdev_aux = sav;
01454 
01455                         spa_l2cache_activate(vd);
01456 
01457                         if (vdev_open(vd) != 0)
01458                                 continue;
01459 
01460                         (void) vdev_validate_aux(vd);
01461 
01462                         if (!vdev_is_dead(vd))
01463                                 l2arc_add_vdev(spa, vd);
01464                 }
01465         }
01466 
01467         /*
01468          * Purge vdevs that were dropped
01469          */
01470         for (i = 0; i < oldnvdevs; i++) {
01471                 uint64_t pool;
01472 
01473                 vd = oldvdevs[i];
01474                 if (vd != NULL) {
01475                         ASSERT(vd->vdev_isl2cache);
01476 
01477                         if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
01478                             pool != 0ULL && l2arc_vdev_present(vd))
01479                                 l2arc_remove_vdev(vd);
01480                         vdev_clear_stats(vd);
01481                         vdev_free(vd);
01482                 }
01483         }
01484 
01485         if (oldvdevs)
01486                 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
01487 
01488         if (sav->sav_config == NULL)
01489                 goto out;
01490 
01491         sav->sav_vdevs = newvdevs;
01492         sav->sav_count = (int)nl2cache;
01493 
01494         /*
01495          * Recompute the stashed list of l2cache devices, with status
01496          * information this time.
01497          */
01498         VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
01499             DATA_TYPE_NVLIST_ARRAY) == 0);
01500 
01501         l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
01502         for (i = 0; i < sav->sav_count; i++)
01503                 l2cache[i] = vdev_config_generate(spa,
01504                     sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
01505         VERIFY(nvlist_add_nvlist_array(sav->sav_config,
01506             ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
01507 out:
01508         for (i = 0; i < sav->sav_count; i++)
01509                 nvlist_free(l2cache[i]);
01510         if (sav->sav_count)
01511                 kmem_free(l2cache, sav->sav_count * sizeof (void *));
01512 }
01513 
01514 static int
01515 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
01516 {
01517         dmu_buf_t *db;
01518         char *packed = NULL;
01519         size_t nvsize = 0;
01520         int error;
01521         *value = NULL;
01522 
01523         VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
01524         nvsize = *(uint64_t *)db->db_data;
01525         dmu_buf_rele(db, FTAG);
01526 
01527         packed = kmem_alloc(nvsize, KM_SLEEP);
01528         error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
01529             DMU_READ_PREFETCH);
01530         if (error == 0)
01531                 error = nvlist_unpack(packed, nvsize, value, 0);
01532         kmem_free(packed, nvsize);
01533 
01534         return (error);
01535 }
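
/*
 * load_nvlist() relies on the DMU packed-nvlist convention: the
 * object's bonus buffer holds a single uint64_t giving the packed
 * size, and the object data holds the XDR-encoded nvlist itself.
 * A minimal sketch of the matching write side, assuming 'nv', 'obj'
 * and 'tx' are in scope and the bonus buffer is updated separately
 * (error handling elided):
 *
 *      size_t nvsize;
 *      char *packed;
 *
 *      VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
 *      packed = kmem_alloc(nvsize, KM_SLEEP);
 *      VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
 *          KM_SLEEP) == 0);
 *      dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
 *      kmem_free(packed, nvsize);
 */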
01536 
01541 static void
01542 spa_check_removed(vdev_t *vd)
01543 {
01544         for (int c = 0; c < vd->vdev_children; c++)
01545                 spa_check_removed(vd->vdev_child[c]);
01546 
01547         if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
01548                 zfs_post_autoreplace(vd->vdev_spa, vd);
01549                 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
01550         }
01551 }
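
/*
 * spa_check_removed() uses the vdev-tree walk that recurs throughout
 * this file: descend into vdev_child[0..vdev_children) first, then act
 * on the node itself (here, only on leaves).  A generic walker in the
 * same shape, with 'func' as a hypothetical per-vdev callback:
 *
 *      static void
 *      vdev_walk(vdev_t *vd, void (*func)(vdev_t *))
 *      {
 *              for (int c = 0; c < vd->vdev_children; c++)
 *                      vdev_walk(vd->vdev_child[c], func);
 *              func(vd);
 *      }
 */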
01552 
01556 static boolean_t
01557 spa_config_valid(spa_t *spa, nvlist_t *config)
01558 {
01559         vdev_t *mrvd, *rvd = spa->spa_root_vdev;
01560         nvlist_t *nv;
01561 
01562         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
01563 
01564         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
01565         VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
01566 
01567         ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
01568 
01569         /*
01570          * If we're doing a normal import, then build up any additional
01571          * diagnostic information about missing devices in this config.
01572          * We'll pass this up to the user for further processing.
01573          */
01574         if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
01575                 nvlist_t **child, *nv;
01576                 uint64_t idx = 0;
01577 
01578                 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
01579                     KM_SLEEP);
01580                 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
01581 
01582                 for (int c = 0; c < rvd->vdev_children; c++) {
01583                         vdev_t *tvd = rvd->vdev_child[c];
01584                         vdev_t *mtvd  = mrvd->vdev_child[c];
01585 
01586                         if (tvd->vdev_ops == &vdev_missing_ops &&
01587                             mtvd->vdev_ops != &vdev_missing_ops &&
01588                             mtvd->vdev_islog)
01589                                 child[idx++] = vdev_config_generate(spa, mtvd,
01590                                     B_FALSE, 0);
01591                 }
01592 
01593                 if (idx) {
01594                         VERIFY(nvlist_add_nvlist_array(nv,
01595                             ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
01596                         VERIFY(nvlist_add_nvlist(spa->spa_load_info,
01597                             ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
01598 
01599                         for (int i = 0; i < idx; i++)
01600                                 nvlist_free(child[i]);
01601                 }
01602                 nvlist_free(nv);
01603                 kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
01604         }
01605 
01606         /*
01607          * Compare the root vdev tree with the information we have
01608          * from the MOS config (mrvd). Check each top-level vdev
01609          * with the corresponding MOS config top-level (mtvd).
01610          */
01611         for (int c = 0; c < rvd->vdev_children; c++) {
01612                 vdev_t *tvd = rvd->vdev_child[c];
01613                 vdev_t *mtvd  = mrvd->vdev_child[c];
01614 
01615                 /*
01616                  * Resolve any "missing" vdevs in the current configuration.
01617                  * If we find that the MOS config has more accurate information
01618                  * about the top-level vdev, then use that vdev instead.
01619                  */
01620                 if (tvd->vdev_ops == &vdev_missing_ops &&
01621                     mtvd->vdev_ops != &vdev_missing_ops) {
01622 
01623                         if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
01624                                 continue;
01625 
01626                         /*
01627                          * Device specific actions.
01628                          */
01629                         if (mtvd->vdev_islog) {
01630                                 spa_set_log_state(spa, SPA_LOG_CLEAR);
01631                         } else {
01632                                 /*
01633                                  * XXX - once we have 'readonly' pool
01634                                  * support we should be able to handle
01635                                  * missing data devices by transitioning
01636                                  * the pool to readonly.
01637                                  */
01638                                 continue;
01639                         }
01640 
01641                         /*
01642                          * Swap the missing vdev with the data we were
01643                          * able to obtain from the MOS config.
01644                          */
01645                         vdev_remove_child(rvd, tvd);
01646                         vdev_remove_child(mrvd, mtvd);
01647 
01648                         vdev_add_child(rvd, mtvd);
01649                         vdev_add_child(mrvd, tvd);
01650 
01651                         spa_config_exit(spa, SCL_ALL, FTAG);
01652                         vdev_load(mtvd);
01653                         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
01654 
01655                         vdev_reopen(rvd);
01656                 } else if (mtvd->vdev_islog) {
01657                         /*
01658                          * Load the slog device's state from the MOS config
01659                          * since it's possible that the label does not
01660                          * contain the most up-to-date information.
01661                          */
01662                         vdev_load_log_state(tvd, mtvd);
01663                         vdev_reopen(tvd);
01664                 }
01665         }
01666         vdev_free(mrvd);
01667         spa_config_exit(spa, SCL_ALL, FTAG);
01668 
01669         /*
01670          * Ensure we were able to validate the config.
01671          */
01672         return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
01673 }
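
/*
 * The guid-sum test above works because every vdev contributes its
 * guid to vdev_guid_sum, accumulated bottom-up over the whole tree,
 * and each uberblock records the same total (ub_guid_sum) as of the
 * txg it was written.  A mismatch therefore means the set of vdevs we
 * managed to open is not the set the pool was last synced with, i.e.
 * the configuration is incomplete.
 */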
01674 
01678 static int
01679 spa_check_logs(spa_t *spa)
01680 {
01681         switch (spa->spa_log_state) {
01682         case SPA_LOG_MISSING:
01683                 /* need to recheck in case slog has been restored */
01684         case SPA_LOG_UNKNOWN:
01685                 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
01686                     DS_FIND_CHILDREN)) {
01687                         spa_set_log_state(spa, SPA_LOG_MISSING);
01688                         return (1);
01689                 }
01690                 break;
01691         }
01692         return (0);
01693 }
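
/*
 * dmu_objset_find() runs a callback over datasets in the pool
 * (DS_FIND_CHILDREN descends into children) and returns nonzero if
 * any callback fails.  The callbacks used in this file all share one
 * shape; a sketch with 'my_cb' as a hypothetical stand-in for
 * zil_check_log_chain(), zil_vdev_offline() or zil_claim():
 *
 *      static int
 *      my_cb(const char *dsname, void *arg)
 *      {
 *              return (0);     (0 = keep going, nonzero = abort)
 *      }
 *
 *      error = dmu_objset_find(spa_name(spa), my_cb, arg,
 *          DS_FIND_CHILDREN);
 */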
01694 
01695 static boolean_t
01696 spa_passivate_log(spa_t *spa)
01697 {
01698         vdev_t *rvd = spa->spa_root_vdev;
01699         boolean_t slog_found = B_FALSE;
01700 
01701         ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
01702 
01703         if (!spa_has_slogs(spa))
01704                 return (B_FALSE);
01705 
01706         for (int c = 0; c < rvd->vdev_children; c++) {
01707                 vdev_t *tvd = rvd->vdev_child[c];
01708                 metaslab_group_t *mg = tvd->vdev_mg;
01709 
01710                 if (tvd->vdev_islog) {
01711                         metaslab_group_passivate(mg);
01712                         slog_found = B_TRUE;
01713                 }
01714         }
01715 
01716         return (slog_found);
01717 }
01718 
01719 static void
01720 spa_activate_log(spa_t *spa)
01721 {
01722         vdev_t *rvd = spa->spa_root_vdev;
01723 
01724         ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
01725 
01726         for (int c = 0; c < rvd->vdev_children; c++) {
01727                 vdev_t *tvd = rvd->vdev_child[c];
01728                 metaslab_group_t *mg = tvd->vdev_mg;
01729 
01730                 if (tvd->vdev_islog)
01731                         metaslab_group_activate(mg);
01732         }
01733 }
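
/*
 * Passivating a metaslab group removes it from its metaslab class's
 * allocation rotor, so no new blocks land on the device while existing
 * blocks stay readable; activation puts it back.  This pair is how log
 * devices are fenced off while "zpool remove" of a slog is in
 * progress, and restored if the removal fails.
 */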
01734 
01735 int
01736 spa_offline_log(spa_t *spa)
01737 {
01738         int error = 0;
01739 
01740         if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
01741             NULL, DS_FIND_CHILDREN)) == 0) {
01742 
01743                 /*
01744                  * We successfully offlined the log device, sync out the
01745                  * current txg so that the "stubby" block can be removed
01746                  * by zil_sync().
01747                  */
01748                 txg_wait_synced(spa->spa_dsl_pool, 0);
01749         }
01750         return (error);
01751 }
01752 
01753 static void
01754 spa_aux_check_removed(spa_aux_vdev_t *sav)
01755 {
01756         int i;
01757 
01758         for (i = 0; i < sav->sav_count; i++)
01759                 spa_check_removed(sav->sav_vdevs[i]);
01760 }
01761 
01765 void
01766 spa_claim_notify(zio_t *zio)
01767 {
01768         spa_t *spa = zio->io_spa;
01769 
01770         if (zio->io_error)
01771                 return;
01772 
01773         mutex_enter(&spa->spa_props_lock);      /* any mutex will do */
01774         if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
01775                 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
01776         mutex_exit(&spa->spa_props_lock);
01777 }
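
/*
 * The props lock is borrowed above only to make the read-compare-store
 * of spa_claim_max_txg atomic; no property state is involved, hence
 * "any mutex will do".  The value accumulated here is consumed at the
 * end of spa_load_impl(), where txg_wait_synced() waits up to the
 * highest claimed log-block birth txg.
 */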
01778 
01779 typedef struct spa_load_error {
01780         uint64_t        sle_meta_count;
01781         uint64_t        sle_data_count;
01782 } spa_load_error_t;
01783 
01784 static void
01785 spa_load_verify_done(zio_t *zio)
01786 {
01787         blkptr_t *bp = zio->io_bp;
01788         spa_load_error_t *sle = zio->io_private;
01789         dmu_object_type_t type = BP_GET_TYPE(bp);
01790         int error = zio->io_error;
01791 
01792         if (error) {
01793                 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
01794                     type != DMU_OT_INTENT_LOG)
01795                         atomic_add_64(&sle->sle_meta_count, 1);
01796                 else
01797                         atomic_add_64(&sle->sle_data_count, 1);
01798         }
01799         zio_data_buf_free(zio->io_data, zio->io_size);
01800 }
01801 
01802 /*ARGSUSED*/
01803 static int
01804 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
01805     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
01806 {
01807         if (bp != NULL) {
01808                 zio_t *rio = arg;
01809                 size_t size = BP_GET_PSIZE(bp);
01810                 void *data = zio_data_buf_alloc(size);
01811 
01812                 zio_nowait(zio_read(rio, spa, bp, data, size,
01813                     spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
01814                     ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
01815                     ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
01816         }
01817         return (0);
01818 }
01819 
01820 static int
01821 spa_load_verify(spa_t *spa)
01822 {
01823         zio_t *rio;
01824         spa_load_error_t sle = { 0 };
01825         zpool_rewind_policy_t policy;
01826         boolean_t verify_ok = B_FALSE;
01827         int error;
01828 
01829         zpool_get_rewind_policy(spa->spa_config, &policy);
01830 
01831         if (policy.zrp_request & ZPOOL_NEVER_REWIND)
01832                 return (0);
01833 
01834         rio = zio_root(spa, NULL, &sle,
01835             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
01836 
01837         error = traverse_pool(spa, spa->spa_verify_min_txg,
01838             TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
01839 
01840         (void) zio_wait(rio);
01841 
01842         spa->spa_load_meta_errors = sle.sle_meta_count;
01843         spa->spa_load_data_errors = sle.sle_data_count;
01844 
01845         if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
01846             sle.sle_data_count <= policy.zrp_maxdata) {
01847                 int64_t loss = 0;
01848 
01849                 verify_ok = B_TRUE;
01850                 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
01851                 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
01852 
01853                 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
01854                 VERIFY(nvlist_add_uint64(spa->spa_load_info,
01855                     ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
01856                 VERIFY(nvlist_add_int64(spa->spa_load_info,
01857                     ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
01858                 VERIFY(nvlist_add_uint64(spa->spa_load_info,
01859                     ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
01860         } else {
01861                 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
01862         }
01863 
01864         if (error) {
01865                 if (error != ENXIO && error != EIO)
01866                         error = EIO;
01867                 return (error);
01868         }
01869 
01870         return (verify_ok ? 0 : EIO);
01871 }
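
/*
 * The thresholds above come from the rewind policy: verification
 * passes only if the traversal itself succeeded and the error tallies
 * stay within zrp_maxmeta / zrp_maxdata.  For example, a policy of
 * { zrp_maxmeta = 0, zrp_maxdata = 0 } fails the verify on a single
 * unreadable block, while zrp_maxdata = UINT64_MAX tolerates lost file
 * data so long as all metadata reads back intact.
 */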
01872 
01876 static void
01877 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
01878 {
01879         (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
01880             zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
01881 }
01882 
01886 static int
01887 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
01888 {
01889         return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
01890             name, sizeof (uint64_t), 1, val));
01891 }
01892 
01893 static int
01894 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
01895 {
01896         vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
01897         return (err);
01898 }
01899 
01916 static void
01917 spa_try_repair(spa_t *spa, nvlist_t *config)
01918 {
01919         uint_t extracted;
01920         uint64_t *glist;
01921         uint_t i, gcount;
01922         nvlist_t *nvl;
01923         vdev_t **vd;
01924         boolean_t attempt_reopen;
01925 
01926         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
01927                 return;
01928 
01929         /* check that the config is complete */
01930         if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
01931             &glist, &gcount) != 0)
01932                 return;
01933 
01934         vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
01935 
01936         /* attempt to online all the vdevs & validate */
01937         attempt_reopen = B_TRUE;
01938         for (i = 0; i < gcount; i++) {
01939                 if (glist[i] == 0)      /* vdev is hole */
01940                         continue;
01941 
01942                 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
01943                 if (vd[i] == NULL) {
01944                         /*
01945                          * Don't bother attempting to reopen the disks;
01946                          * just do the split.
01947                          */
01948                         attempt_reopen = B_FALSE;
01949                 } else {
01950                         /* attempt to re-online it */
01951                         vd[i]->vdev_offline = B_FALSE;
01952                 }
01953         }
01954 
01955         if (attempt_reopen) {
01956                 vdev_reopen(spa->spa_root_vdev);
01957 
01958                 /* check each device to see what state it's in */
01959                 for (extracted = 0, i = 0; i < gcount; i++) {
01960                         if (vd[i] != NULL &&
01961                             vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
01962                                 break;
01963                         ++extracted;
01964                 }
01965         }
01966 
01967         /*
01968          * If every disk has been moved to the new pool, or if we never
01969          * even attempted to look at them, then we split them off for
01970          * good.
01971          */
01972         if (!attempt_reopen || gcount == extracted) {
01973                 for (i = 0; i < gcount; i++)
01974                         if (vd[i] != NULL)
01975                                 vdev_split(vd[i]);
01976                 vdev_reopen(spa->spa_root_vdev);
01977         }
01978 
01979         kmem_free(vd, gcount * sizeof (vdev_t *));
01980 }
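
/*
 * spa_try_repair() above cleans up after an interrupted "zpool split":
 * ZPOOL_CONFIG_SPLIT in the config lists the guids of the vdevs that
 * were being split off.  If, after the reopen, every such vdev is
 * either gone or marked VDEV_AUX_SPLIT_POOL, the split evidently
 * completed and the vdevs are detached for good via vdev_split();
 * otherwise they are left online so the original pool stays whole.
 */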
01981 
01982 static int
01983 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
01984     boolean_t mosconfig)
01985 {
01986         nvlist_t *config = spa->spa_config;
01987         char *ereport = FM_EREPORT_ZFS_POOL;
01988         char *comment;
01989         int error;
01990         uint64_t pool_guid;
01991         nvlist_t *nvl;
01992 
01993         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
01994                 return (EINVAL);
01995 
01996         ASSERT(spa->spa_comment == NULL);
01997         if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
01998                 spa->spa_comment = spa_strdup(comment);
01999 
02000         /*
02001          * Versioning wasn't explicitly added to the label until later, so if
02002          * it's not present, treat it as the initial version.
02003          */
02004         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
02005             &spa->spa_ubsync.ub_version) != 0)
02006                 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
02007 
02008         (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
02009             &spa->spa_config_txg);
02010 
02011         if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
02012             spa_guid_exists(pool_guid, 0)) {
02013                 error = EEXIST;
02014         } else {
02015                 spa->spa_config_guid = pool_guid;
02016 
02017                 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
02018                     &nvl) == 0) {
02019                         VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
02020                             KM_SLEEP) == 0);
02021                 }
02022 
02023                 nvlist_free(spa->spa_load_info);
02024                 spa->spa_load_info = fnvlist_alloc();
02025 
02026                 gethrestime(&spa->spa_loaded_ts);
02027                 error = spa_load_impl(spa, pool_guid, config, state, type,
02028                     mosconfig, &ereport);
02029         }
02030 
02031         spa->spa_minref = refcount_count(&spa->spa_refcount);
02032         if (error) {
02033                 if (error != EEXIST) {
02034                         spa->spa_loaded_ts.tv_sec = 0;
02035                         spa->spa_loaded_ts.tv_nsec = 0;
02036                 }
02037                 if (error != EBADF) {
02038                         zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
02039                 }
02040         }
02041         spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
02042         spa->spa_ena = 0;
02043 
02044         return (error);
02045 }
02046 
02051 static int
02052 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
02053     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
02054     char **ereport)
02055 {
02056         int error = 0;
02057         nvlist_t *nvroot = NULL;
02058         nvlist_t *label;
02059         vdev_t *rvd;
02060         uberblock_t *ub = &spa->spa_uberblock;
02061         uint64_t children, config_cache_txg = spa->spa_config_txg;
02062         int orig_mode = spa->spa_mode;
02063         int parse;
02064         uint64_t obj;
02065         boolean_t missing_feat_write = B_FALSE;
02066 
02067         /*
02068          * If this is an untrusted config, access the pool in read-only mode.
02069          * This prevents things like resilvering recently removed devices.
02070          */
02071         if (!mosconfig)
02072                 spa->spa_mode = FREAD;
02073 
02074         ASSERT(MUTEX_HELD(&spa_namespace_lock));
02075 
02076         spa->spa_load_state = state;
02077 
02078         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
02079                 return (EINVAL);
02080 
02081         parse = (type == SPA_IMPORT_EXISTING ?
02082             VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
02083 
02084         /*
02085          * Create "The Godfather" zio to hold all async IOs
02086          */
02087         spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
02088             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
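
        /*
         * "Godfather" here means a long-lived root zio that async
         * child I/Os (device probes and the like) are attached to.
         * Since every such child then has a parent known to outlive
         * it, spa_unload() can simply zio_wait() on this single zio
         * to be certain no async I/O is still in flight when the pool
         * goes away.
         */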
02089 
02090         /*
02091          * Parse the configuration into a vdev tree.  We explicitly set the
02092          * value that will be returned by spa_version() since parsing the
02093          * configuration requires knowing the version number.
02094          */
02095         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
02096         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
02097         spa_config_exit(spa, SCL_ALL, FTAG);
02098 
02099         if (error != 0)
02100                 return (error);
02101 
02102         ASSERT(spa->spa_root_vdev == rvd);
02103 
02104         if (type != SPA_IMPORT_ASSEMBLE) {
02105                 ASSERT(spa_guid(spa) == pool_guid);
02106         }
02107 
02108         /*
02109          * Try to open all vdevs, loading each label in the process.
02110          */
02111         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
02112         error = vdev_open(rvd);
02113         spa_config_exit(spa, SCL_ALL, FTAG);
02114         if (error != 0)
02115                 return (error);
02116 
02117         /*
02118          * We need to validate the vdev labels against the configuration that
02119          * we have in hand, which is dependent on the setting of mosconfig. If
02120          * mosconfig is true then we're validating the vdev labels based on
02121          * that config.  Otherwise, we're validating against the cached config
02122          * (zpool.cache) that was read when we loaded the zfs module, and then
02123          * later we will recursively call spa_load() and validate against
02124          * the vdev config.
02125          *
02126          * If we're assembling a new pool that's been split off from an
02127          * existing pool, the labels haven't yet been updated so we skip
02128          * validation for now.
02129          */
02130         if (type != SPA_IMPORT_ASSEMBLE) {
02131                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
02132                 error = vdev_validate(rvd, mosconfig);
02133                 spa_config_exit(spa, SCL_ALL, FTAG);
02134 
02135                 if (error != 0)
02136                         return (error);
02137 
02138                 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
02139                         return (ENXIO);
02140         }
02141 
02142         /*
02143          * Find the best uberblock.
02144          */
02145         vdev_uberblock_load(rvd, ub, &label);
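
        /*
         * "Best" is decided by vdev_uberblock_compare(): the highest
         * ub_txg wins, with ub_timestamp as the tie-breaker, so this
         * comes back with the most recent state still readable from
         * some label.
         */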
02146 
02147         /*
02148          * If we weren't able to find a single valid uberblock, return failure.
02149          */
02150         if (ub->ub_txg == 0) {
02151                 nvlist_free(label);
02152                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
02153         }
02154 
02155         /*
02156          * If the pool has an unsupported version we can't open it.
02157          */
02158         if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
02159                 nvlist_free(label);
02160                 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
02161         }
02162 
02163         if (ub->ub_version >= SPA_VERSION_FEATURES) {
02164                 nvlist_t *features;
02165 
02166                 /*
02167                  * If we weren't able to find what's necessary for reading the
02168                  * MOS in the label, return failure.
02169                  */
02170                 if (label == NULL || nvlist_lookup_nvlist(label,
02171                     ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
02172                         nvlist_free(label);
02173                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
02174                             ENXIO));
02175                 }
02176 
02177                 /*
02178                  * Update our in-core representation with the definitive values
02179                  * from the label.
02180                  */
02181                 nvlist_free(spa->spa_label_features);
02182                 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
02183         }
02184 
02185         nvlist_free(label);
02186 
02187         /*
02188          * Look through entries in the label nvlist's features_for_read. If
02189          * there is a feature listed there which we don't understand then we
02190          * cannot open a pool.
02191          */
02192         if (ub->ub_version >= SPA_VERSION_FEATURES) {
02193                 nvlist_t *unsup_feat;
02194 
02195                 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
02196                     0);
02197 
02198                 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
02199                     NULL); nvp != NULL;
02200                     nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
02201                         if (!zfeature_is_supported(nvpair_name(nvp))) {
02202                                 VERIFY(nvlist_add_string(unsup_feat,
02203                                     nvpair_name(nvp), "") == 0);
02204                         }
02205                 }
02206 
02207                 if (!nvlist_empty(unsup_feat)) {
02208                         VERIFY(nvlist_add_nvlist(spa->spa_load_info,
02209                             ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
02210                         nvlist_free(unsup_feat);
02211                         return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
02212                             ENOTSUP));
02213                 }
02214 
02215                 nvlist_free(unsup_feat);
02216         }
02217 
02218         /*
02219          * If the vdev guid sum doesn't match the uberblock, we have an
02220          * incomplete configuration.  We first check to see if the pool
02221          * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
02222          * If it is, defer the vdev_guid_sum check until later so we
02223          * can handle missing vdevs.
02224          */
02225         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
02226             &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
02227             rvd->vdev_guid_sum != ub->ub_guid_sum)
02228                 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
02229 
02230         if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
02231                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
02232                 spa_try_repair(spa, config);
02233                 spa_config_exit(spa, SCL_ALL, FTAG);
02234                 nvlist_free(spa->spa_config_splitting);
02235                 spa->spa_config_splitting = NULL;
02236         }
02237 
02238         /*
02239          * Initialize internal SPA structures.
02240          */
02241         spa->spa_state = POOL_STATE_ACTIVE;
02242         spa->spa_ubsync = spa->spa_uberblock;
02243         spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
02244             TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
02245         spa->spa_first_txg = spa->spa_last_ubsync_txg ?
02246             spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
02247         spa->spa_claim_max_txg = spa->spa_first_txg;
02248         spa->spa_prev_software_version = ub->ub_software_version;
02249 
02250         error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
02251         if (error)
02252                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02253         spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
02254 
02255         if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
02256                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02257 
02258         if (spa_version(spa) >= SPA_VERSION_FEATURES) {
02259                 boolean_t missing_feat_read = B_FALSE;
02260                 nvlist_t *unsup_feat, *enabled_feat;
02261 
02262                 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
02263                     &spa->spa_feat_for_read_obj) != 0) {
02264                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02265                 }
02266 
02267                 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
02268                     &spa->spa_feat_for_write_obj) != 0) {
02269                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02270                 }
02271 
02272                 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
02273                     &spa->spa_feat_desc_obj) != 0) {
02274                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02275                 }
02276 
02277                 enabled_feat = fnvlist_alloc();
02278                 unsup_feat = fnvlist_alloc();
02279 
02280                 if (!feature_is_supported(spa->spa_meta_objset,
02281                     spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
02282                     unsup_feat, enabled_feat))
02283                         missing_feat_read = B_TRUE;
02284 
02285                 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
02286                         if (!feature_is_supported(spa->spa_meta_objset,
02287                             spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
02288                             unsup_feat, enabled_feat)) {
02289                                 missing_feat_write = B_TRUE;
02290                         }
02291                 }
02292 
02293                 fnvlist_add_nvlist(spa->spa_load_info,
02294                     ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
02295 
02296                 if (!nvlist_empty(unsup_feat)) {
02297                         fnvlist_add_nvlist(spa->spa_load_info,
02298                             ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
02299                 }
02300 
02301                 fnvlist_free(enabled_feat);
02302                 fnvlist_free(unsup_feat);
02303 
02304                 if (!missing_feat_read) {
02305                         fnvlist_add_boolean(spa->spa_load_info,
02306                             ZPOOL_CONFIG_CAN_RDONLY);
02307                 }
02308 
02309                 /*
02310                  * If the state is SPA_LOAD_TRYIMPORT, our objective is
02311                  * twofold: to determine whether the pool is available for
02312                  * import in read-write mode and (if it is not) whether the
02313                  * pool is available for import in read-only mode. If the pool
02314                  * is available for import in read-write mode, it is displayed
02315                  * as available in userland; if it is not available for import
02316                  * in read-only mode, it is displayed as unavailable in
02317                  * userland. If the pool is available for import in read-only
02318                  * mode but not read-write mode, it is displayed as unavailable
02319                  * in userland with a special note that the pool is actually
02320                  * available for open in read-only mode.
02321                  *
02322                  * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
02323                  * missing a feature for write, we must first determine whether
02324                  * the pool can be opened read-only before returning to
02325                  * userland in order to know whether to display the
02326                  * abovementioned note.
02327                  */
02328                 if (missing_feat_read || (missing_feat_write &&
02329                     spa_writeable(spa))) {
02330                         return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
02331                             ENOTSUP));
02332                 }
02333         }
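
        /*
         * Feature guids in these nvlists are reverse-DNS names such as
         * "com.delphix:async_destroy".  The ZPOOL_CONFIG_UNSUP_FEAT
         * list attached to spa_load_info is what ultimately lets
         * "zpool import" report exactly which features this
         * implementation lacks.
         */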
02334 
02335         spa->spa_is_initializing = B_TRUE;
02336         error = dsl_pool_open(spa->spa_dsl_pool);
02337         spa->spa_is_initializing = B_FALSE;
02338         if (error != 0)
02339                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02340 
02341         if (!mosconfig) {
02342                 uint64_t hostid;
02343                 nvlist_t *policy = NULL, *nvconfig;
02344 
02345                 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
02346                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02347 
02348                 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
02349                     ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
02350                         char *hostname;
02351                         unsigned long myhostid = 0;
02352 
02353                         VERIFY(nvlist_lookup_string(nvconfig,
02354                             ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
02355 
02356 #ifdef  _KERNEL
02357                         myhostid = zone_get_hostid(NULL);
02358 #else   /* _KERNEL */
02359                         /*
02360                          * We're emulating the system's hostid in userland, so
02361                          * we can't use zone_get_hostid().
02362                          */
02363                         (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
02364 #endif  /* _KERNEL */
02365                         if (check_hostid && hostid != 0 && myhostid != 0 &&
02366                             hostid != myhostid) {
02367                                 nvlist_free(nvconfig);
02368                                 cmn_err(CE_WARN, "pool '%s' could not be "
02369                                     "loaded as it was last accessed by "
02370                                     "another system (host: %s hostid: 0x%lx). "
02371                                     "See: http://illumos.org/msg/ZFS-8000-EY",
02372                                     spa_name(spa), hostname,
02373                                     (unsigned long)hostid);
02374                                 return (EBADF);
02375                         }
02376                 }
02377                 if (nvlist_lookup_nvlist(spa->spa_config,
02378                     ZPOOL_REWIND_POLICY, &policy) == 0)
02379                         VERIFY(nvlist_add_nvlist(nvconfig,
02380                             ZPOOL_REWIND_POLICY, policy) == 0);
02381 
02382                 spa_config_set(spa, nvconfig);
02383                 spa_unload(spa);
02384                 spa_deactivate(spa);
02385                 spa_activate(spa, orig_mode);
02386 
02387                 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
02388         }
02389 
02390         if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
02391                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02392         error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
02393         if (error != 0)
02394                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02395 
02396         /*
02397          * Load the bit that tells us to use the new accounting function
02398          * (raid-z deflation).  If we have an older pool, this will not
02399          * be present.
02400          */
02401         error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
02402         if (error != 0 && error != ENOENT)
02403                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02404 
02405         error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
02406             &spa->spa_creation_version);
02407         if (error != 0 && error != ENOENT)
02408                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02409 
02410         /*
02411          * Load the persistent error log.  If we have an older pool, this will
02412          * not be present.
02413          */
02414         error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
02415         if (error != 0 && error != ENOENT)
02416                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02417 
02418         error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
02419             &spa->spa_errlog_scrub);
02420         if (error != 0 && error != ENOENT)
02421                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02422 
02423         /*
02424          * Load the history object.  If we have an older pool, this
02425          * will not be present.
02426          */
02427         error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
02428         if (error != 0 && error != ENOENT)
02429                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02430 
02431         /*
02432          * If we're assembling the pool from the split-off vdevs of
02433          * an existing pool, we don't want to attach the spares & cache
02434          * devices.
02435          */
02436 
02437         /*
02438          * Load any hot spares for this pool.
02439          */
02440         error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
02441         if (error != 0 && error != ENOENT)
02442                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02443         if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
02444                 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
02445                 if (load_nvlist(spa, spa->spa_spares.sav_object,
02446                     &spa->spa_spares.sav_config) != 0)
02447                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02448 
02449                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
02450                 spa_load_spares(spa);
02451                 spa_config_exit(spa, SCL_ALL, FTAG);
02452         } else if (error == 0) {
02453                 spa->spa_spares.sav_sync = B_TRUE;
02454         }
02455 
02456         /*
02457          * Load any level 2 ARC devices for this pool.
02458          */
02459         error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
02460             &spa->spa_l2cache.sav_object);
02461         if (error != 0 && error != ENOENT)
02462                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02463         if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
02464                 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
02465                 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
02466                     &spa->spa_l2cache.sav_config) != 0)
02467                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02468 
02469                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
02470                 spa_load_l2cache(spa);
02471                 spa_config_exit(spa, SCL_ALL, FTAG);
02472         } else if (error == 0) {
02473                 spa->spa_l2cache.sav_sync = B_TRUE;
02474         }
02475 
02476         spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
02477 
02478         error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
02479         if (error && error != ENOENT)
02480                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02481 
02482         if (error == 0) {
02483                 uint64_t autoreplace;
02484 
02485                 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
02486                 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
02487                 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
02488                 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
02489                 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
02490                 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
02491                     &spa->spa_dedup_ditto);
02492 
02493                 spa->spa_autoreplace = (autoreplace != 0);
02494         }
02495 
02496         /*
02497          * If the 'autoreplace' property is set, then post a resource notifying
02498          * the ZFS DE that it should not issue any faults for unopenable
02499          * devices.  We also iterate over the vdevs, and post a sysevent for any
02500          * unopenable vdevs so that the normal autoreplace handler can take
02501          * over.
02502          */
02503         if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
02504                 spa_check_removed(spa->spa_root_vdev);
02505                 /*
02506                  * For the import case, this is done in spa_import(), because
02507                  * at this point we're using the spare definitions from
02508                  * the MOS config, not necessarily from the userland config.
02509                  */
02510                 if (state != SPA_LOAD_IMPORT) {
02511                         spa_aux_check_removed(&spa->spa_spares);
02512                         spa_aux_check_removed(&spa->spa_l2cache);
02513                 }
02514         }
02515 
02516         /*
02517          * Load the vdev state for all toplevel vdevs.
02518          */
02519         vdev_load(rvd);
02520 
02521         /*
02522          * Propagate the leaf DTLs we just loaded all the way up the tree.
02523          */
02524         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
02525         vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
02526         spa_config_exit(spa, SCL_ALL, FTAG);
02527 
02528         /*
02529          * Load the DDTs (dedup tables).
02530          */
02531         error = ddt_load(spa);
02532         if (error != 0)
02533                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02534 
02535         spa_update_dspace(spa);
02536 
02537         /*
02538          * Validate the config, using the MOS config to fill in any
02539          * information which might be missing.  If we fail to validate
02540          * the config then declare the pool unfit for use. If we're
02541          * assembling a pool from a split, the log is not transferred
02542          * over.
02543          */
02544         if (type != SPA_IMPORT_ASSEMBLE) {
02545                 nvlist_t *nvconfig;
02546 
02547                 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
02548                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
02549 
02550                 if (!spa_config_valid(spa, nvconfig)) {
02551                         nvlist_free(nvconfig);
02552                         return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
02553                             ENXIO));
02554                 }
02555                 nvlist_free(nvconfig);
02556 
02557                 /*
02558                  * Now that we've validated the config, check the state of the
02559                  * root vdev.  If it can't be opened, it indicates one or
02560                  * more toplevel vdevs are faulted.
02561                  */
02562                 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
02563                         return (ENXIO);
02564 
02565                 if (spa_check_logs(spa)) {
02566                         *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
02567                         return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
02568                 }
02569         }
02570 
02571         if (missing_feat_write) {
02572                 ASSERT(state == SPA_LOAD_TRYIMPORT);
02573 
02574                 /*
02575                  * At this point, we know that we can open the pool in
02576                  * read-only mode but not read-write mode. We now have enough
02577                  * information and can return to userland.
02578                  */
02579                 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
02580         }
02581 
02582         /*
02583          * We've successfully opened the pool, verify that we're ready
02584          * to start pushing transactions.
02585          */
02586         if (state != SPA_LOAD_TRYIMPORT) {
02587                 if ((error = spa_load_verify(spa)) != 0)
02588                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
02589                             error));
02590         }
02591 
02592         if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
02593             spa->spa_load_max_txg == UINT64_MAX)) {
02594                 dmu_tx_t *tx;
02595                 int need_update = B_FALSE;
02596 
02597                 ASSERT(state != SPA_LOAD_TRYIMPORT);
02598 
02599                 /*
02600                  * Claim log blocks that haven't been committed yet.
02601                  * This must all happen in a single txg.
02602                  * Note: spa_claim_max_txg is updated by spa_claim_notify(),
02603                  * invoked from zil_claim_log_block()'s i/o done callback.
02604                  * Price of rollback is that we abandon the log.
02605                  */
02606                 spa->spa_claiming = B_TRUE;
02607 
02608                 tx = dmu_tx_create_assigned(spa_get_dsl(spa),
02609                     spa_first_txg(spa));
02610                 (void) dmu_objset_find(spa_name(spa),
02611                     zil_claim, tx, DS_FIND_CHILDREN);
02612                 dmu_tx_commit(tx);
02613 
02614                 spa->spa_claiming = B_FALSE;
02615 
02616                 spa_set_log_state(spa, SPA_LOG_GOOD);
02617                 spa->spa_sync_on = B_TRUE;
02618                 txg_sync_start(spa->spa_dsl_pool);
02619 
02620                 /*
02621                  * Wait for all claims to sync.  We sync up to the highest
02622                  * claimed log block birth time so that claimed log blocks
02623                  * don't appear to be from the future.  spa_claim_max_txg
02624                  * will have been set for us by either zil_check_log_chain()
02625                  * (invoked from spa_check_logs()) or zil_claim() above.
02626                  */
02627                 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
02628 
02629                 /*
02630                  * If the config cache is stale, or we have uninitialized
02631                  * metaslabs (see spa_vdev_add()), then update the config.
02632                  *
02633                  * If this is a verbatim import, trust the current
02634                  * in-core spa_config and update the disk labels.
02635                  */
02636                 if (config_cache_txg != spa->spa_config_txg ||
02637                     state == SPA_LOAD_IMPORT ||
02638                     state == SPA_LOAD_RECOVER ||
02639                     (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
02640                         need_update = B_TRUE;
02641 
02642                 for (int c = 0; c < rvd->vdev_children; c++)
02643                         if (rvd->vdev_child[c]->vdev_ms_array == 0)
02644                                 need_update = B_TRUE;
02645 
02646                 /*
02647                  * Update the config cache asynchronously in case we're the
02648                  * root pool, in which case the config cache isn't writable yet.
02649                  */
02650                 if (need_update)
02651                         spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
02652 
02653                 /*
02654                  * Check all DTLs to see if anything needs resilvering.
02655                  */
02656                 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
02657                     vdev_resilver_needed(rvd, NULL, NULL))
02658                         spa_async_request(spa, SPA_ASYNC_RESILVER);
02659 
02660                 /*
02661                  * Delete any inconsistent datasets.
02662                  */
02663                 (void) dmu_objset_find(spa_name(spa),
02664                     dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
02665 
02666                 /*
02667                  * Clean up any stale temporary dataset userrefs.
02668                  */
02669                 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
02670         }
02671 
02672         return (0);
02673 }
02674 
02675 static int
02676 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
02677 {
02678         int mode = spa->spa_mode;
02679 
02680         spa_unload(spa);
02681         spa_deactivate(spa);
02682 
02683         spa->spa_load_max_txg--;
02684 
02685         spa_activate(spa, mode);
02686         spa_async_suspend(spa);
02687 
02688         return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
02689 }
02690 
02698 static int
02699 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
02700     uint64_t max_request, int rewind_flags)
02701 {
02702         nvlist_t *loadinfo = NULL;
02703         nvlist_t *config = NULL;
02704         int load_error, rewind_error;
02705         uint64_t safe_rewind_txg;
02706         uint64_t min_txg;
02707 
02708         if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
02709                 spa->spa_load_max_txg = spa->spa_load_txg;
02710                 spa_set_log_state(spa, SPA_LOG_CLEAR);
02711         } else {
02712                 spa->spa_load_max_txg = max_request;
02713         }
02714 
02715         load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
02716             mosconfig);
02717         if (load_error == 0)
02718                 return (0);
02719 
02720         if (spa->spa_root_vdev != NULL)
02721                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
02722 
02723         spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
02724         spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
02725 
02726         if (rewind_flags & ZPOOL_NEVER_REWIND) {
02727                 nvlist_free(config);
02728                 return (load_error);
02729         }
02730 
02731         if (state == SPA_LOAD_RECOVER) {
02732                 /* Price of rolling back is discarding txgs, including log */
02733                 spa_set_log_state(spa, SPA_LOG_CLEAR);
02734         } else {
02735                 /*
02736                  * If we aren't rolling back save the load info from our first
02737                  * import attempt so that we can restore it after attempting
02738                  * to rewind.
02739                  */
02740                 loadinfo = spa->spa_load_info;
02741                 spa->spa_load_info = fnvlist_alloc();
02742         }
02743 
02744         spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
02745         safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
02746         min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
02747             TXG_INITIAL : safe_rewind_txg;
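
        /*
         * Worked example, assuming TXG_DEFER_SIZE is 2 (as defined in
         * txg.h): if the failing pool last synced txg 100, then
         * safe_rewind_txg is 98 and a normal rewind retries txgs 99
         * and 98 before giving up; only ZPOOL_EXTREME_REWIND keeps
         * walking uberblocks back toward TXG_INITIAL.
         */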
02748 
02749         /*
02750          * Continue as long as we're finding errors, we're still within
02751          * the acceptable rewind range, and we're still finding uberblocks.
02752          */
02753         while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
02754             spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
02755                 if (spa->spa_load_max_txg < safe_rewind_txg)
02756                         spa->spa_extreme_rewind = B_TRUE;
02757                 rewind_error = spa_load_retry(spa, state, mosconfig);
02758         }
02759 
02760         spa->spa_extreme_rewind = B_FALSE;
02761         spa->spa_load_max_txg = UINT64_MAX;
02762 
02763         if (config && (rewind_error || state != SPA_LOAD_RECOVER))
02764                 spa_config_set(spa, config);
02765 
02766         if (state == SPA_LOAD_RECOVER) {
02767                 ASSERT3P(loadinfo, ==, NULL);
02768                 return (rewind_error);
02769         } else {
02770                 /* Store the rewind info as part of the initial load info */
02771                 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
02772                     spa->spa_load_info);
02773 
02774                 /* Restore the initial load info */
02775                 fnvlist_free(spa->spa_load_info);
02776                 spa->spa_load_info = loadinfo;
02777 
02778                 return (load_error);
02779         }
02780 }
02781 
02794 static int
02795 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
02796     nvlist_t **config)
02797 {
02798         spa_t *spa;
02799         spa_load_state_t state = SPA_LOAD_OPEN;
02800         int error;
02801         int locked = B_FALSE;
02802         int firstopen = B_FALSE;
02803 
02804         *spapp = NULL;
02805 
02806         /*
02807          * As disgusting as this is, we need to support recursive calls to this
02808          * function because dsl_dir_open() is called during spa_load(), and ends
02809          * up calling spa_open() again.  The real fix is to figure out how to
02810          * avoid dsl_dir_open() calling this in the first place.
02811          */
02812         if (mutex_owner(&spa_namespace_lock) != curthread) {
02813                 mutex_enter(&spa_namespace_lock);
02814                 locked = B_TRUE;
02815         }
02816 
02817         if ((spa = spa_lookup(pool)) == NULL) {
02818                 if (locked)
02819                         mutex_exit(&spa_namespace_lock);
02820                 return (ENOENT);
02821         }
02822 
02823         if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
02824                 zpool_rewind_policy_t policy;
02825 
02826                 firstopen = B_TRUE;
02827 
02828                 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
02829                     &policy);
02830                 if (policy.zrp_request & ZPOOL_DO_REWIND)
02831                         state = SPA_LOAD_RECOVER;
02832 
02833                 spa_activate(spa, spa_mode_global);
02834 
02835                 if (state != SPA_LOAD_RECOVER)
02836                         spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
02837 
02838                 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
02839                     policy.zrp_request);
02840 
02841                 if (error == EBADF) {
02842                         /*
02843                          * If vdev_validate() returns failure (indicated by
02844                          * EBADF), it means that one of the vdev labels shows
02845                          * that the pool has been exported or destroyed.  If
02846                          * this is the case, the config cache is out of sync and
02847                          * we should remove the pool from the namespace.
02848                          */
02849                         spa_unload(spa);
02850                         spa_deactivate(spa);
02851                         spa_config_sync(spa, B_TRUE, B_TRUE);
02852                         spa_remove(spa);
02853                         if (locked)
02854                                 mutex_exit(&spa_namespace_lock);
02855                         return (ENOENT);
02856                 }
02857 
02858                 if (error) {
02859                         /*
02860                          * We can't open the pool, but we still have useful
02861                          * information: the state of each vdev after the
02862                          * attempted vdev_open().  Return this to the user.
02863                          */
02864                         if (config != NULL && spa->spa_config) {
02865                                 VERIFY(nvlist_dup(spa->spa_config, config,
02866                                     KM_SLEEP) == 0);
02867                                 VERIFY(nvlist_add_nvlist(*config,
02868                                     ZPOOL_CONFIG_LOAD_INFO,
02869                                     spa->spa_load_info) == 0);
02870                         }
02871                         spa_unload(spa);
02872                         spa_deactivate(spa);
02873                         spa->spa_last_open_failed = error;
02874                         if (locked)
02875                                 mutex_exit(&spa_namespace_lock);
02876                         *spapp = NULL;
02877                         return (error);
02878                 }
02879         }
02880 
02881         spa_open_ref(spa, tag);
02882 
02883         if (config != NULL)
02884                 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
02885 
02886         /*
02887          * If we've recovered the pool, pass back any information we
02888          * gathered while doing the load.
02889          */
02890         if (state == SPA_LOAD_RECOVER) {
02891                 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
02892                     spa->spa_load_info) == 0);
02893         }
02894 
02895         if (locked) {
02896                 spa->spa_last_open_failed = 0;
02897                 spa->spa_last_ubsync_txg = 0;
02898                 spa->spa_load_txg = 0;
02899                 mutex_exit(&spa_namespace_lock);
02900 #ifdef __FreeBSD__
02901 #ifdef _KERNEL
02902                 if (firstopen)
02903                         zvol_create_minors(pool);
02904 #endif
02905 #endif
02906         }
02907 
02908         *spapp = spa;
02909 
02910         return (0);
02911 }
02912 
02913 int
02914 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
02915     nvlist_t **config)
02916 {
02917         return (spa_open_common(name, spapp, tag, policy, config));
02918 }
02919 
02920 int
02921 spa_open(const char *name, spa_t **spapp, void *tag)
02922 {
02923         return (spa_open_common(name, spapp, tag, NULL, NULL));
02924 }
02925 
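02926 /*
02927  * Look up the named pool and take a reference on behalf of the
02928  * fault-injection framework.  Returns NULL if the pool does not
02929  * exist; the reference is dropped again with spa_inject_delref().
02930  */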
02932 spa_t *
02933 spa_inject_addref(char *name)
02934 {
02935         spa_t *spa;
02936 
02937         mutex_enter(&spa_namespace_lock);
02938         if ((spa = spa_lookup(name)) == NULL) {
02939                 mutex_exit(&spa_namespace_lock);
02940                 return (NULL);
02941         }
02942         spa->spa_inject_ref++;
02943         mutex_exit(&spa_namespace_lock);
02944 
02945         return (spa);
02946 }
02947 
02948 void
02949 spa_inject_delref(spa_t *spa)
02950 {
02951         mutex_enter(&spa_namespace_lock);
02952         spa->spa_inject_ref--;
02953         mutex_exit(&spa_namespace_lock);
02954 }
02955 
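02956 /*
02957  * Add spares device information to the nvlist.
02958  */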
02959 static void
02960 spa_add_spares(spa_t *spa, nvlist_t *config)
02961 {
02962         nvlist_t **spares;
02963         uint_t i, nspares;
02964         nvlist_t *nvroot;
02965         uint64_t guid;
02966         vdev_stat_t *vs;
02967         uint_t vsc;
02968         uint64_t pool;
02969 
02970         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
02971 
02972         if (spa->spa_spares.sav_count == 0)
02973                 return;
02974 
02975         VERIFY(nvlist_lookup_nvlist(config,
02976             ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
02977         VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
02978             ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
02979         if (nspares != 0) {
02980                 VERIFY(nvlist_add_nvlist_array(nvroot,
02981                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
02982                 VERIFY(nvlist_lookup_nvlist_array(nvroot,
02983                     ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
02984 
02985                 /*
02986                  * Go through and find any spares which have since been
02987                  * activated as in-use spares.  If so, update their status
02988                  * appropriately.
02989                  */
02990                 for (i = 0; i < nspares; i++) {
02991                         VERIFY(nvlist_lookup_uint64(spares[i],
02992                             ZPOOL_CONFIG_GUID, &guid) == 0);
02993                         if (spa_spare_exists(guid, &pool, NULL) &&
02994                             pool != 0ULL) {
02995                                 VERIFY(nvlist_lookup_uint64_array(
02996                                     spares[i], ZPOOL_CONFIG_VDEV_STATS,
02997                                     (uint64_t **)&vs, &vsc) == 0);
02998                                 vs->vs_state = VDEV_STATE_CANT_OPEN;
02999                                 vs->vs_aux = VDEV_AUX_SPARED;
03000                         }
03001                 }
03002         }
03003 }
03004 
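03005 /*
03006  * Add l2cache device information to the nvlist, including vdev stats.
03007  */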
03008 static void
03009 spa_add_l2cache(spa_t *spa, nvlist_t *config)
03010 {
03011         nvlist_t **l2cache;
03012         uint_t i, j, nl2cache;
03013         nvlist_t *nvroot;
03014         uint64_t guid;
03015         vdev_t *vd;
03016         vdev_stat_t *vs;
03017         uint_t vsc;
03018 
03019         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
03020 
03021         if (spa->spa_l2cache.sav_count == 0)
03022                 return;
03023 
03024         VERIFY(nvlist_lookup_nvlist(config,
03025             ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
03026         VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
03027             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
03028         if (nl2cache != 0) {
03029                 VERIFY(nvlist_add_nvlist_array(nvroot,
03030                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
03031                 VERIFY(nvlist_lookup_nvlist_array(nvroot,
03032                     ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
03033 
03034                 /*
03035                  * Update level 2 cache device stats.
03036                  */
03037 
03038                 for (i = 0; i < nl2cache; i++) {
03039                         VERIFY(nvlist_lookup_uint64(l2cache[i],
03040                             ZPOOL_CONFIG_GUID, &guid) == 0);
03041 
03042                         vd = NULL;
03043                         for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
03044                                 if (guid ==
03045                                     spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
03046                                         vd = spa->spa_l2cache.sav_vdevs[j];
03047                                         break;
03048                                 }
03049                         }
03050                         ASSERT(vd != NULL);
03051 
03052                         VERIFY(nvlist_lookup_uint64_array(l2cache[i],
03053                             ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
03054                             == 0);
03055                         vdev_get_stats(vd, vs);
03056                 }
03057         }
03058 }
03059 
03060 static void
03061 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
03062 {
03063         nvlist_t *features;
03064         zap_cursor_t zc;
03065         zap_attribute_t za;
03066 
03067         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
03068         VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
03069 
03070         if (spa->spa_feat_for_read_obj != 0) {
03071                 for (zap_cursor_init(&zc, spa->spa_meta_objset,
03072                     spa->spa_feat_for_read_obj);
03073                     zap_cursor_retrieve(&zc, &za) == 0;
03074                     zap_cursor_advance(&zc)) {
03075                         ASSERT(za.za_integer_length == sizeof (uint64_t) &&
03076                             za.za_num_integers == 1);
03077                         VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
03078                             za.za_first_integer));
03079                 }
03080                 zap_cursor_fini(&zc);
03081         }
03082 
03083         if (spa->spa_feat_for_write_obj != 0) {
03084                 for (zap_cursor_init(&zc, spa->spa_meta_objset,
03085                     spa->spa_feat_for_write_obj);
03086                     zap_cursor_retrieve(&zc, &za) == 0;
03087                     zap_cursor_advance(&zc)) {
03088                         ASSERT(za.za_integer_length == sizeof (uint64_t) &&
03089                             za.za_num_integers == 1);
03090                         VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
03091                             za.za_first_integer));
03092                 }
03093                 zap_cursor_fini(&zc);
03094         }
03095 
03096         VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
03097             features) == 0);
03098         nvlist_free(features);
03099 }
03100 
03101 int
03102 spa_get_stats(const char *name, nvlist_t **config,
03103     char *altroot, size_t buflen)
03104 {
03105         int error;
03106         spa_t *spa;
03107 
03108         *config = NULL;
03109         error = spa_open_common(name, &spa, FTAG, NULL, config);
03110 
03111         if (spa != NULL) {
03112                 /*
03113                  * This still leaves a window of inconsistency where the spares
03114                  * or l2cache devices could change and the config would be
03115                  * self-inconsistent.
03116                  */
03117                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
03118 
03119                 if (*config != NULL) {
03120                         uint64_t loadtimes[2];
03121 
03122                         loadtimes[0] = spa->spa_loaded_ts.tv_sec;
03123                         loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
03124                         VERIFY(nvlist_add_uint64_array(*config,
03125                             ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
03126 
03127                         VERIFY(nvlist_add_uint64(*config,
03128                             ZPOOL_CONFIG_ERRCOUNT,
03129                             spa_get_errlog_size(spa)) == 0);
03130 
03131                         if (spa_suspended(spa))
03132                                 VERIFY(nvlist_add_uint64(*config,
03133                                     ZPOOL_CONFIG_SUSPENDED,
03134                                     spa->spa_failmode) == 0);
03135 
03136                         spa_add_spares(spa, *config);
03137                         spa_add_l2cache(spa, *config);
03138                         spa_add_feature_stats(spa, *config);
03139                 }
03140         }
03141 
03142         /*
03143          * We want to get the alternate root even for faulted pools, so we cheat
03144          * and call spa_lookup() directly.
03145          */
03146         if (altroot) {
03147                 if (spa == NULL) {
03148                         mutex_enter(&spa_namespace_lock);
03149                         spa = spa_lookup(name);
03150                         if (spa)
03151                                 spa_altroot(spa, altroot, buflen);
03152                         else
03153                                 altroot[0] = '\0';
03154                         spa = NULL;
03155                         mutex_exit(&spa_namespace_lock);
03156                 } else {
03157                         spa_altroot(spa, altroot, buflen);
03158                 }
03159         }
03160 
03161         if (spa != NULL) {
03162                 spa_config_exit(spa, SCL_CONFIG, FTAG);
03163                 spa_close(spa, FTAG);
03164         }
03165 
03166         return (error);
03167 }
03168 
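03169 /*
03170  * Validate one class of auxiliary devices (spares or l2cache) listed in
03171  * nvroot: parse each device, verify that it is a leaf vdev of a type
03172  * supported by the pool version, then open it and initialize its label.
03173  * On success, each device's GUID is recorded in its nvlist.
03174  */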
03175 static int
03176 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
03177     spa_aux_vdev_t *sav, const char *config, uint64_t version,
03178     vdev_labeltype_t label)
03179 {
03180         nvlist_t **dev;
03181         uint_t i, ndev;
03182         vdev_t *vd;
03183         int error;
03184 
03185         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
03186 
03187         /*
03188          * It's acceptable to have no devs specified.
03189          */
03190         if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
03191                 return (0);
03192 
03193         if (ndev == 0)
03194                 return (EINVAL);
03195 
03196         /*
03197          * Make sure the pool is formatted with a version that supports this
03198          * device type.
03199          */
03200         if (spa_version(spa) < version)
03201                 return (ENOTSUP);
03202 
03203         /*
03204          * Set the pending device list so we correctly handle device in-use
03205          * checking.
03206          */
03207         sav->sav_pending = dev;
03208         sav->sav_npending = ndev;
03209 
03210         for (i = 0; i < ndev; i++) {
03211                 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
03212                     mode)) != 0)
03213                         goto out;
03214 
03215                 if (!vd->vdev_ops->vdev_op_leaf) {
03216                         vdev_free(vd);
03217                         error = EINVAL;
03218                         goto out;
03219                 }
03220 
03221                 /*
03222                  * The L2ARC currently only supports disk devices in
03223                  * kernel context.  For user-level testing we allow any device.
03224                  */
03225 #ifdef _KERNEL
03226                 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
03227                     strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
03228                         error = ENOTBLK;
03229                         vdev_free(vd);
03230                         goto out;
03231                 }
03232 #endif
03233                 vd->vdev_top = vd;
03234 
03235                 if ((error = vdev_open(vd)) == 0 &&
03236                     (error = vdev_label_init(vd, crtxg, label)) == 0) {
03237                         VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
03238                             vd->vdev_guid) == 0);
03239                 }
03240 
03241                 vdev_free(vd);
03242 
03243                 if (error &&
03244                     (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
03245                         goto out;
03246                 else
03247                         error = 0;
03248         }
03249 
03250 out:
03251         sav->sav_pending = NULL;
03252         sav->sav_npending = 0;
03253         return (error);
03254 }
03255 
03256 static int
03257 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
03258 {
03259         int error;
03260 
03261         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
03262 
03263         if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
03264             &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
03265             VDEV_LABEL_SPARE)) != 0) {
03266                 return (error);
03267         }
03268 
03269         return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
03270             &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
03271             VDEV_LABEL_L2CACHE));
03272 }
03273 
03274 static void
03275 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
03276     const char *config)
03277 {
03278         int i;
03279 
03280         if (sav->sav_config != NULL) {
03281                 nvlist_t **olddevs;
03282                 uint_t oldndevs;
03283                 nvlist_t **newdevs;
03284 
03285                 /*
03286                  * Generate new dev list by concatentating with the
03287                  * current dev list.
03288                  */
03289                 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
03290                     &olddevs, &oldndevs) == 0);
03291 
03292                 newdevs = kmem_alloc(sizeof (void *) *
03293                     (ndevs + oldndevs), KM_SLEEP);
03294                 for (i = 0; i < oldndevs; i++)
03295                         VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
03296                             KM_SLEEP) == 0);
03297                 for (i = 0; i < ndevs; i++)
03298                         VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
03299                             KM_SLEEP) == 0);
03300 
03301                 VERIFY(nvlist_remove(sav->sav_config, config,
03302                     DATA_TYPE_NVLIST_ARRAY) == 0);
03303 
03304                 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
03305                     config, newdevs, ndevs + oldndevs) == 0);
03306                 for (i = 0; i < oldndevs + ndevs; i++)
03307                         nvlist_free(newdevs[i]);
03308                 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
03309         } else {
03310                 /*
03311                  * Generate a new dev list.
03312                  */
03313                 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
03314                     KM_SLEEP) == 0);
03315                 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
03316                     devs, ndevs) == 0);
03317         }
03318 }
03319 
03323 void
03324 spa_l2cache_drop(spa_t *spa)
03325 {
03326         vdev_t *vd;
03327         int i;
03328         spa_aux_vdev_t *sav = &spa->spa_l2cache;
03329 
03330         for (i = 0; i < sav->sav_count; i++) {
03331                 uint64_t pool;
03332 
03333                 vd = sav->sav_vdevs[i];
03334                 ASSERT(vd != NULL);
03335 
03336                 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
03337                     pool != 0ULL && l2arc_vdev_present(vd))
03338                         l2arc_remove_vdev(vd);
03339         }
03340 }
03341 
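03342 /*
03343  * Pool Creation
03344  */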
03345 int
03346 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
03347     const char *history_str, nvlist_t *zplprops)
03348 {
03349         spa_t *spa;
03350         char *altroot = NULL;
03351         vdev_t *rvd;
03352         dsl_pool_t *dp;
03353         dmu_tx_t *tx;
03354         int error = 0;
03355         uint64_t txg = TXG_INITIAL;
03356         nvlist_t **spares, **l2cache;
03357         uint_t nspares, nl2cache;
03358         uint64_t version, obj;
03359         boolean_t has_features;
03360 
03361         /*
03362          * If this pool already exists, return failure.
03363          */
03364         mutex_enter(&spa_namespace_lock);
03365         if (spa_lookup(pool) != NULL) {
03366                 mutex_exit(&spa_namespace_lock);
03367                 return (EEXIST);
03368         }
03369 
03370         /*
03371          * Allocate a new spa_t structure.
03372          */
03373         (void) nvlist_lookup_string(props,
03374             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
03375         spa = spa_add(pool, NULL, altroot);
03376         spa_activate(spa, spa_mode_global);
03377 
03378         if (props && (error = spa_prop_validate(spa, props))) {
03379                 spa_deactivate(spa);
03380                 spa_remove(spa);
03381                 mutex_exit(&spa_namespace_lock);
03382                 return (error);
03383         }
03384 
03385         has_features = B_FALSE;
03386         for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
03387             elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
03388                 if (zpool_prop_feature(nvpair_name(elem)))
03389                         has_features = B_TRUE;
03390         }
03391 
03392         if (has_features || nvlist_lookup_uint64(props,
03393             zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
03394                 version = SPA_VERSION;
03395         }
03396         ASSERT(SPA_VERSION_IS_SUPPORTED(version));
03397 
03398         spa->spa_first_txg = txg;
03399         spa->spa_uberblock.ub_txg = txg - 1;
03400         spa->spa_uberblock.ub_version = version;
03401         spa->spa_ubsync = spa->spa_uberblock;
03402 
03403         /*
03404          * Create "The Godfather" zio to hold all async IOs
03405          */
03406         spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
03407             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
03408 
03409         /*
03410          * Create the root vdev.
03411          */
03412         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
03413 
03414         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
03415 
03416         ASSERT(error != 0 || rvd != NULL);
03417         ASSERT(error != 0 || spa->spa_root_vdev == rvd);
03418 
03419         if (error == 0 && !zfs_allocatable_devs(nvroot))
03420                 error = EINVAL;
03421 
03422         if (error == 0 &&
03423             (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
03424             (error = spa_validate_aux(spa, nvroot, txg,
03425             VDEV_ALLOC_ADD)) == 0) {
03426                 for (int c = 0; c < rvd->vdev_children; c++) {
03427                         vdev_metaslab_set_size(rvd->vdev_child[c]);
03428                         vdev_expand(rvd->vdev_child[c], txg);
03429                 }
03430         }
03431 
03432         spa_config_exit(spa, SCL_ALL, FTAG);
03433 
03434         if (error != 0) {
03435                 spa_unload(spa);
03436                 spa_deactivate(spa);
03437                 spa_remove(spa);
03438                 mutex_exit(&spa_namespace_lock);
03439                 return (error);
03440         }
03441 
03442         /*
03443          * Get the list of spares, if specified.
03444          */
03445         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
03446             &spares, &nspares) == 0) {
03447                 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
03448                     KM_SLEEP) == 0);
03449                 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
03450                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
03451                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
03452                 spa_load_spares(spa);
03453                 spa_config_exit(spa, SCL_ALL, FTAG);
03454                 spa->spa_spares.sav_sync = B_TRUE;
03455         }
03456 
03457         /*
03458          * Get the list of level 2 cache devices, if specified.
03459          */
03460         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
03461             &l2cache, &nl2cache) == 0) {
03462                 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
03463                     NV_UNIQUE_NAME, KM_SLEEP) == 0);
03464                 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
03465                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
03466                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
03467                 spa_load_l2cache(spa);
03468                 spa_config_exit(spa, SCL_ALL, FTAG);
03469                 spa->spa_l2cache.sav_sync = B_TRUE;
03470         }
03471 
03472         spa->spa_is_initializing = B_TRUE;
03473         spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
03474         spa->spa_meta_objset = dp->dp_meta_objset;
03475         spa->spa_is_initializing = B_FALSE;
03476 
03477         /*
03478          * Create DDTs (dedup tables).
03479          */
03480         ddt_create(spa);
03481 
03482         spa_update_dspace(spa);
03483 
03484         tx = dmu_tx_create_assigned(dp, txg);
03485 
03486         /*
03487          * Create the pool config object.
03488          */
03489         spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
03490             DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
03491             DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
03492 
03493         if (zap_add(spa->spa_meta_objset,
03494             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
03495             sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
03496                 cmn_err(CE_PANIC, "failed to add pool config");
03497         }
03498 
03499         if (spa_version(spa) >= SPA_VERSION_FEATURES)
03500                 spa_feature_create_zap_objects(spa, tx);
03501 
03502         if (zap_add(spa->spa_meta_objset,
03503             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
03504             sizeof (uint64_t), 1, &version, tx) != 0) {
03505                 cmn_err(CE_PANIC, "failed to add pool version");
03506         }
03507 
03508         /* Newly created pools with the right version are always deflated. */
03509         if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
03510                 spa->spa_deflate = TRUE;
03511                 if (zap_add(spa->spa_meta_objset,
03512                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
03513                     sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
03514                         cmn_err(CE_PANIC, "failed to add deflate");
03515                 }
03516         }
03517 
03518         /*
03519          * Create the deferred-free bpobj.  Turn off compression
03520          * because sync-to-convergence takes longer if the blocksize
03521          * keeps changing.
03522          */
03523         obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
03524         dmu_object_set_compress(spa->spa_meta_objset, obj,
03525             ZIO_COMPRESS_OFF, tx);
03526         if (zap_add(spa->spa_meta_objset,
03527             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
03528             sizeof (uint64_t), 1, &obj, tx) != 0) {
03529                 cmn_err(CE_PANIC, "failed to add bpobj");
03530         }
03531         VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
03532             spa->spa_meta_objset, obj));
03533 
03534         /*
03535          * Create the pool's history object.
03536          */
03537         if (version >= SPA_VERSION_ZPOOL_HISTORY)
03538                 spa_history_create_obj(spa, tx);
03539 
03540         /*
03541          * Set pool properties.
03542          */
03543         spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
03544         spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
03545         spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
03546         spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
03547 
03548         if (props != NULL) {
03549                 spa_configfile_set(spa, props, B_FALSE);
03550                 spa_sync_props(spa, props, tx);
03551         }
03552 
03553         dmu_tx_commit(tx);
03554 
03555         spa->spa_sync_on = B_TRUE;
03556         txg_sync_start(spa->spa_dsl_pool);
03557 
03558         /*
03559          * We explicitly wait for the first transaction to complete so that our
03560          * bean counters are appropriately updated.
03561          */
03562         txg_wait_synced(spa->spa_dsl_pool, txg);
03563 
03564         spa_config_sync(spa, B_FALSE, B_TRUE);
03565 
03566         if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
03567                 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
03568         spa_history_log_version(spa, LOG_POOL_CREATE);
03569 
03570         spa->spa_minref = refcount_count(&spa->spa_refcount);
03571 
03572         mutex_exit(&spa_namespace_lock);
03573 
03574         return (0);
03575 }
03576 
03577 #ifdef _KERNEL
03578 #if defined(sun)
03579 /*
03580  * Get the root pool information from the root disk, then import the root pool
03581  * during the system boot up time.
03582  */
03583 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
03584 
03585 static nvlist_t *
03586 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
03587 {
03588         nvlist_t *config;
03589         nvlist_t *nvtop, *nvroot;
03590         uint64_t pgid;
03591 
03592         if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
03593                 return (NULL);
03594 
03595         /*
03596          * Add this top-level vdev to the child array.
03597          */
03598         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
03599             &nvtop) == 0);
03600         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
03601             &pgid) == 0);
03602         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
03603 
03604         /*
03605          * Put this pool's top-level vdevs into a root vdev.
03606          */
03607         VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
03608         VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
03609             VDEV_TYPE_ROOT) == 0);
03610         VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
03611         VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
03612         VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
03613             &nvtop, 1) == 0);
03614 
03615         /*
03616          * Replace the existing vdev_tree with the new root vdev in
03617          * this pool's configuration (remove the old, add the new).
03618          */
03619         VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
03620         nvlist_free(nvroot);
03621         return (config);
03622 }
03623 
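03624 /*
03625  * Walk the vdev tree and see if we can find a device with "better"
03626  * configuration.  A configuration is "better" if the label on that
03627  * device has a more recent txg.
03628  */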
03629 static void
03630 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
03631 {
03632         for (int c = 0; c < vd->vdev_children; c++)
03633                 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
03634 
03635         if (vd->vdev_ops->vdev_op_leaf) {
03636                 nvlist_t *label;
03637                 uint64_t label_txg;
03638 
03639                 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
03640                     &label) != 0)
03641                         return;
03642 
03643                 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
03644                     &label_txg) == 0);
03645 
03646                 /*
03647                  * Do we have a better boot device?
03648                  */
03649                 if (label_txg > *txg) {
03650                         *txg = label_txg;
03651                         *avd = vd;
03652                 }
03653                 nvlist_free(label);
03654         }
03655 }
03656 
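03657 /*
03658  * Import a root pool.
03659  *
03660  * For x86, devpath_list will consist of devid and/or physpath name of
03661  * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
03662  * The GRUB "findroot" command will return the vdev we should boot.
03663  *
03664  * For Sparc, devpath_list consists of the physpath name of the booting
03665  * device, no matter whether the root pool is a single-device pool or a
03666  * mirrored pool, e.g. "/pci@1f,0/ide@d/disk@0,0:a".
03667  */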
03669 int
03670 spa_import_rootpool(char *devpath, char *devid)
03671 {
03672         spa_t *spa;
03673         vdev_t *rvd, *bvd, *avd = NULL;
03674         nvlist_t *config, *nvtop;
03675         uint64_t guid, txg;
03676         char *pname;
03677         int error;
03678 
03679         /*
03680          * Read the label from the boot device and generate a configuration.
03681          */
03682         config = spa_generate_rootconf(devpath, devid, &guid);
03683 #if defined(_OBP) && defined(_KERNEL)
03684         if (config == NULL) {
03685                 if (strstr(devpath, "/iscsi/ssd") != NULL) {
03686                         /* iscsi boot */
03687                         get_iscsi_bootpath_phy(devpath);
03688                         config = spa_generate_rootconf(devpath, devid, &guid);
03689                 }
03690         }
03691 #endif
03692         if (config == NULL) {
03693                 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
03694                     devpath);
03695                 return (EIO);
03696         }
03697 
03698         VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
03699             &pname) == 0);
03700         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
03701 
03702         mutex_enter(&spa_namespace_lock);
03703         if ((spa = spa_lookup(pname)) != NULL) {
03704                 /*
03705                  * Remove the existing root pool from the namespace so that we
03706                  * can replace it with the correct config we just read in.
03707                  */
03708                 spa_remove(spa);
03709         }
03710 
03711         spa = spa_add(pname, config, NULL);
03712         spa->spa_is_root = B_TRUE;
03713         spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
03714 
03715         /*
03716          * Build up a vdev tree based on the boot device's label config.
03717          */
03718         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
03719             &nvtop) == 0);
03720         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
03721         error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
03722             VDEV_ALLOC_ROOTPOOL);
03723         spa_config_exit(spa, SCL_ALL, FTAG);
03724         if (error) {
03725                 mutex_exit(&spa_namespace_lock);
03726                 nvlist_free(config);
03727                 cmn_err(CE_NOTE, "Cannot parse the config for pool '%s'",
03728                     pname);
03729                 return (error);
03730         }
03731 
03732         /*
03733          * Get the boot vdev.
03734          */
03735         if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
03736                 cmn_err(CE_NOTE, "Cannot find the boot vdev for guid %llu",
03737                     (u_longlong_t)guid);
03738                 error = ENOENT;
03739                 goto out;
03740         }
03741 
03742         /*
03743          * Determine if there is a better boot device.
03744          */
03745         avd = bvd;
03746         spa_alt_rootvdev(rvd, &avd, &txg);
03747         if (avd != bvd) {
03748                 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
03749                     "try booting from '%s'", avd->vdev_path);
03750                 error = EINVAL;
03751                 goto out;
03752         }
03753 
03754         /*
03755          * If the boot device is part of a spare vdev then ensure that
03756          * we're booting off the active spare.
03757          */
03758         if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
03759             !bvd->vdev_isspare) {
03760                 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
03761                     "try booting from '%s'",
03762                     bvd->vdev_parent->
03763                     vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
03764                 error = EINVAL;
03765                 goto out;
03766         }
03767 
03768         error = 0;
03769         spa_history_log_version(spa, LOG_POOL_IMPORT);
03770 out:
03771         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
03772         vdev_free(rvd);
03773         spa_config_exit(spa, SCL_ALL, FTAG);
03774         mutex_exit(&spa_namespace_lock);
03775 
03776         nvlist_free(config);
03777         return (error);
03778 }
03779 
03780 #else
03781 
03782 extern int
03783 vdev_geom_read_pool_label(const char *name, nvlist_t **config);
03784 
03785 static nvlist_t *
03786 spa_generate_rootconf(const char *name)
03787 {
03788         nvlist_t *config;
03789         nvlist_t *nvtop, *nvroot;
03790         uint64_t pgid;
03791 
03792         if (vdev_geom_read_pool_label(name, &config) != 0)
03793                 return (NULL);
03794 
03795         /*
03796          * Add this top-level vdev to the child array.
03797          */
03798         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
03799             &nvtop) == 0);
03800         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
03801             &pgid) == 0);
03802 
03803         /*
03804          * Put this pool's top-level vdevs into a root vdev.
03805          */
03806         VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
03807         VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
03808             VDEV_TYPE_ROOT) == 0);
03809         VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
03810         VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
03811         VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
03812             &nvtop, 1) == 0);
03813 
03814         /*
03815          * Replace the existing vdev_tree with the new root vdev in
03816          * this pool's configuration (remove the old, add the new).
03817          */
03818         VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
03819         nvlist_free(nvroot);
03820         return (config);
03821 }
03822 
03823 int
03824 spa_import_rootpool(const char *name)
03825 {
03826         spa_t *spa;
03827         vdev_t *rvd, *bvd, *avd = NULL;
03828         nvlist_t *config, *nvtop;
03829         uint64_t txg;
03830         char *pname;
03831         int error;
03832 
03833         /*
03834          * Read the label from the boot device and generate a configuration.
03835          */
03836         config = spa_generate_rootconf(name);
03837         if (config == NULL) {
03838                 cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
03839                     name);
03840                 return (EIO);
03841         }
03842 
03843         VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
03844             &pname) == 0 && strcmp(name, pname) == 0);
03845         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
03846 
03847         mutex_enter(&spa_namespace_lock);
03848         if ((spa = spa_lookup(pname)) != NULL) {
03849                 /*
03850                  * Remove the existing root pool from the namespace so that we
03851                  * can replace it with the correct config we just read in.
03852                  */
03853                 spa_remove(spa);
03854         }
03855         spa = spa_add(pname, config, NULL);
03856         spa->spa_is_root = B_TRUE;
03857         spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
03858 
03859         /*
03860          * Build up a vdev tree based on the boot device's label config.
03861          */
03862         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
03863             &nvtop) == 0);
03864         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
03865         error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
03866             VDEV_ALLOC_ROOTPOOL);
03867         spa_config_exit(spa, SCL_ALL, FTAG);
03868         if (error) {
03869                 mutex_exit(&spa_namespace_lock);
03870                 nvlist_free(config);
03871                 cmn_err(CE_NOTE, "Cannot parse the config for pool '%s'",
03872                     pname);
03873                 return (error);
03874         }
03875 
03876         error = 0;
03877         spa_history_log_version(spa, LOG_POOL_IMPORT);
03878 out:
03879         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
03880         vdev_free(rvd);
03881         spa_config_exit(spa, SCL_ALL, FTAG);
03882         mutex_exit(&spa_namespace_lock);
03883 
03884         return (error);
03885 }
03886 
03887 #endif  /* sun */
03888 #endif
03889 
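03890 /*
03891  * Import a non-root pool into the system.
03892  */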
03893 int
03894 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
03895 {
03896         spa_t *spa;
03897         char *altroot = NULL;
03898         spa_load_state_t state = SPA_LOAD_IMPORT;
03899         zpool_rewind_policy_t policy;
03900         uint64_t mode = spa_mode_global;
03901         uint64_t readonly = B_FALSE;
03902         int error;
03903         nvlist_t *nvroot;
03904         nvlist_t **spares, **l2cache;
03905         uint_t nspares, nl2cache;
03906 
03907         /*
03908          * If a pool with this name exists, return failure.
03909          */
03910         mutex_enter(&spa_namespace_lock);
03911         if (spa_lookup(pool) != NULL) {
03912                 mutex_exit(&spa_namespace_lock);
03913                 return (EEXIST);
03914         }
03915 
03916         /*
03917          * Create and initialize the spa structure.
03918          */
03919         (void) nvlist_lookup_string(props,
03920             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
03921         (void) nvlist_lookup_uint64(props,
03922             zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
03923         if (readonly)
03924                 mode = FREAD;
03925         spa = spa_add(pool, config, altroot);
03926         spa->spa_import_flags = flags;
03927 
03928         /*
03929          * Verbatim import - Take a pool and insert it into the namespace
03930          * as if it had been loaded at boot.
03931          */
03932         if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
03933                 if (props != NULL)
03934                         spa_configfile_set(spa, props, B_FALSE);
03935 
03936                 spa_config_sync(spa, B_FALSE, B_TRUE);
03937 
03938                 mutex_exit(&spa_namespace_lock);
03939                 spa_history_log_version(spa, LOG_POOL_IMPORT);
03940 
03941                 return (0);
03942         }
03943 
03944         spa_activate(spa, mode);
03945 
03946         /*
03947          * Don't start async tasks until we know everything is healthy.
03948          */
03949         spa_async_suspend(spa);
03950 
03951         zpool_get_rewind_policy(config, &policy);
03952         if (policy.zrp_request & ZPOOL_DO_REWIND)
03953                 state = SPA_LOAD_RECOVER;
03954 
03955         /*
03956          * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
03957          * because the user-supplied config is actually the one to trust when
03958          * doing an import.
03959          */
03960         if (state != SPA_LOAD_RECOVER)
03961                 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
03962 
03963         error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
03964             policy.zrp_request);
03965 
03966         /*
03967          * Propagate anything learned while loading the pool and pass it
03968          * back to caller (i.e. rewind info, missing devices, etc).
03969          */
03970         VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
03971             spa->spa_load_info) == 0);
03972 
03973         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
03974         /*
03975          * Toss any existing sparelist, as it is no longer valid and
03976          * conflicts with spa_has_spare().
03977          */
03978         if (spa->spa_spares.sav_config) {
03979                 nvlist_free(spa->spa_spares.sav_config);
03980                 spa->spa_spares.sav_config = NULL;
03981                 spa_load_spares(spa);
03982         }
03983         if (spa->spa_l2cache.sav_config) {
03984                 nvlist_free(spa->spa_l2cache.sav_config);
03985                 spa->spa_l2cache.sav_config = NULL;
03986                 spa_load_l2cache(spa);
03987         }
03988 
03989         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
03990             &nvroot) == 0);
03991         if (error == 0)
03992                 error = spa_validate_aux(spa, nvroot, -1ULL,
03993                     VDEV_ALLOC_SPARE);
03994         if (error == 0)
03995                 error = spa_validate_aux(spa, nvroot, -1ULL,
03996                     VDEV_ALLOC_L2CACHE);
03997         spa_config_exit(spa, SCL_ALL, FTAG);
03998 
03999         if (props != NULL)
04000                 spa_configfile_set(spa, props, B_FALSE);
04001 
04002         if (error != 0 || (props && spa_writeable(spa) &&
04003             (error = spa_prop_set(spa, props)))) {
04004                 spa_unload(spa);
04005                 spa_deactivate(spa);
04006                 spa_remove(spa);
04007                 mutex_exit(&spa_namespace_lock);
04008                 return (error);
04009         }
04010 
04011         spa_async_resume(spa);
04012 
04013         /*
04014          * Override any spares and level 2 cache devices as specified by
04015          * the user, as these may have correct device names/devids, etc.
04016          */
04017         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
04018             &spares, &nspares) == 0) {
04019                 if (spa->spa_spares.sav_config)
04020                         VERIFY(nvlist_remove(spa->spa_spares.sav_config,
04021                             ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
04022                 else
04023                         VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
04024                             NV_UNIQUE_NAME, KM_SLEEP) == 0);
04025                 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
04026                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
04027                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
04028                 spa_load_spares(spa);
04029                 spa_config_exit(spa, SCL_ALL, FTAG);
04030                 spa->spa_spares.sav_sync = B_TRUE;
04031         }
04032         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
04033             &l2cache, &nl2cache) == 0) {
04034                 if (spa->spa_l2cache.sav_config)
04035                         VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
04036                             ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
04037                 else
04038                         VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
04039                             NV_UNIQUE_NAME, KM_SLEEP) == 0);
04040                 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
04041                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
04042                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
04043                 spa_load_l2cache(spa);
04044                 spa_config_exit(spa, SCL_ALL, FTAG);
04045                 spa->spa_l2cache.sav_sync = B_TRUE;
04046         }
04047 
04048         /*
04049          * Check for any removed devices.
04050          */
04051         if (spa->spa_autoreplace) {
04052                 spa_aux_check_removed(&spa->spa_spares);
04053                 spa_aux_check_removed(&spa->spa_l2cache);
04054         }
04055 
04056         if (spa_writeable(spa)) {
04057                 /*
04058                  * Update the config cache to include the newly-imported pool.
04059                  */
04060                 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
04061         }
04062 
04063         /*
04064          * It's possible that the pool was expanded while it was exported.
04065          * We kick off an async task to handle this for us.
04066          */
04067         spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
04068 
04069         mutex_exit(&spa_namespace_lock);
04070         spa_history_log_version(spa, LOG_POOL_IMPORT);
04071 
04072 #ifdef __FreeBSD__
04073 #ifdef _KERNEL
04074         zvol_create_minors(pool);
04075 #endif
04076 #endif
04077         return (0);
04078 }
04079 
04080 nvlist_t *
04081 spa_tryimport(nvlist_t *tryconfig)
04082 {
04083         nvlist_t *config = NULL;
04084         char *poolname;
04085         spa_t *spa;
04086         uint64_t state;
04087         int error;
04088 
04089         if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
04090                 return (NULL);
04091 
04092         if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
04093                 return (NULL);
04094 
04095         /*
04096          * Create and initialize the spa structure.
04097          */
04098         mutex_enter(&spa_namespace_lock);
04099         spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
04100         spa_activate(spa, FREAD);
04101 
04102         /*
04103          * Pass off the heavy lifting to spa_load().
04104          * Pass TRUE for mosconfig because the user-supplied config
04105          * is actually the one to trust when doing an import.
04106          */
04107         error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
04108 
04109         /*
04110          * If 'tryconfig' was at least parsable, return the current config.
04111          */
04112         if (spa->spa_root_vdev != NULL) {
04113                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
04114                 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
04115                     poolname) == 0);
04116                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
04117                     state) == 0);
04118                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
04119                     spa->spa_uberblock.ub_timestamp) == 0);
04120                 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
04121                     spa->spa_load_info) == 0);
04122 
04123                 /*
04124                  * If the bootfs property exists on this pool then we
04125                  * copy it out so that external consumers can tell which
04126                  * pools are bootable.
04127                  */
04128                 if ((!error || error == EEXIST) && spa->spa_bootfs) {
04129                         char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
04130 
04131                         /*
04132                          * We have to play games with the name since the
04133                          * pool was opened as TRYIMPORT_NAME.
04134                          */
04135                         if (dsl_dsobj_to_dsname(spa_name(spa),
04136                             spa->spa_bootfs, tmpname) == 0) {
04137                                 char *cp;
04138                                 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
04139 
04140                                 cp = strchr(tmpname, '/');
04141                                 if (cp == NULL) {
04142                                         (void) strlcpy(dsname, tmpname,
04143                                             MAXPATHLEN);
04144                                 } else {
04145                                         (void) snprintf(dsname, MAXPATHLEN,
04146                                             "%s/%s", poolname, ++cp);
04147                                 }
04148                                 VERIFY(nvlist_add_string(config,
04149                                     ZPOOL_CONFIG_BOOTFS, dsname) == 0);
04150                                 kmem_free(dsname, MAXPATHLEN);
04151                         }
04152                         kmem_free(tmpname, MAXPATHLEN);
04153                 }
04154 
04155                 /*
04156                  * Add the list of hot spares and level 2 cache devices.
04157                  */
04158                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
04159                 spa_add_spares(spa, config);
04160                 spa_add_l2cache(spa, config);
04161                 spa_config_exit(spa, SCL_CONFIG, FTAG);
04162         }
04163 
04164         spa_unload(spa);
04165         spa_deactivate(spa);
04166         spa_remove(spa);
04167         mutex_exit(&spa_namespace_lock);
04168 
04169         return (config);
04170 }
04171 
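04172 /*
04173  * Pool export/destroy
04174  *
04175  * The act of destroying or exporting a pool is very simple.  We make sure
04176  * there is no more pending I/O and any references to the pool are gone.
04177  * Then we update the pool state and sync all the labels to disk, removing
04178  * the configuration from the cache afterwards.  If the 'hardforce' flag is
04179  * set, then we don't sync the labels or remove the configuration cache.
04180  */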
04181 static int
04182 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
04183     boolean_t force, boolean_t hardforce)
04184 {
04185         spa_t *spa;
04186 
04187         if (oldconfig)
04188                 *oldconfig = NULL;
04189 
04190         if (!(spa_mode_global & FWRITE))
04191                 return (EROFS);
04192 
04193         mutex_enter(&spa_namespace_lock);
04194         if ((spa = spa_lookup(pool)) == NULL) {
04195                 mutex_exit(&spa_namespace_lock);
04196                 return (ENOENT);
04197         }
04198 
04199         /*
04200          * Put a hold on the pool, drop the namespace lock, stop async tasks,
04201          * reacquire the namespace lock, and see if we can export.
04202          */
04203         spa_open_ref(spa, FTAG);
04204         mutex_exit(&spa_namespace_lock);
04205         spa_async_suspend(spa);
04206         mutex_enter(&spa_namespace_lock);
04207         spa_close(spa, FTAG);
04208 
04209         /*
04210          * The pool will be in core if it's openable,
04211          * in which case we can modify its state.
04212          */
04213         if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
04214                 /*
04215                  * Objsets may be open only because they're dirty, so we
04216                  * have to force it to sync before checking spa_refcnt.
04217                  */
04218                 txg_wait_synced(spa->spa_dsl_pool, 0);
04219 
04220                 /*
04221                  * A pool cannot be exported or destroyed if there are active
04222                  * references.  If we are resetting a pool, allow references by
04223                  * fault injection handlers.
04224                  */
04225                 if (!spa_refcount_zero(spa) ||
04226                     (spa->spa_inject_ref != 0 &&
04227                     new_state != POOL_STATE_UNINITIALIZED)) {
04228                         spa_async_resume(spa);
04229                         mutex_exit(&spa_namespace_lock);
04230                         return (EBUSY);
04231                 }
04232 
04233                 /*
04234                  * A pool cannot be exported if it has an active shared spare.
04235                  * This prevents other pools from stealing the active spare
04236                  * from an exported pool.  If the user insists, such a pool
04237                  * can still be forcibly exported.
04238                  */
04239                 if (!force && new_state == POOL_STATE_EXPORTED &&
04240                     spa_has_active_shared_spare(spa)) {
04241                         spa_async_resume(spa);
04242                         mutex_exit(&spa_namespace_lock);
04243                         return (EXDEV);
04244                 }
04245 
04246                 /*
04247                  * We want this to be reflected on every label,
04248                  * so mark them all dirty.  spa_unload() will do the
04249                  * final sync that pushes these changes out.
04250                  */
04251                 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
04252                         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
04253                         spa->spa_state = new_state;
04254                         spa->spa_final_txg = spa_last_synced_txg(spa) +
04255                             TXG_DEFER_SIZE + 1;
04256                         vdev_config_dirty(spa->spa_root_vdev);
04257                         spa_config_exit(spa, SCL_ALL, FTAG);
04258                 }
04259         }
04260 
04261         spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
04262 
04263         if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
04264                 spa_unload(spa);
04265                 spa_deactivate(spa);
04266         }
04267 
04268         if (oldconfig && spa->spa_config)
04269                 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
04270 
04271         if (new_state != POOL_STATE_UNINITIALIZED) {
04272                 if (!hardforce)
04273                         spa_config_sync(spa, B_TRUE, B_TRUE);
04274                 spa_remove(spa);
04275         }
04276         mutex_exit(&spa_namespace_lock);
04277 
04278         return (0);
04279 }
04280 
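04281 /*
04282  * Destroy a storage pool.
04283  */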
04284 int
04285 spa_destroy(char *pool)
04286 {
04287         return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
04288             B_FALSE, B_FALSE));
04289 }
04290 
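04291 /*
04292  * Export a storage pool.
04293  */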
04294 int
04295 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
04296     boolean_t hardforce)
04297 {
04298         return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
04299             force, hardforce));
04300 }
04301 
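04302 /*
04303  * Similar to spa_export(), this unloads the spa_t without actually
04304  * removing it from the namespace in any way.  The pool is reset to the
04305  * uninitialized state so that it can be reloaded; references held by
04306  * fault-injection handlers are tolerated.
04307  */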
04308 int
04309 spa_reset(char *pool)
04310 {
04311         return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
04312             B_FALSE, B_FALSE));
04313 }
04314 
04315 /*
04316  * ==========================================================================
04317  * Device manipulation
04318  * ==========================================================================
04319  */
04320 
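04321 /*
04322  * Add a device to a storage pool.
04323  */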
04324 int
04325 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
04326 {
04327         uint64_t txg, id;
04328         int error;
04329         vdev_t *rvd = spa->spa_root_vdev;
04330         vdev_t *vd, *tvd;
04331         nvlist_t **spares, **l2cache;
04332         uint_t nspares, nl2cache;
04333 
04334         ASSERT(spa_writeable(spa));
04335 
04336         txg = spa_vdev_enter(spa);
04337 
04338         if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
04339             VDEV_ALLOC_ADD)) != 0)
04340                 return (spa_vdev_exit(spa, NULL, txg, error));
04341 
04342         spa->spa_pending_vdev = vd;     /* spa_vdev_exit() will clear this */
04343 
04344         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
04345             &nspares) != 0)
04346                 nspares = 0;
04347 
04348         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
04349             &nl2cache) != 0)
04350                 nl2cache = 0;
04351 
04352         if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
04353                 return (spa_vdev_exit(spa, vd, txg, EINVAL));
04354 
04355         if (vd->vdev_children != 0 &&
04356             (error = vdev_create(vd, txg, B_FALSE)) != 0)
04357                 return (spa_vdev_exit(spa, vd, txg, error));
04358 
04359         /*
04360          * We must validate the spares and l2cache devices after checking the
04361          * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
04362          */
04363         if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
04364                 return (spa_vdev_exit(spa, vd, txg, error));
04365 
04366         /*
04367          * Transfer each new top-level vdev from vd to rvd.
04368          */
04369         for (int c = 0; c < vd->vdev_children; c++) {
04370 
04371                 /*
04372                  * Set the vdev id to the first hole, if one exists.
04373                  */
04374                 for (id = 0; id < rvd->vdev_children; id++) {
04375                         if (rvd->vdev_child[id]->vdev_ishole) {
04376                                 vdev_free(rvd->vdev_child[id]);
04377                                 break;
04378                         }
04379                 }
04380                 tvd = vd->vdev_child[c];
04381                 vdev_remove_child(vd, tvd);
04382                 tvd->vdev_id = id;
04383                 vdev_add_child(rvd, tvd);
04384                 vdev_config_dirty(tvd);
04385         }
04386 
04387         if (nspares != 0) {
04388                 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
04389                     ZPOOL_CONFIG_SPARES);
04390                 spa_load_spares(spa);
04391                 spa->spa_spares.sav_sync = B_TRUE;
04392         }
04393 
04394         if (nl2cache != 0) {
04395                 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
04396                     ZPOOL_CONFIG_L2CACHE);
04397                 spa_load_l2cache(spa);
04398                 spa->spa_l2cache.sav_sync = B_TRUE;
04399         }
04400 
04401         /*
04402          * We have to be careful when adding new vdevs to an existing pool.
04403          * If other threads start allocating from these vdevs before we
04404          * sync the config cache, and we lose power, then upon reboot we may
04405          * fail to open the pool because there are DVAs that the config cache
04406          * can't translate.  Therefore, we first add the vdevs without
04407          * initializing metaslabs; sync the config cache (via spa_vdev_exit());
04408          * and then let spa_config_update() initialize the new metaslabs.
04409          *
04410          * spa_load() checks for added-but-not-initialized vdevs, so that
04411          * if we lose power at any point in this sequence, the remaining
04412          * steps will be completed the next time we load the pool.
04413          */
04414         (void) spa_vdev_exit(spa, vd, txg, 0);
04415 
04416         mutex_enter(&spa_namespace_lock);
04417         spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
04418         mutex_exit(&spa_namespace_lock);
04419 
04420         return (0);
04421 }
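/*
 * A minimal sketch (hypothetical) of the nvroot layout that
 * spa_vdev_add() hands to spa_config_parse(): a root vdev whose
 * ZPOOL_CONFIG_CHILDREN array holds the new top-level vdevs.  Spares
 * and cache devices would instead appear in ZPOOL_CONFIG_SPARES /
 * ZPOOL_CONFIG_L2CACHE arrays.  Error handling is elided for brevity.
 */
static nvlist_t *
make_single_disk_nvroot_sketch(const char *path)
{
        nvlist_t *root, *disk;

        VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
            VDEV_TYPE_DISK) == 0);
        VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH, path) == 0);

        VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE,
            VDEV_TYPE_ROOT) == 0);
        VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
            &disk, 1) == 0);
        nvlist_free(disk);      /* the array add made its own copy */

        return (root);
}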
04422 
04436 int
04437 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
04438 {
04439         uint64_t txg, dtl_max_txg;
04440         vdev_t *rvd = spa->spa_root_vdev;
04441         vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
04442         vdev_ops_t *pvops;
04443         char *oldvdpath, *newvdpath;
04444         int newvd_isspare;
04445         int error;
04446 
04447         ASSERT(spa_writeable(spa));
04448 
04449         txg = spa_vdev_enter(spa);
04450 
04451         oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
04452 
04453         if (oldvd == NULL)
04454                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
04455 
04456         if (!oldvd->vdev_ops->vdev_op_leaf)
04457                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
04458 
04459         pvd = oldvd->vdev_parent;
04460 
04461         if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
04462             VDEV_ALLOC_ATTACH)) != 0)
04463                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
04464 
04465         if (newrootvd->vdev_children != 1)
04466                 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
04467 
04468         newvd = newrootvd->vdev_child[0];
04469 
04470         if (!newvd->vdev_ops->vdev_op_leaf)
04471                 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
04472 
04473         if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
04474                 return (spa_vdev_exit(spa, newrootvd, txg, error));
04475 
04476         /*
04477          * Spares can't replace logs
04478          */
04479         if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
04480                 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
04481 
04482         if (!replacing) {
04483                 /*
04484                  * For attach, the only allowable parent is a mirror or the root
04485                  * vdev.
04486                  */
04487                 if (pvd->vdev_ops != &vdev_mirror_ops &&
04488                     pvd->vdev_ops != &vdev_root_ops)
04489                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
04490 
04491                 pvops = &vdev_mirror_ops;
04492         } else {
04493                 /*
04494                  * Active hot spares can only be replaced by inactive hot
04495                  * spares.
04496                  */
04497                 if (pvd->vdev_ops == &vdev_spare_ops &&
04498                     oldvd->vdev_isspare &&
04499                     !spa_has_spare(spa, newvd->vdev_guid))
04500                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
04501 
04502                 /*
04503                  * If the source is a hot spare, and the parent isn't already a
04504                  * spare, then we want to create a new hot spare.  Otherwise, we
04505                  * want to create a replacing vdev.  The user is not allowed to
04506                  * attach to a spared vdev child unless the 'isspare' state is
04507                  * the same (spare replaces spare, non-spare replaces
04508                  * non-spare).
04509                  */
04510                 if (pvd->vdev_ops == &vdev_replacing_ops &&
04511                     spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
04512                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
04513                 } else if (pvd->vdev_ops == &vdev_spare_ops &&
04514                     newvd->vdev_isspare != oldvd->vdev_isspare) {
04515                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
04516                 }
04517 
04518                 if (newvd->vdev_isspare)
04519                         pvops = &vdev_spare_ops;
04520                 else
04521                         pvops = &vdev_replacing_ops;
04522         }
04523 
04524         /*
04525          * Make sure the new device is big enough.
04526          */
04527         if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
04528                 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
04529 
04530         /*
04531          * The new device cannot have a higher alignment requirement
04532          * than the top-level vdev.
04533          */
04534         if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
04535                 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
04536 
04537         /*
04538          * If this is an in-place replacement, update oldvd's path and devid
04539          * to make it distinguishable from newvd, and unopenable from now on.
04540          */
04541         if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
04542                 spa_strfree(oldvd->vdev_path);
04543                 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
04544                     KM_SLEEP);
04545                 (void) sprintf(oldvd->vdev_path, "%s/%s",
04546                     newvd->vdev_path, "old");
04547                 if (oldvd->vdev_devid != NULL) {
04548                         spa_strfree(oldvd->vdev_devid);
04549                         oldvd->vdev_devid = NULL;
04550                 }
04551         }
04552 
04553         /* mark the device as being resilvered */
04554         newvd->vdev_resilvering = B_TRUE;
04555 
04556         /*
04557          * If the parent is not a mirror, or if we're replacing, insert the new
04558          * mirror/replacing/spare vdev above oldvd.
04559          */
04560         if (pvd->vdev_ops != pvops)
04561                 pvd = vdev_add_parent(oldvd, pvops);
04562 
04563         ASSERT(pvd->vdev_top->vdev_parent == rvd);
04564         ASSERT(pvd->vdev_ops == pvops);
04565         ASSERT(oldvd->vdev_parent == pvd);
04566 
04567         /*
04568          * Extract the new device from its root and add it to pvd.
04569          */
04570         vdev_remove_child(newrootvd, newvd);
04571         newvd->vdev_id = pvd->vdev_children;
04572         newvd->vdev_crtxg = oldvd->vdev_crtxg;
04573         vdev_add_child(pvd, newvd);
04574 
04575         tvd = newvd->vdev_top;
04576         ASSERT(pvd->vdev_top == tvd);
04577         ASSERT(tvd->vdev_parent == rvd);
04578 
04579         vdev_config_dirty(tvd);
04580 
04581         /*
04582          * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
04583          * for any dmu_sync-ed blocks.  It will propagate upward when
04584          * spa_vdev_exit() calls vdev_dtl_reassess().
04585          */
04586         dtl_max_txg = txg + TXG_CONCURRENT_STATES;
04587 
04588         vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
04589             dtl_max_txg - TXG_INITIAL);
04590 
04591         if (newvd->vdev_isspare) {
04592                 spa_spare_activate(newvd);
04593                 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
04594         }
04595 
04596         oldvdpath = spa_strdup(oldvd->vdev_path);
04597         newvdpath = spa_strdup(newvd->vdev_path);
04598         newvd_isspare = newvd->vdev_isspare;
04599 
04600         /*
04601          * Mark newvd's DTL dirty in this txg.
04602          */
04603         vdev_dirty(tvd, VDD_DTL, newvd, txg);
04604 
04605         /*
04606          * Restart the resilver
04607          */
04608         dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
04609 
04610         /*
04611          * Commit the config
04612          */
04613         (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
04614 
04615         spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
04616             "%s vdev=%s %s vdev=%s",
04617             replacing && newvd_isspare ? "spare in" :
04618             replacing ? "replace" : "attach", newvdpath,
04619             replacing ? "for" : "to", oldvdpath);
04620 
04621         spa_strfree(oldvdpath);
04622         spa_strfree(newvdpath);
04623 
04624         if (spa->spa_bootfs)
04625                 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
04626 
04627         return (0);
04628 }
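/*
 * A minimal sketch (hypothetical) of driving spa_vdev_attach(): attach
 * the new disk as a mirror of the leaf identified by guid, or, with
 * replacing != 0, begin replacing that leaf.  This reuses the
 * hypothetical nvroot builder shown after spa_vdev_add() above.
 */
static int
attach_or_replace_sketch(spa_t *spa, uint64_t guid, const char *newpath,
    int replacing)
{
        nvlist_t *nvroot = make_single_disk_nvroot_sketch(newpath);
        int error = spa_vdev_attach(spa, guid, nvroot, replacing);

        nvlist_free(nvroot);
        return (error);
}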
04629 
04636 int
04637 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
04638 {
04639         uint64_t txg;
04640         int error;
04641         vdev_t *rvd = spa->spa_root_vdev;
04642         vdev_t *vd, *pvd, *cvd, *tvd;
04643         boolean_t unspare = B_FALSE;
04644         uint64_t unspare_guid;
04645         char *vdpath;
04646 
04647         ASSERT(spa_writeable(spa));
04648 
04649         txg = spa_vdev_enter(spa);
04650 
04651         vd = spa_lookup_by_guid(spa, guid, B_FALSE);
04652 
04653         if (vd == NULL)
04654                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
04655 
04656         if (!vd->vdev_ops->vdev_op_leaf)
04657                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
04658 
04659         pvd = vd->vdev_parent;
04660 
04661         /*
04662          * If the parent/child relationship is not as expected, don't do it.
04663          * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
04664          * vdev that's replacing B with C.  The user's intent in replacing
04665          * is to go from M(A,B) to M(A,C).  If the user decides to cancel
04666          * the replace by detaching C, the expected behavior is to end up
04667          * M(A,B).  But suppose that right after deciding to detach C,
04668          * the replacement of B completes.  We would have M(A,C), and then
04669          * ask to detach C, which would leave us with just A -- not what
04670          * the user wanted.  To prevent this, we make sure that the
04671          * parent/child relationship hasn't changed -- in this example,
04672          * that C's parent is still the replacing vdev R.
04673          */
04674         if (pvd->vdev_guid != pguid && pguid != 0)
04675                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
04676 
04677         /*
04678          * Only 'replacing' or 'spare' vdevs can be replaced.
04679          */
04680         if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
04681             pvd->vdev_ops != &vdev_spare_ops)
04682                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
04683 
04684         ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
04685             spa_version(spa) >= SPA_VERSION_SPARES);
04686 
04687         /*
04688          * Only mirror, replacing, and spare vdevs support detach.
04689          */
04690         if (pvd->vdev_ops != &vdev_replacing_ops &&
04691             pvd->vdev_ops != &vdev_mirror_ops &&
04692             pvd->vdev_ops != &vdev_spare_ops)
04693                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
04694 
04695         /*
04696          * If this device has the only valid copy of some data,
04697          * we cannot safely detach it.
04698          */
04699         if (vdev_dtl_required(vd))
04700                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
04701 
04702         ASSERT(pvd->vdev_children >= 2);
04703 
04704         /*
04705          * If we are detaching the second disk from a replacing vdev, then
04706          * check to see if we changed the original vdev's path to have "/old"
04707          * at the end in spa_vdev_attach().  If so, undo that change now.
04708          */
04709         if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
04710             vd->vdev_path != NULL) {
04711                 size_t len = strlen(vd->vdev_path);
04712 
04713                 for (int c = 0; c < pvd->vdev_children; c++) {
04714                         cvd = pvd->vdev_child[c];
04715 
04716                         if (cvd == vd || cvd->vdev_path == NULL)
04717                                 continue;
04718 
04719                         if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
04720                             strcmp(cvd->vdev_path + len, "/old") == 0) {
04721                                 spa_strfree(cvd->vdev_path);
04722                                 cvd->vdev_path = spa_strdup(vd->vdev_path);
04723                                 break;
04724                         }
04725                 }
04726         }
04727 
04728         /*
04729          * If we are detaching the original disk from a spare, then it implies
04730          * that the spare should become a real disk, and be removed from the
04731          * active spare list for the pool.
04732          */
04733         if (pvd->vdev_ops == &vdev_spare_ops &&
04734             vd->vdev_id == 0 &&
04735             pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
04736                 unspare = B_TRUE;
04737 
04738         /*
04739          * Erase the disk labels so the disk can be used for other things.
04740          * This must be done after all other error cases are handled,
04741          * but before we disembowel vd (so we can still do I/O to it).
04742          * But if we can't do it, don't treat the error as fatal --
04743          * it may be that the unwritability of the disk is the reason
04744          * it's being detached!
04745          */
04746         error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
04747 
04748         /*
04749          * Remove vd from its parent and compact the parent's children.
04750          */
04751         vdev_remove_child(pvd, vd);
04752         vdev_compact_children(pvd);
04753 
04754         /*
04755          * Remember one of the remaining children so we can get tvd below.
04756          */
04757         cvd = pvd->vdev_child[pvd->vdev_children - 1];
04758 
04759         /*
04760          * If we need to remove the remaining child from the list of hot spares,
04761          * do it now, marking the vdev as no longer a spare in the process.
04762          * We must do this before vdev_remove_parent(), because that can
04763          * change the GUID if it creates a new toplevel GUID.  For a similar
04764          * reason, we must remove the spare now, in the same txg as the detach;
04765          * otherwise someone could attach a new sibling, change the GUID, and
04766          * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
04767          */
04768         if (unspare) {
04769                 ASSERT(cvd->vdev_isspare);
04770                 spa_spare_remove(cvd);
04771                 unspare_guid = cvd->vdev_guid;
04772                 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
04773                 cvd->vdev_unspare = B_TRUE;
04774         }
04775 
04776         /*
04777          * If the parent mirror/replacing vdev only has one child,
04778          * the parent is no longer needed.  Remove it from the tree.
04779          */
04780         if (pvd->vdev_children == 1) {
04781                 if (pvd->vdev_ops == &vdev_spare_ops)
04782                         cvd->vdev_unspare = B_FALSE;
04783                 vdev_remove_parent(cvd);
04784                 cvd->vdev_resilvering = B_FALSE;
04785         }
04786 
04788         /*
04789          * We don't set tvd until now because the parent we just removed
04790          * may have been the previous top-level vdev.
04791          */
04792         tvd = cvd->vdev_top;
04793         ASSERT(tvd->vdev_parent == rvd);
04794 
04795         /*
04796          * Reevaluate the parent vdev state.
04797          */
04798         vdev_propagate_state(cvd);
04799 
04800         /*
04801          * If the 'autoexpand' property is set on the pool then automatically
04802          * try to expand the size of the pool. For example if the device we
04803          * just detached was smaller than the others, it may be possible to
04804          * add metaslabs (i.e. grow the pool). We need to reopen the vdev
04805          * first so that we can obtain the updated sizes of the leaf vdevs.
04806          */
04807         if (spa->spa_autoexpand) {
04808                 vdev_reopen(tvd);
04809                 vdev_expand(tvd, txg);
04810         }
04811 
04812         vdev_config_dirty(tvd);
04813 
04814         /*
04815          * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
04816          * vd->vdev_detached is set and free vd's DTL object in syncing context.
04817          * But first make sure we're not on any *other* txg's DTL list, to
04818          * prevent vd from being accessed after it's freed.
04819          */
04820         vdpath = spa_strdup(vd->vdev_path);
04821         for (int t = 0; t < TXG_SIZE; t++)
04822                 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
04823         vd->vdev_detached = B_TRUE;
04824         vdev_dirty(tvd, VDD_DTL, vd, txg);
04825 
04826         spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
04827 
04828         /* hang on to the spa before we release the lock */
04829         spa_open_ref(spa, FTAG);
04830 
04831         error = spa_vdev_exit(spa, vd, txg, 0);
04832 
04833         spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
04834             "vdev=%s", vdpath);
04835         spa_strfree(vdpath);
04836 
04837         /*
04838          * If this was the removal of the original device in a hot spare vdev,
04839          * then we want to go through and remove the device from the hot spare
04840          * list of every other pool.
04841          */
04842         if (unspare) {
04843                 spa_t *altspa = NULL;
04844 
04845                 mutex_enter(&spa_namespace_lock);
04846                 while ((altspa = spa_next(altspa)) != NULL) {
04847                         if (altspa->spa_state != POOL_STATE_ACTIVE ||
04848                             altspa == spa)
04849                                 continue;
04850 
04851                         spa_open_ref(altspa, FTAG);
04852                         mutex_exit(&spa_namespace_lock);
04853                         (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
04854                         mutex_enter(&spa_namespace_lock);
04855                         spa_close(altspa, FTAG);
04856                 }
04857                 mutex_exit(&spa_namespace_lock);
04858 
04859                 /* search the rest of the vdevs for spares to remove */
04860                 spa_vdev_resilver_done(spa);
04861         }
04862 
04863         /* all done with the spa; OK to release */
04864         mutex_enter(&spa_namespace_lock);
04865         spa_close(spa, FTAG);
04866         mutex_exit(&spa_namespace_lock);
04867 
04868         return (error);
04869 }
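/*
 * A minimal sketch (hypothetical) of cancelling an in-flight replace,
 * per the M(A,R(B,C)) discussion above: passing the replacing vdev's
 * guid as pguid makes the detach fail with EBUSY if the topology
 * changed underneath us, and replace_done is B_FALSE because this is
 * an explicit user-initiated detach.
 */
static int
cancel_replace_sketch(spa_t *spa, uint64_t newvd_guid, uint64_t replacing_guid)
{
        return (spa_vdev_detach(spa, newvd_guid, replacing_guid, B_FALSE));
}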
04870 
04874 int
04875 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
04876     nvlist_t *props, boolean_t exp)
04877 {
04878         int error = 0;
04879         uint64_t txg, *glist;
04880         spa_t *newspa;
04881         uint_t c, children, lastlog;
04882         nvlist_t **child, *nvl, *tmp;
04883         dmu_tx_t *tx;
04884         char *altroot = NULL;
04885         vdev_t *rvd, **vml = NULL;                      /* vdev modify list */
04886         boolean_t activate_slog;
04887 
04888         ASSERT(spa_writeable(spa));
04889 
04890         txg = spa_vdev_enter(spa);
04891 
04892         /* clear the log and flush everything up to now */
04893         activate_slog = spa_passivate_log(spa);
04894         (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
04895         error = spa_offline_log(spa);
04896         txg = spa_vdev_config_enter(spa);
04897 
04898         if (activate_slog)
04899                 spa_activate_log(spa);
04900 
04901         if (error != 0)
04902                 return (spa_vdev_exit(spa, NULL, txg, error));
04903 
04904         /* check new spa name before going any further */
04905         if (spa_lookup(newname) != NULL)
04906                 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
04907 
04908         /*
04909          * scan through all the children to ensure they're all mirrors
04910          */
04911         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
04912             nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
04913             &children) != 0)
04914                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
04915 
04916         /* first, check to ensure we've got the right child count */
04917         rvd = spa->spa_root_vdev;
04918         lastlog = 0;
04919         for (c = 0; c < rvd->vdev_children; c++) {
04920                 vdev_t *vd = rvd->vdev_child[c];
04921 
04922                 /* don't count the holes & logs as children */
04923                 if (vd->vdev_islog || vd->vdev_ishole) {
04924                         if (lastlog == 0)
04925                                 lastlog = c;
04926                         continue;
04927                 }
04928 
04929                 lastlog = 0;
04930         }
04931         if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
04932                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
04933 
04934         /* next, ensure no spare or cache devices are part of the split */
04935         if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
04936             nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
04937                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
04938 
04939         vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
04940         glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
04941 
04942         /* then, loop over each vdev and validate it */
04943         for (c = 0; c < children; c++) {
04944                 uint64_t is_hole = 0;
04945 
04946                 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
04947                     &is_hole);
04948 
04949                 if (is_hole != 0) {
04950                         if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
04951                             spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
04952                                 continue;
04953                         } else {
04954                                 error = EINVAL;
04955                                 break;
04956                         }
04957                 }
04958 
04959                 /* which disk is going to be split? */
04960                 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
04961                     &glist[c]) != 0) {
04962                         error = EINVAL;
04963                         break;
04964                 }
04965 
04966                 /* look it up in the spa */
04967                 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
04968                 if (vml[c] == NULL) {
04969                         error = ENODEV;
04970                         break;
04971                 }
04972 
04973                 /* make sure there's nothing stopping the split */
04974                 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
04975                     vml[c]->vdev_islog ||
04976                     vml[c]->vdev_ishole ||
04977                     vml[c]->vdev_isspare ||
04978                     vml[c]->vdev_isl2cache ||
04979                     !vdev_writeable(vml[c]) ||
04980                     vml[c]->vdev_children != 0 ||
04981                     vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
04982                     c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
04983                         error = EINVAL;
04984                         break;
04985                 }
04986 
04987                 if (vdev_dtl_required(vml[c])) {
04988                         error = EBUSY;
04989                         break;
04990                 }
04991 
04992                 /* we need certain info from the top level */
04993                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
04994                     vml[c]->vdev_top->vdev_ms_array) == 0);
04995                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
04996                     vml[c]->vdev_top->vdev_ms_shift) == 0);
04997                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
04998                     vml[c]->vdev_top->vdev_asize) == 0);
04999                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
05000                     vml[c]->vdev_top->vdev_ashift) == 0);
05001         }
05002 
05003         if (error != 0) {
05004                 kmem_free(vml, children * sizeof (vdev_t *));
05005                 kmem_free(glist, children * sizeof (uint64_t));
05006                 return (spa_vdev_exit(spa, NULL, txg, error));
05007         }
05008 
05009         /* stop writers from using the disks */
05010         for (c = 0; c < children; c++) {
05011                 if (vml[c] != NULL)
05012                         vml[c]->vdev_offline = B_TRUE;
05013         }
05014         vdev_reopen(spa->spa_root_vdev);
05015 
05016         /*
05017          * Temporarily record the splitting vdevs in the spa config.  This
05018          * will disappear once the config is regenerated.
05019          */
05020         VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
05021         VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
05022             glist, children) == 0);
05023         kmem_free(glist, children * sizeof (uint64_t));
05024 
05025         mutex_enter(&spa->spa_props_lock);
05026         VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
05027             nvl) == 0);
05028         mutex_exit(&spa->spa_props_lock);
05029         spa->spa_config_splitting = nvl;
05030         vdev_config_dirty(spa->spa_root_vdev);
05031 
05032         /* configure and create the new pool */
05033         VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
05034         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
05035             exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
05036         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
05037             spa_version(spa)) == 0);
05038         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
05039             spa->spa_config_txg) == 0);
05040         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
05041             spa_generate_guid(NULL)) == 0);
05042         (void) nvlist_lookup_string(props,
05043             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
05044 
05045         /* add the new pool to the namespace */
05046         newspa = spa_add(newname, config, altroot);
05047         newspa->spa_config_txg = spa->spa_config_txg;
05048         spa_set_log_state(newspa, SPA_LOG_CLEAR);
05049 
05050         /* release the spa config lock, retaining the namespace lock */
05051         spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
05052 
05053         if (zio_injection_enabled)
05054                 zio_handle_panic_injection(spa, FTAG, 1);
05055 
05056         spa_activate(newspa, spa_mode_global);
05057         spa_async_suspend(newspa);
05058 
05059 #ifndef sun
05060         /* mark that we are creating new spa by splitting */
05061         newspa->spa_splitting_newspa = B_TRUE;
05062 #endif
05063         /* create the new pool from the disks of the original pool */
05064         error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
05065 #ifndef sun
05066         newspa->spa_splitting_newspa = B_FALSE;
05067 #endif
05068         if (error)
05069                 goto out;
05070 
05071         /* if that worked, generate a real config for the new pool */
05072         if (newspa->spa_root_vdev != NULL) {
05073                 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
05074                     NV_UNIQUE_NAME, KM_SLEEP) == 0);
05075                 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
05076                     ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
05077                 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
05078                     B_TRUE));
05079         }
05080 
05081         /* set the props */
05082         if (props != NULL) {
05083                 spa_configfile_set(newspa, props, B_FALSE);
05084                 error = spa_prop_set(newspa, props);
05085                 if (error)
05086                         goto out;
05087         }
05088 
05089         /* flush everything */
05090         txg = spa_vdev_config_enter(newspa);
05091         vdev_config_dirty(newspa->spa_root_vdev);
05092         (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
05093 
05094         if (zio_injection_enabled)
05095                 zio_handle_panic_injection(spa, FTAG, 2);
05096 
05097         spa_async_resume(newspa);
05098 
05099         /* finally, update the original pool's config */
05100         txg = spa_vdev_config_enter(spa);
05101         tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
05102         error = dmu_tx_assign(tx, TXG_WAIT);
05103         if (error != 0)
05104                 dmu_tx_abort(tx);
05105         for (c = 0; c < children; c++) {
05106                 if (vml[c] != NULL) {
05107                         vdev_split(vml[c]);
05108                         if (error == 0)
05109                                 spa_history_log_internal(LOG_POOL_VDEV_DETACH,
05110                                     spa, tx, "vdev=%s",
05111                                     vml[c]->vdev_path);
05112                         vdev_free(vml[c]);
05113                 }
05114         }
05115         vdev_config_dirty(spa->spa_root_vdev);
05116         spa->spa_config_splitting = NULL;
05117         nvlist_free(nvl);
05118         if (error == 0)
05119                 dmu_tx_commit(tx);
05120         (void) spa_vdev_exit(spa, NULL, txg, 0);
05121 
05122         if (zio_injection_enabled)
05123                 zio_handle_panic_injection(spa, FTAG, 3);
05124 
05125         /* split is complete; log a history record */
05126         spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
05127             "split new pool %s from pool %s", newname, spa_name(spa));
05128 
05129         kmem_free(vml, children * sizeof (vdev_t *));
05130 
05131         /* if we're not going to mount the filesystems in userland, export */
05132         if (exp)
05133                 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
05134                     B_FALSE, B_FALSE);
05135 
05136         return (error);
05137 
05138 out:
05139         spa_unload(newspa);
05140         spa_deactivate(newspa);
05141         spa_remove(newspa);
05142 
05143         txg = spa_vdev_config_enter(spa);
05144 
05145         /* re-online all offlined disks */
05146         for (c = 0; c < children; c++) {
05147                 if (vml[c] != NULL)
05148                         vml[c]->vdev_offline = B_FALSE;
05149         }
05150         vdev_reopen(spa->spa_root_vdev);
05151 
05152         nvlist_free(spa->spa_config_splitting);
05153         spa->spa_config_splitting = NULL;
05154         (void) spa_vdev_exit(spa, NULL, txg, error);
05155 
05156         kmem_free(vml, children * sizeof (vdev_t *));
05157         return (error);
05158 }
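/*
 * A minimal sketch (hypothetical) of the config spa_vdev_split_mirror()
 * expects: ZPOOL_CONFIG_VDEV_TREE with one child per top-level mirror,
 * each child naming (by ZPOOL_CONFIG_GUID) the leaf to split off, in
 * root-vdev child order.  Error handling is elided for brevity.
 */
static nvlist_t *
make_split_config_sketch(uint64_t *guids, uint_t nguids)
{
        nvlist_t *config, *tree, **child;

        child = kmem_zalloc(nguids * sizeof (nvlist_t *), KM_SLEEP);
        for (uint_t c = 0; c < nguids; c++) {
                VERIFY(nvlist_alloc(&child[c], NV_UNIQUE_NAME, KM_SLEEP) == 0);
                VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_GUID,
                    guids[c]) == 0);
        }

        VERIFY(nvlist_alloc(&tree, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
            child, nguids) == 0);

        VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, tree) == 0);

        for (uint_t c = 0; c < nguids; c++)
                nvlist_free(child[c]);
        nvlist_free(tree);
        kmem_free(child, nguids * sizeof (nvlist_t *));
        return (config);
}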
05159 
05160 static nvlist_t *
05161 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
05162 {
05163         for (int i = 0; i < count; i++) {
05164                 uint64_t guid;
05165 
05166                 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
05167                     &guid) == 0);
05168 
05169                 if (guid == target_guid)
05170                         return (nvpp[i]);
05171         }
05172 
05173         return (NULL);
05174 }
05175 
05176 static void
05177 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
05178         nvlist_t *dev_to_remove)
05179 {
05180         nvlist_t **newdev = NULL;
05181 
05182         if (count > 1)
05183                 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
05184 
05185         for (int i = 0, j = 0; i < count; i++) {
05186                 if (dev[i] == dev_to_remove)
05187                         continue;
05188                 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
05189         }
05190 
05191         VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
05192         VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
05193 
05194         for (int i = 0; i < count - 1; i++)
05195                 nvlist_free(newdev[i]);
05196 
05197         if (count > 1)
05198                 kmem_free(newdev, (count - 1) * sizeof (void *));
05199 }
05200 
05204 static int
05205 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
05206 {
05207         uint64_t txg;
05208         int error = 0;
05209 
05210         ASSERT(MUTEX_HELD(&spa_namespace_lock));
05211         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
05212         ASSERT(vd == vd->vdev_top);
05213 
05214         /*
05215          * Evacuate the device.  We don't hold the config lock as writer
05216          * since we need to do I/O, but we do keep the
05217          * spa_namespace_lock held.  Once this completes, the device
05218          * should no longer have any blocks allocated on it.
05219          */
05220         if (vd->vdev_islog) {
05221                 if (vd->vdev_stat.vs_alloc != 0)
05222                         error = spa_offline_log(spa);
05223         } else {
05224                 error = ENOTSUP;
05225         }
05226 
05227         if (error)
05228                 return (error);
05229 
05230         /*
05231          * The evacuation succeeded.  Remove any remaining MOS metadata
05232          * associated with this vdev, and wait for these changes to sync.
05233          */
05234         ASSERT0(vd->vdev_stat.vs_alloc);
05235         txg = spa_vdev_config_enter(spa);
05236         vd->vdev_removing = B_TRUE;
05237         vdev_dirty(vd, 0, NULL, txg);
05238         vdev_config_dirty(vd);
05239         spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
05240 
05241         return (0);
05242 }
05243 
05247 static void
05248 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
05249 {
05250         vdev_t *rvd = spa->spa_root_vdev;
05251         uint64_t id = vd->vdev_id;
05252         boolean_t last_vdev = (id == (rvd->vdev_children - 1));
05253 
05254         ASSERT(MUTEX_HELD(&spa_namespace_lock));
05255         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
05256         ASSERT(vd == vd->vdev_top);
05257 
05258         /*
05259          * Only remove devices that are empty.
05260          */
05261         if (vd->vdev_stat.vs_alloc != 0)
05262                 return;
05263 
05264         (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
05265 
05266         if (list_link_active(&vd->vdev_state_dirty_node))
05267                 vdev_state_clean(vd);
05268         if (list_link_active(&vd->vdev_config_dirty_node))
05269                 vdev_config_clean(vd);
05270 
05271         vdev_free(vd);
05272 
05273         if (last_vdev) {
05274                 vdev_compact_children(rvd);
05275         } else {
05276                 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
05277                 vdev_add_child(rvd, vd);
05278         }
05279         vdev_config_dirty(rvd);
05280 
05281         /*
05282          * Reassess the health of our root vdev.
05283          */
05284         vdev_reopen(rvd);
05285 }
05286 
05299 int
05300 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
05301 {
05302         vdev_t *vd;
05303         metaslab_group_t *mg;
05304         nvlist_t **spares, **l2cache, *nv;
05305         uint64_t txg = 0;
05306         uint_t nspares, nl2cache;
05307         int error = 0;
05308         boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
05309 
05310         ASSERT(spa_writeable(spa));
05311 
05312         if (!locked)
05313                 txg = spa_vdev_enter(spa);
05314 
05315         vd = spa_lookup_by_guid(spa, guid, B_FALSE);
05316 
05317         if (spa->spa_spares.sav_vdevs != NULL &&
05318             nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
05319             ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
05320             (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
05321                 /*
05322                  * Only remove the hot spare if it's not currently in use
05323                  * in this pool.
05324                  */
05325                 if (vd == NULL || unspare) {
05326                         spa_vdev_remove_aux(spa->spa_spares.sav_config,
05327                             ZPOOL_CONFIG_SPARES, spares, nspares, nv);
05328                         spa_load_spares(spa);
05329                         spa->spa_spares.sav_sync = B_TRUE;
05330                 } else {
05331                         error = EBUSY;
05332                 }
05333         } else if (spa->spa_l2cache.sav_vdevs != NULL &&
05334             nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
05335             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
05336             (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
05337                 /*
05338                  * Cache devices can always be removed.
05339                  */
05340                 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
05341                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
05342                 spa_load_l2cache(spa);
05343                 spa->spa_l2cache.sav_sync = B_TRUE;
05344         } else if (vd != NULL && vd->vdev_islog) {
05345                 ASSERT(!locked);
05346                 ASSERT(vd == vd->vdev_top);
05347 
05348                 /*
05349                  * XXX - Once we have bp-rewrite this should
05350                  * become the common case.
05351                  */
05352 
05353                 mg = vd->vdev_mg;
05354 
05355                 /*
05356                  * Stop allocating from this vdev.
05357                  */
05358                 metaslab_group_passivate(mg);
05359 
05360                 /*
05361                  * Wait for the youngest allocations and frees to sync,
05362                  * and then wait for the deferral of those frees to finish.
05363                  */
05364                 spa_vdev_config_exit(spa, NULL,
05365                     txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
05366 
05367                 /*
05368                  * Attempt to evacuate the vdev.
05369                  */
05370                 error = spa_vdev_remove_evacuate(spa, vd);
05371 
05372                 txg = spa_vdev_config_enter(spa);
05373 
05374                 /*
05375                  * If we couldn't evacuate the vdev, unwind.
05376                  */
05377                 if (error) {
05378                         metaslab_group_activate(mg);
05379                         return (spa_vdev_exit(spa, NULL, txg, error));
05380                 }
05381 
05382                 /*
05383                  * Clean up the vdev namespace.
05384                  */
05385                 spa_vdev_remove_from_namespace(spa, vd);
05386 
05387         } else if (vd != NULL) {
05388                 /*
05389                  * Normal vdevs cannot be removed (yet).
05390                  */
05391                 error = ENOTSUP;
05392         } else {
05393                 /*
05394                  * There is no vdev of any kind with the specified guid.
05395                  */
05396                 error = ENOENT;
05397         }
05398 
05399         if (!locked)
05400                 return (spa_vdev_exit(spa, NULL, txg, error));
05401 
05402         return (error);
05403 }
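/*
 * A minimal sketch (hypothetical) of the common removal case handled
 * above: cache devices can always be removed, while a hot spare can
 * only be removed when inactive unless unspare is set (the path
 * spa_vdev_detach() uses to reclaim a shared spare).
 */
static int
remove_aux_device_sketch(spa_t *spa, uint64_t guid)
{
        return (spa_vdev_remove(spa, guid, B_FALSE));
}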
05404 
05409 static vdev_t *
05410 spa_vdev_resilver_done_hunt(vdev_t *vd)
05411 {
05412         vdev_t *newvd, *oldvd;
05413 
05414         for (int c = 0; c < vd->vdev_children; c++) {
05415                 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
05416                 if (oldvd != NULL)
05417                         return (oldvd);
05418         }
05419 
05420         /*
05421          * Check for a completed replacement.  We always consider the first
05422          * vdev in the list to be the oldest vdev, and the last one to be
05423          * the newest (see spa_vdev_attach() for how that works).  In
05424          * the case where the newest vdev is faulted, we will not automatically
05425          * remove it after a resilver completes.  This is OK as it will require
05426          * user intervention to determine which disk the admin wishes to keep.
05427          */
05428         if (vd->vdev_ops == &vdev_replacing_ops) {
05429                 ASSERT(vd->vdev_children > 1);
05430 
05431                 newvd = vd->vdev_child[vd->vdev_children - 1];
05432                 oldvd = vd->vdev_child[0];
05433 
05434                 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
05435                     vdev_dtl_empty(newvd, DTL_OUTAGE) &&
05436                     !vdev_dtl_required(oldvd))
05437                         return (oldvd);
05438         }
05439 
05440         /*
05441          * Check for a completed resilver with the 'unspare' flag set.
05442          */
05443         if (vd->vdev_ops == &vdev_spare_ops) {
05444                 vdev_t *first = vd->vdev_child[0];
05445                 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
05446 
05447                 if (last->vdev_unspare) {
05448                         oldvd = first;
05449                         newvd = last;
05450                 } else if (first->vdev_unspare) {
05451                         oldvd = last;
05452                         newvd = first;
05453                 } else {
05454                         oldvd = NULL;
05455                 }
05456 
05457                 if (oldvd != NULL &&
05458                     vdev_dtl_empty(newvd, DTL_MISSING) &&
05459                     vdev_dtl_empty(newvd, DTL_OUTAGE) &&
05460                     !vdev_dtl_required(oldvd))
05461                         return (oldvd);
05462 
05463                 /*
05464                  * If more than one spare is attached to a disk (the vdev
05465                  * has more than two children) and those spares are not
05466                  * required, then we attempt to free them up now so that
05467                  * they can be used by other pools.  Once we're back down
05468                  * to a single disk+spare, we stop removing them.
05469                  */
05470                 if (vd->vdev_children > 2) {
05471                         newvd = vd->vdev_child[1];
05472 
05473                         if (newvd->vdev_isspare && last->vdev_isspare &&
05474                             vdev_dtl_empty(last, DTL_MISSING) &&
05475                             vdev_dtl_empty(last, DTL_OUTAGE) &&
05476                             !vdev_dtl_required(newvd))
05477                                 return (newvd);
05478                 }
05479         }
05480 
05481         return (NULL);
05482 }
05483 
05484 static void
05485 spa_vdev_resilver_done(spa_t *spa)
05486 {
05487         vdev_t *vd, *pvd, *ppvd;
05488         uint64_t guid, sguid, pguid, ppguid;
05489 
05490         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
05491 
05492         while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
05493                 pvd = vd->vdev_parent;
05494                 ppvd = pvd->vdev_parent;
05495                 guid = vd->vdev_guid;
05496                 pguid = pvd->vdev_guid;
05497                 ppguid = ppvd->vdev_guid;
05498                 sguid = 0;
05499                 /*
05500                  * If we have just finished replacing a hot spared device, then
05501                  * we need to detach the parent's first child (the original hot
05502                  * spare) as well.
05503                  */
05504                 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
05505                     ppvd->vdev_children == 2) {
05506                         ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
05507                         sguid = ppvd->vdev_child[1]->vdev_guid;
05508                 }
05509                 spa_config_exit(spa, SCL_ALL, FTAG);
05510                 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
05511                         return;
05512                 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
05513                         return;
05514                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
05515         }
05516 
05517         spa_config_exit(spa, SCL_ALL, FTAG);
05518 }
05519 
05523 int
05524 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
05525     boolean_t ispath)
05526 {
05527         vdev_t *vd;
05528         boolean_t sync = B_FALSE;
05529 
05530         ASSERT(spa_writeable(spa));
05531 
05532         spa_vdev_state_enter(spa, SCL_ALL);
05533 
05534         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
05535                 return (spa_vdev_state_exit(spa, NULL, ENOENT));
05536 
05537         if (!vd->vdev_ops->vdev_op_leaf)
05538                 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
05539 
05540         if (ispath) {
05541                 if (strcmp(value, vd->vdev_path) != 0) {
05542                         spa_strfree(vd->vdev_path);
05543                         vd->vdev_path = spa_strdup(value);
05544                         sync = B_TRUE;
05545                 }
05546         } else {
05547                 if (vd->vdev_fru == NULL) {
05548                         vd->vdev_fru = spa_strdup(value);
05549                         sync = B_TRUE;
05550                 } else if (strcmp(value, vd->vdev_fru) != 0) {
05551                         spa_strfree(vd->vdev_fru);
05552                         vd->vdev_fru = spa_strdup(value);
05553                         sync = B_TRUE;
05554                 }
05555         }
05556 
05557         return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
05558 }
05559 
05560 int
05561 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
05562 {
05563         return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
05564 }
05565 
05566 int
05567 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
05568 {
05569         return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
05570 }
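/*
 * A minimal sketch (hypothetical): recording a leaf vdev's new device
 * path after recabling.  Both wrappers reduce to spa_vdev_set_common(),
 * which only syncs a config update when the value actually changed.
 */
static int
update_leaf_path_sketch(spa_t *spa, uint64_t guid, const char *newpath)
{
        return (spa_vdev_setpath(spa, guid, newpath));
}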
05571 
05572 /*
05573  * ==========================================================================
05574  * SPA Scanning
05575  * ==========================================================================
05576  */
05577 
05578 int
05579 spa_scan_stop(spa_t *spa)
05580 {
05581         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
05582         if (dsl_scan_resilvering(spa->spa_dsl_pool))
05583                 return (EBUSY);
05584         return (dsl_scan_cancel(spa->spa_dsl_pool));
05585 }
05586 
05587 int
05588 spa_scan(spa_t *spa, pool_scan_func_t func)
05589 {
05590         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
05591 
05592         if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
05593                 return (ENOTSUP);
05594 
05595         /*
05596          * If a resilver was requested, but there is no DTL on a
05597          * writeable leaf device, we have nothing to do.
05598          */
05599         if (func == POOL_SCAN_RESILVER &&
05600             !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
05601                 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
05602                 return (0);
05603         }
05604 
05605         return (dsl_scan(spa->spa_dsl_pool, func));
05606 }
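/*
 * A minimal sketch (hypothetical): starting a scrub and then cancelling
 * it.  POOL_SCAN_SCRUB is the standard pool_scan_func_t value for a
 * scrub; note that spa_scan_stop() returns EBUSY while a resilver is
 * in progress.
 */
static int
scrub_then_cancel_sketch(spa_t *spa)
{
        int error = spa_scan(spa, POOL_SCAN_SCRUB);

        if (error == 0)
                error = spa_scan_stop(spa);
        return (error);
}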
05607 
05608 /*
05609  * ==========================================================================
05610  * SPA async task processing
05611  * ==========================================================================
05612  */
05613 
05614 static void
05615 spa_async_remove(spa_t *spa, vdev_t *vd)
05616 {
05617         if (vd->vdev_remove_wanted) {
05618                 vd->vdev_remove_wanted = B_FALSE;
05619                 vd->vdev_delayed_close = B_FALSE;
05620                 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
05621 
05622                 /*
05623                  * We want to clear the stats, but we don't want to do a full
05624                  * vdev_clear() as that will cause us to throw away
05625                  * degraded/faulted state as well as attempt to reopen the
05626                  * device, all of which is a waste.
05627                  */
05628                 vd->vdev_stat.vs_read_errors = 0;
05629                 vd->vdev_stat.vs_write_errors = 0;
05630                 vd->vdev_stat.vs_checksum_errors = 0;
05631 
05632                 vdev_state_dirty(vd->vdev_top);
05633         }
05634 
05635         for (int c = 0; c < vd->vdev_children; c++)
05636                 spa_async_remove(spa, vd->vdev_child[c]);
05637 }
05638 
05639 static void
05640 spa_async_probe(spa_t *spa, vdev_t *vd)
05641 {
05642         if (vd->vdev_probe_wanted) {
05643                 vd->vdev_probe_wanted = B_FALSE;
05644                 vdev_reopen(vd);        /* vdev_open() does the actual probe */
05645         }
05646 
05647         for (int c = 0; c < vd->vdev_children; c++)
05648                 spa_async_probe(spa, vd->vdev_child[c]);
05649 }
05650 
05651 static void
05652 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
05653 {
05654         sysevent_id_t eid;
05655         nvlist_t *attr;
05656 
05657         if (!spa->spa_autoexpand)
05658                 return;
05659 
05660         for (int c = 0; c < vd->vdev_children; c++) {
05661                 vdev_t *cvd = vd->vdev_child[c];
05662                 spa_async_autoexpand(spa, cvd);
05663         }
05664 
05665         if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_path == NULL)
05666                 return;
05667 
05668         VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
05669         VERIFY(nvlist_add_string(attr, DEV_PATH, vd->vdev_path) == 0);
05670 
05671         (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
05672             ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
05673 
05674         nvlist_free(attr);
05675 }
05676 
05677 static void
05678 spa_async_thread(void *arg)
05679 {
05680         spa_t *spa = arg;
05681         int tasks;
05682 
05683         ASSERT(spa->spa_sync_on);
05684 
05685         mutex_enter(&spa->spa_async_lock);
05686         tasks = spa->spa_async_tasks;
05687         spa->spa_async_tasks = 0;
05688         mutex_exit(&spa->spa_async_lock);
05689 
05690         /*
05691          * See if the config needs to be updated.
05692          */
05693         if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
05694                 uint64_t old_space, new_space;
05695 
05696                 mutex_enter(&spa_namespace_lock);
05697                 old_space = metaslab_class_get_space(spa_normal_class(spa));
05698                 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
05699                 new_space = metaslab_class_get_space(spa_normal_class(spa));
05700                 mutex_exit(&spa_namespace_lock);
05701 
05702                 /*
05703                  * If the pool grew as a result of the config update,
05704                  * then log an internal history event.
05705                  */
05706                 if (new_space != old_space) {
05707                         spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
05708                             spa, NULL,
05709                             "pool '%s' size: %llu(+%llu)",
05710                             spa_name(spa), new_space, new_space - old_space);
05711                 }
05712         }
05713 
05714         /*
05715          * See if any devices need to be marked REMOVED.
05716          */
05717         if (tasks & SPA_ASYNC_REMOVE) {
05718                 spa_vdev_state_enter(spa, SCL_NONE);
05719                 spa_async_remove(spa, spa->spa_root_vdev);
05720                 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
05721                         spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
05722                 for (int i = 0; i < spa->spa_spares.sav_count; i++)
05723                         spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
05724                 (void) spa_vdev_state_exit(spa, NULL, 0);
05725         }
05726 
05727         if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
05728                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
05729                 spa_async_autoexpand(spa, spa->spa_root_vdev);
05730                 spa_config_exit(spa, SCL_CONFIG, FTAG);
05731         }
05732 
05733         /*
05734          * See if any devices need to be probed.
05735          */
05736         if (tasks & SPA_ASYNC_PROBE) {
05737                 spa_vdev_state_enter(spa, SCL_NONE);
05738                 spa_async_probe(spa, spa->spa_root_vdev);
05739                 (void) spa_vdev_state_exit(spa, NULL, 0);
05740         }
05741 
05742         /*
05743          * If any devices are done replacing, detach them.
05744          */
05745         if (tasks & SPA_ASYNC_RESILVER_DONE)
05746                 spa_vdev_resilver_done(spa);
05747 
05748         /*
05749          * Kick off a resilver.
05750          */
05751         if (tasks & SPA_ASYNC_RESILVER)
05752                 dsl_resilver_restart(spa->spa_dsl_pool, 0);
05753 
05754         /*
05755          * Let the world know that we're done.
05756          */
05757         mutex_enter(&spa->spa_async_lock);
05758         spa->spa_async_thread = NULL;
05759         cv_broadcast(&spa->spa_async_cv);
05760         mutex_exit(&spa->spa_async_lock);
05761         thread_exit();
05762 }
05763 
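      /*
       * Block async task dispatch and wait for any in-flight async
       * thread to finish.  Each call must be balanced by a later
       * spa_async_resume().
       */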
05764 void
05765 spa_async_suspend(spa_t *spa)
05766 {
05767         mutex_enter(&spa->spa_async_lock);
05768         spa->spa_async_suspended++;
05769         while (spa->spa_async_thread != NULL)
05770                 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
05771         mutex_exit(&spa->spa_async_lock);
05772 }
05773 
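      /*
       * Re-enable async task dispatch; the matching spa_async_suspend()
       * must already have been called.
       */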
05774 void
05775 spa_async_resume(spa_t *spa)
05776 {
05777         mutex_enter(&spa->spa_async_lock);
05778         ASSERT(spa->spa_async_suspended != 0);
05779         spa->spa_async_suspended--;
05780         mutex_exit(&spa->spa_async_lock);
05781 }
05782 
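      /*
       * Report whether any async work is dispatchable.  Any task other
       * than a config-cache update always qualifies; a config-cache
       * update qualifies only once zfs_ccw_retry_interval seconds have
       * elapsed since the last failed cache file write.
       */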
05783 static int
05784 spa_async_tasks_pending(spa_t *spa)
05785 {
05786         u_int non_config_tasks;
05787         u_int config_task;
05788         boolean_t config_task_suspended;
05789 
05790         non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
05791         config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
05792         if (spa->spa_ccw_fail_time == 0) {
05793                 config_task_suspended = B_FALSE;
05794         } else {
05795                 config_task_suspended =
05796                     (ddi_get_lbolt64() - spa->spa_ccw_fail_time)
05797                   < (zfs_ccw_retry_interval * hz);
05798         }
05799 
05800         return (non_config_tasks || (config_task && !config_task_suspended));
05801 }
05802 
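      /*
       * Start the async thread if there is dispatchable work, async
       * tasks are not suspended, no async thread is already running,
       * and the system root is mounted (rootdir != NULL).
       */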
05803 static void
05804 spa_async_dispatch(spa_t *spa)
05805 {
05806         mutex_enter(&spa->spa_async_lock);
05807         if (spa_async_tasks_pending(spa) &&
05808             !spa->spa_async_suspended &&
05809             spa->spa_async_thread == NULL &&
05810             rootdir != NULL)
05811                 spa->spa_async_thread = thread_create(NULL, 0,
05812                     spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
05813         mutex_exit(&spa->spa_async_lock);
05814 }
05815 
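      /*
       * Record a task bit for the async thread; actual dispatch happens
       * later, e.g. at the end of spa_sync() via spa_async_dispatch().
       * A typical request is spa_async_request(spa, SPA_ASYNC_RESILVER).
       */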
05816 void
05817 spa_async_request(spa_t *spa, int task)
05818 {
05819         zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
05820         mutex_enter(&spa->spa_async_lock);
05821         spa->spa_async_tasks |= task;
05822         mutex_exit(&spa->spa_async_lock);
05823 }
05824 
05825 /*
05826  * ==========================================================================
05827  * SPA syncing routines
05828  * ==========================================================================
05829  */
05830 
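      /*
       * Callbacks used below when iterating over freed blocks:
       * spa_free_sync_cb() frees each block right away under a parent
       * zio, while bpobj_enqueue_cb() defers the free by appending the
       * block pointer to a bpobj.
       */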
05831 static int
05832 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
05833 {
05834         bpobj_t *bpo = arg;
05835         bpobj_enqueue(bpo, bp, tx);
05836         return (0);
05837 }
05838 
05839 static int
05840 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
05841 {
05842         zio_t *zio = arg;
05843 
05844         zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
05845             BP_GET_PSIZE(bp), zio->io_flags));
05846         return (0);
05847 }
05848 
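      /*
       * Pack an nvlist in XDR encoding, pad it to a multiple of
       * SPA_CONFIG_BLOCKSIZE, write it into the given MOS object, and
       * record the packed size in the object's bonus buffer.
       */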
05849 static void
05850 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
05851 {
05852         char *packed = NULL;
05853         size_t bufsize;
05854         size_t nvsize = 0;
05855         dmu_buf_t *db;
05856 
05857         VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
05858 
05859         /*
05860          * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
05861          * information.  This avoids the dbuf_will_dirty() path and
05862          * saves us a pre-read to get data we don't actually care about.
05863          */
05864         bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
05865         packed = kmem_alloc(bufsize, KM_SLEEP);
05866 
05867         VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
05868             KM_SLEEP) == 0);
05869         bzero(packed + nvsize, bufsize - nvsize);
05870 
05871         dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
05872 
05873         kmem_free(packed, bufsize);
05874 
05875         VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
05876         dmu_buf_will_dirty(db, tx);
05877         *(uint64_t *)db->db_data = nvsize;
05878         dmu_buf_rele(db, FTAG);
05879 }
05880 
05881 static void
05882 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
05883     const char *config, const char *entry)
05884 {
05885         nvlist_t *nvroot;
05886         nvlist_t **list;
05887         int i;
05888 
05889         if (!sav->sav_sync)
05890                 return;
05891 
05892         /*
05893          * Update the MOS nvlist describing the list of available devices.
05894          * spa_validate_aux() will have already made sure this nvlist is
05895          * valid and the vdevs are labeled appropriately.
05896          */
05897         if (sav->sav_object == 0) {
05898                 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
05899                     DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
05900                     sizeof (uint64_t), tx);
05901                 VERIFY(zap_update(spa->spa_meta_objset,
05902                     DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
05903                     &sav->sav_object, tx) == 0);
05904         }
05905 
05906         VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
05907         if (sav->sav_count == 0) {
05908                 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
05909         } else {
05910                 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
05911                 for (i = 0; i < sav->sav_count; i++)
05912                         list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
05913                             B_FALSE, VDEV_CONFIG_L2CACHE);
05914                 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
05915                     sav->sav_count) == 0);
05916                 for (i = 0; i < sav->sav_count; i++)
05917                         nvlist_free(list[i]);
05918                 kmem_free(list, sav->sav_count * sizeof (void *));
05919         }
05920 
05921         spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
05922         nvlist_free(nvroot);
05923 
05924         sav->sav_sync = B_FALSE;
05925 }
05926 
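      /*
       * Write the dirty pool configuration into the MOS config object
       * and stage it in spa_config_syncing; spa_sync() makes it visible
       * to the config cache once the txg has committed.
       */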
05927 static void
05928 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
05929 {
05930         nvlist_t *config;
05931 
05932         if (list_is_empty(&spa->spa_config_dirty_list))
05933                 return;
05934 
05935         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
05936 
05937         config = spa_config_generate(spa, spa->spa_root_vdev,
05938             dmu_tx_get_txg(tx), B_FALSE);
05939 
05940         spa_config_exit(spa, SCL_STATE, FTAG);
05941 
05942         if (spa->spa_config_syncing)
05943                 nvlist_free(spa->spa_config_syncing);
05944         spa->spa_config_syncing = config;
05945 
05946         spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
05947 }
05948 
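      /*
       * Sync task callback that raises the pool's on-disk version and
       * dirties the vdev configuration so the labels pick it up.
       */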
05949 static void
05950 spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx)
05951 {
05952         spa_t *spa = arg1;
05953         uint64_t version = *(uint64_t *)arg2;
05954 
05955         /*
05956          * Setting the version is special cased when first creating the pool.
05957          */
05958         ASSERT(tx->tx_txg != TXG_INITIAL);
05959 
05960         ASSERT(version <= SPA_VERSION);
05961         ASSERT(version >= spa_version(spa));
05962 
05963         spa->spa_uberblock.ub_version = version;
05964         vdev_config_dirty(spa->spa_root_vdev);
05965 }
05966 
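      /*
       * Sync task callback that applies pool property changes: feature
       * enables and the non-persistent properties are special-cased;
       * everything else is written to the poolprops MOS object.
       */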
05970 static void
05971 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
05972 {
05973         spa_t *spa = arg1;
05974         objset_t *mos = spa->spa_meta_objset;
05975         nvlist_t *nvp = arg2;
05976         nvpair_t *elem = NULL;
05977 
05978         mutex_enter(&spa->spa_props_lock);
05979 
05980         while ((elem = nvlist_next_nvpair(nvp, elem))) {
05981                 uint64_t intval;
05982                 char *strval, *fname;
05983                 zpool_prop_t prop;
05984                 const char *propname;
05985                 zprop_type_t proptype;
05986                 zfeature_info_t *feature;
05987 
05988                 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
05989                 case ZPROP_INVAL:
05990                         /*
05991                          * We checked this earlier in spa_prop_validate().
05992                          */
05993                         ASSERT(zpool_prop_feature(nvpair_name(elem)));
05994 
05995                         fname = strchr(nvpair_name(elem), '@') + 1;
05996                         VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
05997 
05998                         spa_feature_enable(spa, feature, tx);
05999                         break;
06000 
06001                 case ZPOOL_PROP_VERSION:
06002                         VERIFY(nvpair_value_uint64(elem, &intval) == 0);
06003                         /*
06004                          * The version is synced separately before other
06005                          * properties and should be correct by now.
06006                          */
06007                         ASSERT3U(spa_version(spa), >=, intval);
06008                         break;
06009 
06010                 case ZPOOL_PROP_ALTROOT:
06011                         /*
06012                          * 'altroot' is a non-persistent property. It should
06013                          * have been set temporarily at creation or import time.
06014                          */
06015                         ASSERT(spa->spa_root != NULL);
06016                         break;
06017 
06018                 case ZPOOL_PROP_READONLY:
06019                 case ZPOOL_PROP_CACHEFILE:
06020                         /*
06021                          * 'readonly' and 'cachefile' are also non-persistent
06022                          * properties.
06023                          */
06024                         break;
06025                 case ZPOOL_PROP_COMMENT:
06026                         VERIFY(nvpair_value_string(elem, &strval) == 0);
06027                         if (spa->spa_comment != NULL)
06028                                 spa_strfree(spa->spa_comment);
06029                         spa->spa_comment = spa_strdup(strval);
06030                         /*
06031                          * We need to dirty the configuration on all the vdevs
06032                          * so that their labels get updated.  It's unnecessary
06033                          * to do this for pool creation since the vdev's
06034                          * configuration has already been dirtied.
06035                          */
06036                         if (tx->tx_txg != TXG_INITIAL)
06037                                 vdev_config_dirty(spa->spa_root_vdev);
06038                         break;
06039                 default:
06040                         /*
06041                          * Set pool property values in the poolprops mos object.
06042                          */
06043                         if (spa->spa_pool_props_object == 0) {
06044                                 spa->spa_pool_props_object =
06045                                     zap_create_link(mos, DMU_OT_POOL_PROPS,
06046                                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
06047                                     tx);
06048                         }
06049 
06050                         /* normalize the property name */
06051                         propname = zpool_prop_to_name(prop);
06052                         proptype = zpool_prop_get_type(prop);
06053 
06054                         if (nvpair_type(elem) == DATA_TYPE_STRING) {
06055                                 ASSERT(proptype == PROP_TYPE_STRING);
06056                                 VERIFY(nvpair_value_string(elem, &strval) == 0);
06057                                 VERIFY(zap_update(mos,
06058                                     spa->spa_pool_props_object, propname,
06059                                     1, strlen(strval) + 1, strval, tx) == 0);
06060 
06061                         } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
06062                                 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
06063 
06064                                 if (proptype == PROP_TYPE_INDEX) {
06065                                         const char *unused;
06066                                         VERIFY(zpool_prop_index_to_string(
06067                                             prop, intval, &unused) == 0);
06068                                 }
06069                                 VERIFY(zap_update(mos,
06070                                     spa->spa_pool_props_object, propname,
06071                                     8, 1, &intval, tx) == 0);
06072                         } else {
06073                                 ASSERT(0); /* not allowed */
06074                         }
06075 
06076                         switch (prop) {
06077                         case ZPOOL_PROP_DELEGATION:
06078                                 spa->spa_delegation = intval;
06079                                 break;
06080                         case ZPOOL_PROP_BOOTFS:
06081                                 spa->spa_bootfs = intval;
06082                                 break;
06083                         case ZPOOL_PROP_FAILUREMODE:
06084                                 spa->spa_failmode = intval;
06085                                 break;
06086                         case ZPOOL_PROP_AUTOEXPAND:
06087                                 spa->spa_autoexpand = intval;
06088                                 if (tx->tx_txg != TXG_INITIAL)
06089                                         spa_async_request(spa,
06090                                             SPA_ASYNC_AUTOEXPAND);
06091                                 break;
06092                         case ZPOOL_PROP_DEDUPDITTO:
06093                                 spa->spa_dedup_ditto = intval;
06094                                 break;
06095                         default:
06096                                 break;
06097                         }
06098                 }
06099 
06100                 /* log internal history if this is not a zpool create */
06101                 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
06102                     tx->tx_txg != TXG_INITIAL) {
06103                         spa_history_log_internal(LOG_POOL_PROPSET,
06104                             spa, tx, "%s %lld %s",
06105                             nvpair_name(elem), intval, spa_name(spa));
06106                 }
06107         }
06108 
06109         mutex_exit(&spa->spa_props_lock);
06110 }
06111 
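      /*
       * Perform the one-time on-disk conversions required after the
       * pool version has been raised: create the origin dataset,
       * upgrade clone lists, create the free dir, and create the
       * feature ZAP objects, as applicable.
       */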
06119 static void
06120 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
06121 {
06122         dsl_pool_t *dp = spa->spa_dsl_pool;
06123 
06124         ASSERT(spa->spa_sync_pass == 1);
06125 
06126         if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
06127             spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
06128                 dsl_pool_create_origin(dp, tx);
06129 
06130                 /* Keeping the origin open increases spa_minref */
06131                 spa->spa_minref += 3;
06132         }
06133 
06134         if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
06135             spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
06136                 dsl_pool_upgrade_clones(dp, tx);
06137         }
06138 
06139         if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
06140             spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
06141                 dsl_pool_upgrade_dir_clones(dp, tx);
06142 
06143                 /* Keeping the freedir open increases spa_minref */
06144                 spa->spa_minref += 3;
06145         }
06146 
06147         if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
06148             spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
06149                 spa_feature_create_zap_objects(spa, tx);
06150         }
06151 }
06152 
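      /*
       * Sync the specified transaction group.  New blocks may be
       * dirtied as part of the process, so we iterate until the sync
       * converges, then rewrite the vdev labels and uberblock to
       * commit the txg.
       */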
06159 void
06160 spa_sync(spa_t *spa, uint64_t txg)
06161 {
06162         dsl_pool_t *dp = spa->spa_dsl_pool;
06163         objset_t *mos = spa->spa_meta_objset;
06164         bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
06165         bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
06166         vdev_t *rvd = spa->spa_root_vdev;
06167         vdev_t *vd;
06168         dmu_tx_t *tx;
06169         int error;
06170 
06171         VERIFY(spa_writeable(spa));
06172 
06173         /*
06174          * Lock out configuration changes.
06175          */
06176         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
06177 
06178         spa->spa_syncing_txg = txg;
06179         spa->spa_sync_pass = 0;
06180 
06181         /*
06182          * If there are any pending vdev state changes, convert them
06183          * into config changes that go out with this transaction group.
06184          */
06185         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
06186         while (list_head(&spa->spa_state_dirty_list) != NULL) {
06187                 /*
06188                  * We need the write lock here because, for aux vdevs,
06189                  * calling vdev_config_dirty() modifies sav_config.
06190                  * This is ugly and will become unnecessary when we
06191                  * eliminate the aux vdev wart by integrating all vdevs
06192                  * into the root vdev tree.
06193                  */
06194                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
06195                 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
06196                 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
06197                         vdev_state_clean(vd);
06198                         vdev_config_dirty(vd);
06199                 }
06200                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
06201                 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
06202         }
06203         spa_config_exit(spa, SCL_STATE, FTAG);
06204 
06205         tx = dmu_tx_create_assigned(dp, txg);
06206 
06207         /*
06208          * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
06209          * set spa_deflate if we have no raid-z vdevs.
06210          */
06211         if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
06212             spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
06213                 int i;
06214 
06215                 for (i = 0; i < rvd->vdev_children; i++) {
06216                         vd = rvd->vdev_child[i];
06217                         if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
06218                                 break;
06219                 }
06220                 if (i == rvd->vdev_children) {
06221                         spa->spa_deflate = TRUE;
06222                         VERIFY(0 == zap_add(spa->spa_meta_objset,
06223                             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
06224                             sizeof (uint64_t), 1, &spa->spa_deflate, tx));
06225                 }
06226         }
06227 
06228         /*
06229          * If anything has changed in this txg, or if someone is waiting
06230          * for this txg to sync (eg, spa_vdev_remove()), push the
06231          * deferred frees from the previous txg.  If not, leave them
06232          * alone so that we don't generate work on an otherwise idle
06233          * system.
06234          */
06235         if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
06236             !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
06237             !txg_list_empty(&dp->dp_sync_tasks, txg) ||
06238             ((dsl_scan_active(dp->dp_scan) ||
06239             txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
06240                 zio_t *zio = zio_root(spa, NULL, NULL, 0);
06241                 VERIFY3U(bpobj_iterate(defer_bpo,
06242                     spa_free_sync_cb, zio, tx), ==, 0);
06243                 VERIFY0(zio_wait(zio));
06244         }
06245 
06246         /*
06247          * Iterate to convergence.
06248          */
06249         do {
06250                 int pass = ++spa->spa_sync_pass;
06251 
06252                 spa_sync_config_object(spa, tx);
06253                 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
06254                     ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
06255                 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
06256                     ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
06257                 spa_errlog_sync(spa, txg);
06258                 dsl_pool_sync(dp, txg);
06259 
06260                 if (pass <= SYNC_PASS_DEFERRED_FREE) {
06261                         zio_t *zio = zio_root(spa, NULL, NULL, 0);
06262                         bplist_iterate(free_bpl, spa_free_sync_cb,
06263                             zio, tx);
06264                         VERIFY(zio_wait(zio) == 0);
06265                 } else {
06266                         bplist_iterate(free_bpl, bpobj_enqueue_cb,
06267                             defer_bpo, tx);
06268                 }
06269 
06270                 ddt_sync(spa, txg);
06271                 dsl_scan_sync(dp, tx);
06272 
06273                 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL)
06274                         vdev_sync(vd, txg);
06275 
06276                 if (pass == 1)
06277                         spa_sync_upgrades(spa, tx);
06278 
06279         } while (dmu_objset_is_dirty(mos, txg));
06280 
06281         /*
06282          * Rewrite the vdev configuration (which includes the uberblock)
06283          * to commit the transaction group.
06284          *
06285          * If there are no dirty vdevs, we sync the uberblock to a few
06286          * random top-level vdevs that are known to be visible in the
06287          * config cache (see spa_vdev_add() for a complete description).
06288          * If there *are* dirty vdevs, sync the uberblock to all vdevs.
06289          */
06290         for (;;) {
06291                 /*
06292                  * We hold SCL_STATE to prevent vdev open/close/etc.
06293                  * while we're attempting to write the vdev labels.
06294                  */
06295                 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
06296 
06297                 if (list_is_empty(&spa->spa_config_dirty_list)) {
06298                         vdev_t *svd[SPA_DVAS_PER_BP];
06299                         int svdcount = 0;
06300                         int children = rvd->vdev_children;
06301                         int c0 = spa_get_random(children);
06302 
06303                         for (int c = 0; c < children; c++) {
06304                                 vd = rvd->vdev_child[(c0 + c) % children];
06305                                 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
06306                                         continue;
06307                                 svd[svdcount++] = vd;
06308                                 if (svdcount == SPA_DVAS_PER_BP)
06309                                         break;
06310                         }
06311                         error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
06312                         if (error != 0)
06313                                 error = vdev_config_sync(svd, svdcount, txg,
06314                                     B_TRUE);
06315                 } else {
06316                         error = vdev_config_sync(rvd->vdev_child,
06317                             rvd->vdev_children, txg, B_FALSE);
06318                         if (error != 0)
06319                                 error = vdev_config_sync(rvd->vdev_child,
06320                                     rvd->vdev_children, txg, B_TRUE);
06321                 }
06322 
06323                 if (error == 0)
06324                         spa->spa_last_synced_guid = rvd->vdev_guid;
06325 
06326                 spa_config_exit(spa, SCL_STATE, FTAG);
06327 
06328                 if (error == 0)
06329                         break;
06330                 zio_suspend(spa, NULL);
06331                 zio_resume_wait(spa);
06332         }
06333         dmu_tx_commit(tx);
06334 
06335         /*
06336          * Clear the dirty config list.
06337          */
06338         while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
06339                 vdev_config_clean(vd);
06340 
06341         /*
06342          * Now that the new config has synced transactionally,
06343          * let it become visible to the config cache.
06344          */
06345         if (spa->spa_config_syncing != NULL) {
06346                 spa_config_set(spa, spa->spa_config_syncing);
06347                 spa->spa_config_txg = txg;
06348                 spa->spa_config_syncing = NULL;
06349         }
06350 
06351         spa->spa_ubsync = spa->spa_uberblock;
06352 
06353         dsl_pool_sync_done(dp, txg);
06354 
06355         /*
06356          * Update usable space statistics.
06357          */
06358         while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL)
06359                 vdev_sync_done(vd, txg);
06360 
06361         spa_update_dspace(spa);
06362 
06363         /*
06364          * It had better be the case that we didn't dirty anything
06365          * since vdev_config_sync().
06366          */
06367         ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
06368         ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
06369         ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
06370 
06371         spa->spa_sync_pass = 0;
06372 
06373         spa_config_exit(spa, SCL_CONFIG, FTAG);
06374 
06375         spa_handle_ignored_writes(spa);
06376 
06377         /*
06378          * If any async tasks have been requested, kick them off.
06379          */
06380         spa_async_dispatch(spa);
06381 }
06382 
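      /*
       * Force a txg sync on every pool that is active, writable, and
       * not suspended.  The namespace lock is dropped around each wait,
       * with a reference held on the spa_t to keep it alive.
       */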
06388 void
06389 spa_sync_allpools(void)
06390 {
06391         spa_t *spa = NULL;
06392         mutex_enter(&spa_namespace_lock);
06393         while ((spa = spa_next(spa)) != NULL) {
06394                 if (spa_state(spa) != POOL_STATE_ACTIVE ||
06395                     !spa_writeable(spa) || spa_suspended(spa))
06396                         continue;
06397                 spa_open_ref(spa, FTAG);
06398                 mutex_exit(&spa_namespace_lock);
06399                 txg_wait_synced(spa_get_dsl(spa), 0);
06400                 mutex_enter(&spa_namespace_lock);
06401                 spa_close(spa, FTAG);
06402         }
06403         mutex_exit(&spa_namespace_lock);
06404 }
06405 
06406 /*
06407  * ==========================================================================
06408  * Miscellaneous routines
06409  * ==========================================================================
06410  */
06411 
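      /*
       * Remove all pools in the system: stop their async threads,
       * unload and deactivate any that were initialized, and remove
       * them from the namespace.
       */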
06415 void
06416 spa_evict_all(void)
06417 {
06418         spa_t *spa;
06419 
06420         /*
06421          * Remove all cached state.  All pools should be closed now,
06422          * so every spa in the AVL tree should be unreferenced.
06423          */
06424         mutex_enter(&spa_namespace_lock);
06425         while ((spa = spa_next(NULL)) != NULL) {
06426                 /*
06427                  * Stop async tasks.  The async thread may need to detach
06428                  * a device that's been replaced, which requires grabbing
06429                  * spa_namespace_lock, so we must drop it here.
06430                  */
06431                 spa_open_ref(spa, FTAG);
06432                 mutex_exit(&spa_namespace_lock);
06433                 spa_async_suspend(spa);
06434                 mutex_enter(&spa_namespace_lock);
06435                 spa_close(spa, FTAG);
06436 
06437                 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
06438                         spa_unload(spa);
06439                         spa_deactivate(spa);
06440                 }
06441                 spa_remove(spa);
06442         }
06443         mutex_exit(&spa_namespace_lock);
06444 }
06445 
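      /*
       * Find a vdev by guid, searching the root vdev tree first and,
       * if 'aux' is set, the l2cache and spare lists as well.
       */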
06446 vdev_t *
06447 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
06448 {
06449         vdev_t *vd;
06450         int i;
06451 
06452         if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
06453                 return (vd);
06454 
06455         if (aux) {
06456                 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
06457                         vd = spa->spa_l2cache.sav_vdevs[i];
06458                         if (vd->vdev_guid == guid)
06459                                 return (vd);
06460                 }
06461 
06462                 for (i = 0; i < spa->spa_spares.sav_count; i++) {
06463                         vd = spa->spa_spares.sav_vdevs[i];
06464                         if (vd->vdev_guid == guid)
06465                                 return (vd);
06466                 }
06467         }
06468 
06469         return (NULL);
06470 }
06471 
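      /*
       * Raise the pool version in the in-core uberblock, dirty the
       * vdev configuration, and wait for the change to reach disk.
       */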
06472 void
06473 spa_upgrade(spa_t *spa, uint64_t version)
06474 {
06475         ASSERT(spa_writeable(spa));
06476 
06477         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
06478 
06479         /*
06480          * This should only be called for a non-faulted pool, and since a
06481          * pool with a future version would be unopenable, the current
06482          * version should never exceed SPA_VERSION here.
06483          */
06484         ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
06485         ASSERT(version >= spa->spa_uberblock.ub_version);
06486 
06487         spa->spa_uberblock.ub_version = version;
06488         vdev_config_dirty(spa->spa_root_vdev);
06489 
06490         spa_config_exit(spa, SCL_ALL, FTAG);
06491 
06492         txg_wait_synced(spa_get_dsl(spa), 0);
06493 }
06494 
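      /*
       * Check whether 'guid' names one of this pool's spares, including
       * spares whose addition is still pending.
       */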
06495 boolean_t
06496 spa_has_spare(spa_t *spa, uint64_t guid)
06497 {
06498         int i;
06499         uint64_t spareguid;
06500         spa_aux_vdev_t *sav = &spa->spa_spares;
06501 
06502         for (i = 0; i < sav->sav_count; i++)
06503                 if (sav->sav_vdevs[i]->vdev_guid == guid)
06504                         return (B_TRUE);
06505 
06506         for (i = 0; i < sav->sav_npending; i++) {
06507                 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
06508                     &spareguid) == 0 && spareguid == guid)
06509                         return (B_TRUE);
06510         }
06511 
06512         return (B_FALSE);
06513 }
06514 
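      /*
       * Check whether the pool has an active shared spare device.  An
       * active spare's reference count is 2 (once as a spare and once
       * as a replacement), so a count above 2 means it is shared.
       */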
06521 static boolean_t
06522 spa_has_active_shared_spare(spa_t *spa)
06523 {
06524         int i, refcnt;
06525         uint64_t pool;
06526         spa_aux_vdev_t *sav = &spa->spa_spares;
06527 
06528         for (i = 0; i < sav->sav_count; i++) {
06529                 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
06530                     &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
06531                     refcnt > 2)
06532                         return (B_TRUE);
06533         }
06534 
06535         return (B_FALSE);
06536 }
06537 
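      /*
       * Post a sysevent in class EC_ZFS with the given name; the
       * payload carries the pool name and guid and, when a vdev is
       * supplied, its guid and path.  This is a no-op outside the
       * kernel.
       */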
06550 void
06551 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
06552 {
06553 #ifdef _KERNEL
06554         sysevent_t              *ev;
06555         sysevent_attr_list_t    *attr = NULL;
06556         sysevent_value_t        value;
06557         sysevent_id_t           eid;
06558 
06559         ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
06560             SE_SLEEP);
06561 
06562         value.value_type = SE_DATA_TYPE_STRING;
06563         value.value.sv_string = spa_name(spa);
06564         if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
06565                 goto done;
06566 
06567         value.value_type = SE_DATA_TYPE_UINT64;
06568         value.value.sv_uint64 = spa_guid(spa);
06569         if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
06570                 goto done;
06571 
06572         if (vd) {
06573                 value.value_type = SE_DATA_TYPE_UINT64;
06574                 value.value.sv_uint64 = vd->vdev_guid;
06575                 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
06576                     SE_SLEEP) != 0)
06577                         goto done;
06578 
06579                 if (vd->vdev_path) {
06580                         value.value_type = SE_DATA_TYPE_STRING;
06581                         value.value.sv_string = vd->vdev_path;
06582                         if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
06583                             &value, SE_SLEEP) != 0)
06584                                 goto done;
06585                 }
06586         }
06587 
06588         if (sysevent_attach_attributes(ev, attr) != 0)
06589                 goto done;
06590         attr = NULL;
06591 
06592         (void) log_sysevent(ev, SE_SLEEP, &eid);
06593 
06594 done:
06595         if (attr)
06596                 sysevent_free_attr(attr);
06597         sysevent_free(ev);
06598 #endif
06599 }