FreeBSD ZFS
The Zettabyte File System
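The listing below contains the SPA (storage pool allocator) routines of the FreeBSD ZFS kernel sources. One pattern worth noting before reading it: pool properties are reported by spa_prop_add_list(), which wraps each property in a nested nvlist of the form propname -> { source, value }. The short userland sketch that follows is not part of the kernel file; it only illustrates that nesting using the public libnvpair API (nvlist_alloc, nvlist_add_uint64, nvlist_add_string, nvlist_add_nvlist, nvlist_print). The literal key names "source" and "value" and the build hint (link with -lnvpair; include paths vary by platform) are assumptions made for the example, not taken from the file.

/*
 * Userland illustration only -- not part of spa.c.  Builds the same
 * propname -> { source, value } nesting that spa_prop_add_list() below
 * produces.  The key names "source"/"value" stand in for the kernel's
 * ZPROP_SOURCE/ZPROP_VALUE macros (an assumption for this sketch).
 */
#include <stdio.h>
#include <stdlib.h>
#include <libnvpair.h>

static void
prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
    uint64_t intval, uint64_t src)
{
	nvlist_t *propval;

	if (nvlist_alloc(&propval, NV_UNIQUE_NAME, 0) != 0)
		abort();
	(void) nvlist_add_uint64(propval, "source", src);
	if (strval != NULL)
		(void) nvlist_add_string(propval, "value", strval);
	else
		(void) nvlist_add_uint64(propval, "value", intval);
	/* nvlist_add_nvlist() copies propval, so it is safe to free it here. */
	(void) nvlist_add_nvlist(nvl, propname, propval);
	nvlist_free(propval);
}

int
main(void)
{
	nvlist_t *props;

	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
		abort();
	prop_add_list(props, "name", "tank", 0, 0);		/* string-valued property */
	prop_add_list(props, "size", NULL, 1ULL << 40, 0);	/* numeric property */
	nvlist_print(stdout, props);	/* dumps the nested layout */
	nvlist_free(props);
	return (0);
}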
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

int check_hostid = 1;

int zfs_ccw_retry_interval = 300;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");
TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
    &zfs_ccw_retry_interval, 0,
    "Configuration cache file write, retry after failure, interval (seconds)");

typedef enum zti_modes {
	zti_mode_fixed,
	zti_mode_online_percent,
	zti_mode_batch,
	zti_mode_null,
	zti_nmodes
} zti_modes_t;

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};

static dsl_syncfunc_t spa_sync_version;
static dsl_syncfunc_t spa_sync_props;
static dsl_checkfunc_t spa_change_guid_check;
static dsl_syncfunc_t spa_change_guid_sync;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t zio_taskq_batch_pct = 100;
#ifdef PSRSET_BIND
id_t zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t zio_taskq_sysdc = B_TRUE;
#endif
uint_t zio_taskq_basedc = 80;

boolean_t spa_create_process = B_TRUE;

#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size;
	uint64_t alloc;
	uint64_t space;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		space = 0;
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			space += tvd->vdev_max_asize - tvd->vdev_asize;
		}
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
		    src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ?
0 : (alloc * 100 / size); 00234 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 00235 00236 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 00237 ddt_get_pool_dedup_ratio(spa), src); 00238 00239 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 00240 rvd->vdev_state, src); 00241 00242 version = spa_version(spa); 00243 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 00244 src = ZPROP_SRC_DEFAULT; 00245 else 00246 src = ZPROP_SRC_LOCAL; 00247 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 00248 } 00249 00250 if (pool != NULL) { 00251 dsl_dir_t *freedir = pool->dp_free_dir; 00252 00253 /* 00254 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 00255 * when opening pools before this version freedir will be NULL. 00256 */ 00257 if (freedir != NULL) { 00258 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 00259 freedir->dd_phys->dd_used_bytes, src); 00260 } else { 00261 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 00262 NULL, 0, src); 00263 } 00264 } 00265 00266 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 00267 00268 if (spa->spa_comment != NULL) { 00269 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 00270 0, ZPROP_SRC_LOCAL); 00271 } 00272 00273 if (spa->spa_root != NULL) 00274 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 00275 0, ZPROP_SRC_LOCAL); 00276 00277 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 00278 if (dp->scd_path == NULL) { 00279 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 00280 "none", 0, ZPROP_SRC_LOCAL); 00281 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 00282 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 00283 dp->scd_path, 0, ZPROP_SRC_LOCAL); 00284 } 00285 } 00286 } 00287 00291 int 00292 spa_prop_get(spa_t *spa, nvlist_t **nvp) 00293 { 00294 objset_t *mos = spa->spa_meta_objset; 00295 zap_cursor_t zc; 00296 zap_attribute_t za; 00297 int err; 00298 00299 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 00300 00301 mutex_enter(&spa->spa_props_lock); 00302 00303 /* 00304 * Get properties from the spa config. 00305 */ 00306 spa_prop_get_config(spa, nvp); 00307 00308 /* If no pool property object, no more prop to get. */ 00309 if (mos == NULL || spa->spa_pool_props_object == 0) { 00310 mutex_exit(&spa->spa_props_lock); 00311 return (0); 00312 } 00313 00314 /* 00315 * Get properties from the MOS pool property object. 
00316 */ 00317 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 00318 (err = zap_cursor_retrieve(&zc, &za)) == 0; 00319 zap_cursor_advance(&zc)) { 00320 uint64_t intval = 0; 00321 char *strval = NULL; 00322 zprop_source_t src = ZPROP_SRC_DEFAULT; 00323 zpool_prop_t prop; 00324 00325 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 00326 continue; 00327 00328 switch (za.za_integer_length) { 00329 case 8: 00330 /* integer property */ 00331 if (za.za_first_integer != 00332 zpool_prop_default_numeric(prop)) 00333 src = ZPROP_SRC_LOCAL; 00334 00335 if (prop == ZPOOL_PROP_BOOTFS) { 00336 dsl_pool_t *dp; 00337 dsl_dataset_t *ds = NULL; 00338 00339 dp = spa_get_dsl(spa); 00340 rw_enter(&dp->dp_config_rwlock, RW_READER); 00341 if (err = dsl_dataset_hold_obj(dp, 00342 za.za_first_integer, FTAG, &ds)) { 00343 rw_exit(&dp->dp_config_rwlock); 00344 break; 00345 } 00346 00347 strval = kmem_alloc( 00348 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 00349 KM_SLEEP); 00350 dsl_dataset_name(ds, strval); 00351 dsl_dataset_rele(ds, FTAG); 00352 rw_exit(&dp->dp_config_rwlock); 00353 } else { 00354 strval = NULL; 00355 intval = za.za_first_integer; 00356 } 00357 00358 spa_prop_add_list(*nvp, prop, strval, intval, src); 00359 00360 if (strval != NULL) 00361 kmem_free(strval, 00362 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 00363 00364 break; 00365 00366 case 1: 00367 /* string property */ 00368 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 00369 err = zap_lookup(mos, spa->spa_pool_props_object, 00370 za.za_name, 1, za.za_num_integers, strval); 00371 if (err) { 00372 kmem_free(strval, za.za_num_integers); 00373 break; 00374 } 00375 spa_prop_add_list(*nvp, prop, strval, 0, src); 00376 kmem_free(strval, za.za_num_integers); 00377 break; 00378 00379 default: 00380 break; 00381 } 00382 } 00383 zap_cursor_fini(&zc); 00384 mutex_exit(&spa->spa_props_lock); 00385 out: 00386 if (err && err != ENOENT) { 00387 nvlist_free(*nvp); 00388 *nvp = NULL; 00389 return (err); 00390 } 00391 00392 return (0); 00393 } 00394 00399 static int 00400 spa_prop_validate(spa_t *spa, nvlist_t *props) 00401 { 00402 nvpair_t *elem; 00403 int error = 0, reset_bootfs = 0; 00404 uint64_t objnum; 00405 boolean_t has_feature = B_FALSE; 00406 00407 elem = NULL; 00408 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 00409 uint64_t intval; 00410 char *strval, *slash, *check, *fname; 00411 const char *propname = nvpair_name(elem); 00412 zpool_prop_t prop = zpool_name_to_prop(propname); 00413 00414 switch (prop) { 00415 case ZPROP_INVAL: 00416 if (!zpool_prop_feature(propname)) { 00417 error = EINVAL; 00418 break; 00419 } 00420 00421 /* 00422 * Sanitize the input. 
00423 */ 00424 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 00425 error = EINVAL; 00426 break; 00427 } 00428 00429 if (nvpair_value_uint64(elem, &intval) != 0) { 00430 error = EINVAL; 00431 break; 00432 } 00433 00434 if (intval != 0) { 00435 error = EINVAL; 00436 break; 00437 } 00438 00439 fname = strchr(propname, '@') + 1; 00440 if (zfeature_lookup_name(fname, NULL) != 0) { 00441 error = EINVAL; 00442 break; 00443 } 00444 00445 has_feature = B_TRUE; 00446 break; 00447 00448 case ZPOOL_PROP_VERSION: 00449 error = nvpair_value_uint64(elem, &intval); 00450 if (!error && 00451 (intval < spa_version(spa) || 00452 intval > SPA_VERSION_BEFORE_FEATURES || 00453 has_feature)) 00454 error = EINVAL; 00455 break; 00456 00457 case ZPOOL_PROP_DELEGATION: 00458 case ZPOOL_PROP_AUTOREPLACE: 00459 case ZPOOL_PROP_LISTSNAPS: 00460 case ZPOOL_PROP_AUTOEXPAND: 00461 error = nvpair_value_uint64(elem, &intval); 00462 if (!error && intval > 1) 00463 error = EINVAL; 00464 break; 00465 00466 case ZPOOL_PROP_BOOTFS: 00467 /* 00468 * If the pool version is less than SPA_VERSION_BOOTFS, 00469 * or the pool is still being created (version == 0), 00470 * the bootfs property cannot be set. 00471 */ 00472 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 00473 error = ENOTSUP; 00474 break; 00475 } 00476 00477 /* 00478 * Make sure the vdev config is bootable 00479 */ 00480 if (!vdev_is_bootable(spa->spa_root_vdev)) { 00481 error = ENOTSUP; 00482 break; 00483 } 00484 00485 reset_bootfs = 1; 00486 00487 error = nvpair_value_string(elem, &strval); 00488 00489 if (!error) { 00490 objset_t *os; 00491 uint64_t compress; 00492 00493 if (strval == NULL || strval[0] == '\0') { 00494 objnum = zpool_prop_default_numeric( 00495 ZPOOL_PROP_BOOTFS); 00496 break; 00497 } 00498 00499 if (error = dmu_objset_hold(strval, FTAG, &os)) 00500 break; 00501 00502 /* Must be ZPL and not gzip compressed. */ 00503 00504 if (dmu_objset_type(os) != DMU_OST_ZFS) { 00505 error = ENOTSUP; 00506 } else if ((error = dsl_prop_get_integer(strval, 00507 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 00508 &compress, NULL)) == 0 && 00509 !BOOTFS_COMPRESS_VALID(compress)) { 00510 error = ENOTSUP; 00511 } else { 00512 objnum = dmu_objset_id(os); 00513 } 00514 dmu_objset_rele(os, FTAG); 00515 } 00516 break; 00517 00518 case ZPOOL_PROP_FAILUREMODE: 00519 error = nvpair_value_uint64(elem, &intval); 00520 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 00521 intval > ZIO_FAILURE_MODE_PANIC)) 00522 error = EINVAL; 00523 00524 /* 00525 * This is a special case which only occurs when 00526 * the pool has completely failed. This allows 00527 * the user to change the in-core failmode property 00528 * without syncing it out to disk (I/Os might 00529 * currently be blocked). We do this by returning 00530 * EIO to the caller (spa_prop_set) to trick it 00531 * into thinking we encountered a property validation 00532 * error. 
00533 */ 00534 if (!error && spa_suspended(spa)) { 00535 spa->spa_failmode = intval; 00536 error = EIO; 00537 } 00538 break; 00539 00540 case ZPOOL_PROP_CACHEFILE: 00541 if ((error = nvpair_value_string(elem, &strval)) != 0) 00542 break; 00543 00544 if (strval[0] == '\0') 00545 break; 00546 00547 if (strcmp(strval, "none") == 0) 00548 break; 00549 00550 if (strval[0] != '/') { 00551 error = EINVAL; 00552 break; 00553 } 00554 00555 slash = strrchr(strval, '/'); 00556 ASSERT(slash != NULL); 00557 00558 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 00559 strcmp(slash, "/..") == 0) 00560 error = EINVAL; 00561 break; 00562 00563 case ZPOOL_PROP_COMMENT: 00564 if ((error = nvpair_value_string(elem, &strval)) != 0) 00565 break; 00566 for (check = strval; *check != '\0'; check++) { 00567 /* 00568 * The kernel doesn't have an easy isprint() 00569 * check. For this kernel check, we merely 00570 * check ASCII apart from DEL. Fix this if 00571 * there is an easy-to-use kernel isprint(). 00572 */ 00573 if (*check >= 0x7f) { 00574 error = EINVAL; 00575 break; 00576 } 00577 check++; 00578 } 00579 if (strlen(strval) > ZPROP_MAX_COMMENT) 00580 error = E2BIG; 00581 break; 00582 00583 case ZPOOL_PROP_DEDUPDITTO: 00584 if (spa_version(spa) < SPA_VERSION_DEDUP) 00585 error = ENOTSUP; 00586 else 00587 error = nvpair_value_uint64(elem, &intval); 00588 if (error == 0 && 00589 intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 00590 error = EINVAL; 00591 break; 00592 } 00593 00594 if (error) 00595 break; 00596 } 00597 00598 if (!error && reset_bootfs) { 00599 error = nvlist_remove(props, 00600 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 00601 00602 if (!error) { 00603 error = nvlist_add_uint64(props, 00604 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 00605 } 00606 } 00607 00608 return (error); 00609 } 00610 00611 void 00612 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 00613 { 00614 char *cachefile; 00615 spa_config_dirent_t *dp; 00616 00617 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 00618 &cachefile) != 0) 00619 return; 00620 00621 dp = kmem_alloc(sizeof (spa_config_dirent_t), 00622 KM_SLEEP); 00623 00624 if (cachefile[0] == '\0') 00625 dp->scd_path = spa_strdup(spa_config_path); 00626 else if (strcmp(cachefile, "none") == 0) 00627 dp->scd_path = NULL; 00628 else 00629 dp->scd_path = spa_strdup(cachefile); 00630 00631 list_insert_head(&spa->spa_config_list, dp); 00632 if (need_sync) 00633 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 00634 } 00635 00636 int 00637 spa_prop_set(spa_t *spa, nvlist_t *nvp) 00638 { 00639 int error; 00640 nvpair_t *elem = NULL; 00641 boolean_t need_sync = B_FALSE; 00642 00643 if ((error = spa_prop_validate(spa, nvp)) != 0) 00644 return (error); 00645 00646 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 00647 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 00648 00649 if (prop == ZPOOL_PROP_CACHEFILE || 00650 prop == ZPOOL_PROP_ALTROOT || 00651 prop == ZPOOL_PROP_READONLY) 00652 continue; 00653 00654 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { 00655 uint64_t ver; 00656 00657 if (prop == ZPOOL_PROP_VERSION) { 00658 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 00659 } else { 00660 ASSERT(zpool_prop_feature(nvpair_name(elem))); 00661 ver = SPA_VERSION_FEATURES; 00662 need_sync = B_TRUE; 00663 } 00664 00665 /* Save time if the version is already set. 
*/ 00666 if (ver == spa_version(spa)) 00667 continue; 00668 00669 /* 00670 * In addition to the pool directory object, we might 00671 * create the pool properties object, the features for 00672 * read object, the features for write object, or the 00673 * feature descriptions object. 00674 */ 00675 error = dsl_sync_task_do(spa_get_dsl(spa), NULL, 00676 spa_sync_version, spa, &ver, 6); 00677 if (error) 00678 return (error); 00679 continue; 00680 } 00681 00682 need_sync = B_TRUE; 00683 break; 00684 } 00685 00686 if (need_sync) { 00687 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 00688 spa, nvp, 6)); 00689 } 00690 00691 return (0); 00692 } 00693 00697 void 00698 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 00699 { 00700 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 00701 VERIFY(zap_remove(spa->spa_meta_objset, 00702 spa->spa_pool_props_object, 00703 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 00704 spa->spa_bootfs = 0; 00705 } 00706 } 00707 00708 /*ARGSUSED*/ 00709 static int 00710 spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) 00711 { 00712 spa_t *spa = arg1; 00713 uint64_t *newguid = arg2; 00714 vdev_t *rvd = spa->spa_root_vdev; 00715 uint64_t vdev_state; 00716 00717 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 00718 vdev_state = rvd->vdev_state; 00719 spa_config_exit(spa, SCL_STATE, FTAG); 00720 00721 if (vdev_state != VDEV_STATE_HEALTHY) 00722 return (ENXIO); 00723 00724 ASSERT3U(spa_guid(spa), !=, *newguid); 00725 00726 return (0); 00727 } 00728 00729 static void 00730 spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) 00731 { 00732 spa_t *spa = arg1; 00733 uint64_t *newguid = arg2; 00734 uint64_t oldguid; 00735 vdev_t *rvd = spa->spa_root_vdev; 00736 00737 oldguid = spa_guid(spa); 00738 00739 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 00740 rvd->vdev_guid = *newguid; 00741 rvd->vdev_guid_sum += (*newguid - oldguid); 00742 vdev_config_dirty(rvd); 00743 spa_config_exit(spa, SCL_STATE, FTAG); 00744 00745 #ifdef __FreeBSD__ 00746 /* 00747 * TODO: until recent illumos logging changes are merged 00748 * log reguid as pool property change 00749 */ 00750 spa_history_log_internal(LOG_POOL_PROPSET, spa, tx, 00751 "guid change old=%llu new=%llu", oldguid, *newguid); 00752 #else 00753 spa_history_log_internal(spa, "guid change", tx, "old=%lld new=%lld", 00754 oldguid, *newguid); 00755 #endif 00756 } 00757 00767 int 00768 spa_change_guid(spa_t *spa) 00769 { 00770 int error; 00771 uint64_t guid; 00772 00773 mutex_enter(&spa_namespace_lock); 00774 guid = spa_generate_guid(NULL); 00775 00776 error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check, 00777 spa_change_guid_sync, spa, &guid, 5); 00778 00779 if (error == 0) { 00780 spa_config_sync(spa, B_FALSE, B_TRUE); 00781 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 00782 } 00783 00784 mutex_exit(&spa_namespace_lock); 00785 00786 return (error); 00787 } 00788 00789 /* 00790 * ========================================================================== 00791 * SPA state manipulation (open/create/destroy/import/export) 00792 * ========================================================================== 00793 */ 00794 00795 static int 00796 spa_error_entry_compare(const void *a, const void *b) 00797 { 00798 spa_error_entry_t *sa = (spa_error_entry_t *)a; 00799 spa_error_entry_t *sb = (spa_error_entry_t *)b; 00800 int ret; 00801 00802 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 00803 sizeof (zbookmark_t)); 00804 00805 if (ret < 0) 00806 return 
(-1); 00807 else if (ret > 0) 00808 return (1); 00809 else 00810 return (0); 00811 } 00812 00817 void 00818 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 00819 { 00820 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 00821 00822 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 00823 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 00824 00825 avl_create(&spa->spa_errlist_scrub, 00826 spa_error_entry_compare, sizeof (spa_error_entry_t), 00827 offsetof(spa_error_entry_t, se_avl)); 00828 avl_create(&spa->spa_errlist_last, 00829 spa_error_entry_compare, sizeof (spa_error_entry_t), 00830 offsetof(spa_error_entry_t, se_avl)); 00831 } 00832 00833 static taskq_t * 00834 spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, 00835 uint_t value) 00836 { 00837 uint_t flags = TASKQ_PREPOPULATE; 00838 boolean_t batch = B_FALSE; 00839 00840 switch (mode) { 00841 case zti_mode_null: 00842 return (NULL); /* no taskq needed */ 00843 00844 case zti_mode_fixed: 00845 ASSERT3U(value, >=, 1); 00846 value = MAX(value, 1); 00847 break; 00848 00849 case zti_mode_batch: 00850 batch = B_TRUE; 00851 flags |= TASKQ_THREADS_CPU_PCT; 00852 value = zio_taskq_batch_pct; 00853 break; 00854 00855 case zti_mode_online_percent: 00856 flags |= TASKQ_THREADS_CPU_PCT; 00857 break; 00858 00859 default: 00860 panic("unrecognized mode for %s taskq (%u:%u) in " 00861 "spa_activate()", 00862 name, mode, value); 00863 break; 00864 } 00865 00866 #ifdef SYSDC 00867 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 00868 if (batch) 00869 flags |= TASKQ_DC_BATCH; 00870 00871 return (taskq_create_sysdc(name, value, 50, INT_MAX, 00872 spa->spa_proc, zio_taskq_basedc, flags)); 00873 } 00874 #endif 00875 return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, 00876 spa->spa_proc, flags)); 00877 } 00878 00879 static void 00880 spa_create_zio_taskqs(spa_t *spa) 00881 { 00882 for (int t = 0; t < ZIO_TYPES; t++) { 00883 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 00884 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 00885 enum zti_modes mode = ztip->zti_mode; 00886 uint_t value = ztip->zti_value; 00887 char name[32]; 00888 00889 (void) snprintf(name, sizeof (name), 00890 "%s_%s", zio_type_name[t], zio_taskq_types[q]); 00891 00892 spa->spa_zio_taskq[t][q] = 00893 spa_taskq_create(spa, name, mode, value); 00894 } 00895 } 00896 } 00897 00898 #ifdef _KERNEL 00899 #ifdef SPA_PROCESS 00900 static void 00901 spa_thread(void *arg) 00902 { 00903 callb_cpr_t cprinfo; 00904 00905 spa_t *spa = arg; 00906 user_t *pu = PTOU(curproc); 00907 00908 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 00909 spa->spa_name); 00910 00911 ASSERT(curproc != &p0); 00912 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 00913 "zpool-%s", spa->spa_name); 00914 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 00915 00916 #ifdef PSRSET_BIND 00917 /* bind this thread to the requested psrset */ 00918 if (zio_taskq_psrset_bind != PS_NONE) { 00919 pool_lock(); 00920 mutex_enter(&cpu_lock); 00921 mutex_enter(&pidlock); 00922 mutex_enter(&curproc->p_lock); 00923 00924 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 00925 0, NULL, NULL) == 0) { 00926 curthread->t_bind_pset = zio_taskq_psrset_bind; 00927 } else { 00928 cmn_err(CE_WARN, 00929 "Couldn't bind process for zfs pool \"%s\" to " 00930 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 00931 } 00932 00933 mutex_exit(&curproc->p_lock); 00934 mutex_exit(&pidlock); 00935 mutex_exit(&cpu_lock); 00936 pool_unlock(); 00937 } 00938 
#endif 00939 00940 #ifdef SYSDC 00941 if (zio_taskq_sysdc) { 00942 sysdc_thread_enter(curthread, 100, 0); 00943 } 00944 #endif 00945 00946 spa->spa_proc = curproc; 00947 spa->spa_did = curthread->t_did; 00948 00949 spa_create_zio_taskqs(spa); 00950 00951 mutex_enter(&spa->spa_proc_lock); 00952 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 00953 00954 spa->spa_proc_state = SPA_PROC_ACTIVE; 00955 cv_broadcast(&spa->spa_proc_cv); 00956 00957 CALLB_CPR_SAFE_BEGIN(&cprinfo); 00958 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 00959 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 00960 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 00961 00962 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 00963 spa->spa_proc_state = SPA_PROC_GONE; 00964 spa->spa_proc = &p0; 00965 cv_broadcast(&spa->spa_proc_cv); 00966 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 00967 00968 mutex_enter(&curproc->p_lock); 00969 lwp_exit(); 00970 } 00971 #endif /* SPA_PROCESS */ 00972 #endif 00973 00977 static void 00978 spa_activate(spa_t *spa, int mode) 00979 { 00980 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 00981 00982 spa->spa_state = POOL_STATE_ACTIVE; 00983 spa->spa_mode = mode; 00984 00985 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 00986 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 00987 00988 /* Try to create a covering process */ 00989 mutex_enter(&spa->spa_proc_lock); 00990 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 00991 ASSERT(spa->spa_proc == &p0); 00992 spa->spa_did = 0; 00993 00994 #ifdef SPA_PROCESS 00995 /* Only create a process if we're going to be around a while. */ 00996 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 00997 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 00998 NULL, 0) == 0) { 00999 spa->spa_proc_state = SPA_PROC_CREATED; 01000 while (spa->spa_proc_state == SPA_PROC_CREATED) { 01001 cv_wait(&spa->spa_proc_cv, 01002 &spa->spa_proc_lock); 01003 } 01004 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 01005 ASSERT(spa->spa_proc != &p0); 01006 ASSERT(spa->spa_did != 0); 01007 } else { 01008 #ifdef _KERNEL 01009 cmn_err(CE_WARN, 01010 "Couldn't create process for zfs pool \"%s\"\n", 01011 spa->spa_name); 01012 #endif 01013 } 01014 } 01015 #endif /* SPA_PROCESS */ 01016 mutex_exit(&spa->spa_proc_lock); 01017 01018 /* If we didn't create a process, we need to create our taskqs. */ 01019 ASSERT(spa->spa_proc == &p0); 01020 if (spa->spa_proc == &p0) { 01021 spa_create_zio_taskqs(spa); 01022 } 01023 01024 /* 01025 * Start TRIM thread. 
01026 */ 01027 trim_thread_create(spa); 01028 01029 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 01030 offsetof(vdev_t, vdev_config_dirty_node)); 01031 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 01032 offsetof(vdev_t, vdev_state_dirty_node)); 01033 01034 txg_list_create(&spa->spa_vdev_txg_list, 01035 offsetof(struct vdev, vdev_txg_node)); 01036 01037 avl_create(&spa->spa_errlist_scrub, 01038 spa_error_entry_compare, sizeof (spa_error_entry_t), 01039 offsetof(spa_error_entry_t, se_avl)); 01040 avl_create(&spa->spa_errlist_last, 01041 spa_error_entry_compare, sizeof (spa_error_entry_t), 01042 offsetof(spa_error_entry_t, se_avl)); 01043 } 01044 01048 static void 01049 spa_deactivate(spa_t *spa) 01050 { 01051 ASSERT(spa->spa_sync_on == B_FALSE); 01052 ASSERT(spa->spa_dsl_pool == NULL); 01053 ASSERT(spa->spa_root_vdev == NULL); 01054 ASSERT(spa->spa_async_zio_root == NULL); 01055 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 01056 01057 /* 01058 * Stop TRIM thread in case spa_unload() wasn't called directly 01059 * before spa_deactivate(). 01060 */ 01061 trim_thread_destroy(spa); 01062 01063 txg_list_destroy(&spa->spa_vdev_txg_list); 01064 01065 list_destroy(&spa->spa_config_dirty_list); 01066 list_destroy(&spa->spa_state_dirty_list); 01067 01068 for (int t = 0; t < ZIO_TYPES; t++) { 01069 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 01070 if (spa->spa_zio_taskq[t][q] != NULL) 01071 taskq_destroy(spa->spa_zio_taskq[t][q]); 01072 spa->spa_zio_taskq[t][q] = NULL; 01073 } 01074 } 01075 01076 metaslab_class_destroy(spa->spa_normal_class); 01077 spa->spa_normal_class = NULL; 01078 01079 metaslab_class_destroy(spa->spa_log_class); 01080 spa->spa_log_class = NULL; 01081 01082 /* 01083 * If this was part of an import or the open otherwise failed, we may 01084 * still have errors left in the queues. Empty them just in case. 01085 */ 01086 spa_errlog_drain(spa); 01087 01088 avl_destroy(&spa->spa_errlist_scrub); 01089 avl_destroy(&spa->spa_errlist_last); 01090 01091 spa->spa_state = POOL_STATE_UNINITIALIZED; 01092 01093 mutex_enter(&spa->spa_proc_lock); 01094 if (spa->spa_proc_state != SPA_PROC_NONE) { 01095 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 01096 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 01097 cv_broadcast(&spa->spa_proc_cv); 01098 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 01099 ASSERT(spa->spa_proc != &p0); 01100 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 01101 } 01102 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 01103 spa->spa_proc_state = SPA_PROC_NONE; 01104 } 01105 ASSERT(spa->spa_proc == &p0); 01106 mutex_exit(&spa->spa_proc_lock); 01107 01108 #ifdef SPA_PROCESS 01109 /* 01110 * We want to make sure spa_thread() has actually exited the ZFS 01111 * module, so that the module can't be unloaded out from underneath 01112 * it. 
01113 */ 01114 if (spa->spa_did != 0) { 01115 thread_join(spa->spa_did); 01116 spa->spa_did = 0; 01117 } 01118 #endif /* SPA_PROCESS */ 01119 } 01120 01127 static int 01128 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 01129 uint_t id, int atype) 01130 { 01131 nvlist_t **child; 01132 uint_t children; 01133 int error; 01134 01135 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 01136 return (error); 01137 01138 if ((*vdp)->vdev_ops->vdev_op_leaf) 01139 return (0); 01140 01141 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 01142 &child, &children); 01143 01144 if (error == ENOENT) 01145 return (0); 01146 01147 if (error) { 01148 vdev_free(*vdp); 01149 *vdp = NULL; 01150 return (EINVAL); 01151 } 01152 01153 for (int c = 0; c < children; c++) { 01154 vdev_t *vd; 01155 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 01156 atype)) != 0) { 01157 vdev_free(*vdp); 01158 *vdp = NULL; 01159 return (error); 01160 } 01161 } 01162 01163 ASSERT(*vdp != NULL); 01164 01165 return (0); 01166 } 01167 01171 static void 01172 spa_unload(spa_t *spa) 01173 { 01174 int i; 01175 01176 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 01177 01178 /* 01179 * Stop TRIM thread. 01180 */ 01181 trim_thread_destroy(spa); 01182 01183 /* 01184 * Stop async tasks. 01185 */ 01186 spa_async_suspend(spa); 01187 01188 /* 01189 * Stop syncing. 01190 */ 01191 if (spa->spa_sync_on) { 01192 txg_sync_stop(spa->spa_dsl_pool); 01193 spa->spa_sync_on = B_FALSE; 01194 } 01195 01196 /* 01197 * Wait for any outstanding async I/O to complete. 01198 */ 01199 if (spa->spa_async_zio_root != NULL) { 01200 (void) zio_wait(spa->spa_async_zio_root); 01201 spa->spa_async_zio_root = NULL; 01202 } 01203 01204 bpobj_close(&spa->spa_deferred_bpobj); 01205 01206 /* 01207 * Close the dsl pool. 01208 */ 01209 if (spa->spa_dsl_pool) { 01210 dsl_pool_close(spa->spa_dsl_pool); 01211 spa->spa_dsl_pool = NULL; 01212 spa->spa_meta_objset = NULL; 01213 } 01214 01215 ddt_unload(spa); 01216 01217 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 01218 01219 /* 01220 * Drop and purge level 2 cache 01221 */ 01222 spa_l2cache_drop(spa); 01223 01224 /* 01225 * Close all vdevs. 
01226 */ 01227 if (spa->spa_root_vdev) 01228 vdev_free(spa->spa_root_vdev); 01229 ASSERT(spa->spa_root_vdev == NULL); 01230 01231 for (i = 0; i < spa->spa_spares.sav_count; i++) 01232 vdev_free(spa->spa_spares.sav_vdevs[i]); 01233 if (spa->spa_spares.sav_vdevs) { 01234 kmem_free(spa->spa_spares.sav_vdevs, 01235 spa->spa_spares.sav_count * sizeof (void *)); 01236 spa->spa_spares.sav_vdevs = NULL; 01237 } 01238 if (spa->spa_spares.sav_config) { 01239 nvlist_free(spa->spa_spares.sav_config); 01240 spa->spa_spares.sav_config = NULL; 01241 } 01242 spa->spa_spares.sav_count = 0; 01243 01244 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 01245 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 01246 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 01247 } 01248 if (spa->spa_l2cache.sav_vdevs) { 01249 kmem_free(spa->spa_l2cache.sav_vdevs, 01250 spa->spa_l2cache.sav_count * sizeof (void *)); 01251 spa->spa_l2cache.sav_vdevs = NULL; 01252 } 01253 if (spa->spa_l2cache.sav_config) { 01254 nvlist_free(spa->spa_l2cache.sav_config); 01255 spa->spa_l2cache.sav_config = NULL; 01256 } 01257 spa->spa_l2cache.sav_count = 0; 01258 01259 spa->spa_async_suspended = 0; 01260 01261 if (spa->spa_comment != NULL) { 01262 spa_strfree(spa->spa_comment); 01263 spa->spa_comment = NULL; 01264 } 01265 01266 spa_config_exit(spa, SCL_ALL, FTAG); 01267 } 01268 01275 static void 01276 spa_load_spares(spa_t *spa) 01277 { 01278 nvlist_t **spares; 01279 uint_t nspares; 01280 int i; 01281 vdev_t *vd, *tvd; 01282 01283 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 01284 01285 /* 01286 * First, close and free any existing spare vdevs. 01287 */ 01288 for (i = 0; i < spa->spa_spares.sav_count; i++) { 01289 vd = spa->spa_spares.sav_vdevs[i]; 01290 01291 /* Undo the call to spa_activate() below */ 01292 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 01293 B_FALSE)) != NULL && tvd->vdev_isspare) 01294 spa_spare_remove(tvd); 01295 vdev_close(vd); 01296 vdev_free(vd); 01297 } 01298 01299 if (spa->spa_spares.sav_vdevs) 01300 kmem_free(spa->spa_spares.sav_vdevs, 01301 spa->spa_spares.sav_count * sizeof (void *)); 01302 01303 if (spa->spa_spares.sav_config == NULL) 01304 nspares = 0; 01305 else 01306 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 01307 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 01308 01309 spa->spa_spares.sav_count = (int)nspares; 01310 spa->spa_spares.sav_vdevs = NULL; 01311 01312 if (nspares == 0) 01313 return; 01314 01315 /* 01316 * Construct the array of vdevs, opening them to get status in the 01317 * process. For each spare, there is potentially two different vdev_t 01318 * structures associated with it: one in the list of spares (used only 01319 * for basic validation purposes) and one in the active vdev 01320 * configuration (if it's spared in). During this phase we open and 01321 * validate each vdev on the spare list. If the vdev also exists in the 01322 * active configuration, then we also mark this vdev as an active spare. 
01323 */ 01324 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 01325 KM_SLEEP); 01326 for (i = 0; i < spa->spa_spares.sav_count; i++) { 01327 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 01328 VDEV_ALLOC_SPARE) == 0); 01329 ASSERT(vd != NULL); 01330 01331 spa->spa_spares.sav_vdevs[i] = vd; 01332 01333 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 01334 B_FALSE)) != NULL) { 01335 if (!tvd->vdev_isspare) 01336 spa_spare_add(tvd); 01337 01338 /* 01339 * We only mark the spare active if we were successfully 01340 * able to load the vdev. Otherwise, importing a pool 01341 * with a bad active spare would result in strange 01342 * behavior, because multiple pool would think the spare 01343 * is actively in use. 01344 * 01345 * There is a vulnerability here to an equally bizarre 01346 * circumstance, where a dead active spare is later 01347 * brought back to life (onlined or otherwise). Given 01348 * the rarity of this scenario, and the extra complexity 01349 * it adds, we ignore the possibility. 01350 */ 01351 if (!vdev_is_dead(tvd)) 01352 spa_spare_activate(tvd); 01353 } 01354 01355 vd->vdev_top = vd; 01356 vd->vdev_aux = &spa->spa_spares; 01357 01358 if (vdev_open(vd) != 0) 01359 continue; 01360 01361 if (vdev_validate_aux(vd) == 0) 01362 spa_spare_add(vd); 01363 } 01364 01365 /* 01366 * Recompute the stashed list of spares, with status information 01367 * this time. 01368 */ 01369 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 01370 DATA_TYPE_NVLIST_ARRAY) == 0); 01371 01372 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 01373 KM_SLEEP); 01374 for (i = 0; i < spa->spa_spares.sav_count; i++) 01375 spares[i] = vdev_config_generate(spa, 01376 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 01377 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 01378 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 01379 for (i = 0; i < spa->spa_spares.sav_count; i++) 01380 nvlist_free(spares[i]); 01381 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 01382 } 01383 01392 static void 01393 spa_load_l2cache(spa_t *spa) 01394 { 01395 nvlist_t **l2cache; 01396 uint_t nl2cache; 01397 int i, j, oldnvdevs; 01398 uint64_t guid; 01399 vdev_t *vd, **oldvdevs, **newvdevs; 01400 spa_aux_vdev_t *sav = &spa->spa_l2cache; 01401 01402 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 01403 01404 if (sav->sav_config != NULL) { 01405 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 01406 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 01407 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 01408 } else { 01409 nl2cache = 0; 01410 } 01411 01412 oldvdevs = sav->sav_vdevs; 01413 oldnvdevs = sav->sav_count; 01414 sav->sav_vdevs = NULL; 01415 sav->sav_count = 0; 01416 01417 /* 01418 * Process new nvlist of vdevs. 01419 */ 01420 for (i = 0; i < nl2cache; i++) { 01421 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 01422 &guid) == 0); 01423 01424 newvdevs[i] = NULL; 01425 for (j = 0; j < oldnvdevs; j++) { 01426 vd = oldvdevs[j]; 01427 if (vd != NULL && guid == vd->vdev_guid) { 01428 /* 01429 * Retain previous vdev for add/remove ops. 
01430 */ 01431 newvdevs[i] = vd; 01432 oldvdevs[j] = NULL; 01433 break; 01434 } 01435 } 01436 01437 if (newvdevs[i] == NULL) { 01438 /* 01439 * Create new vdev 01440 */ 01441 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 01442 VDEV_ALLOC_L2CACHE) == 0); 01443 ASSERT(vd != NULL); 01444 newvdevs[i] = vd; 01445 01446 /* 01447 * Commit this vdev as an l2cache device, 01448 * even if it fails to open. 01449 */ 01450 spa_l2cache_add(vd); 01451 01452 vd->vdev_top = vd; 01453 vd->vdev_aux = sav; 01454 01455 spa_l2cache_activate(vd); 01456 01457 if (vdev_open(vd) != 0) 01458 continue; 01459 01460 (void) vdev_validate_aux(vd); 01461 01462 if (!vdev_is_dead(vd)) 01463 l2arc_add_vdev(spa, vd); 01464 } 01465 } 01466 01467 /* 01468 * Purge vdevs that were dropped 01469 */ 01470 for (i = 0; i < oldnvdevs; i++) { 01471 uint64_t pool; 01472 01473 vd = oldvdevs[i]; 01474 if (vd != NULL) { 01475 ASSERT(vd->vdev_isl2cache); 01476 01477 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 01478 pool != 0ULL && l2arc_vdev_present(vd)) 01479 l2arc_remove_vdev(vd); 01480 vdev_clear_stats(vd); 01481 vdev_free(vd); 01482 } 01483 } 01484 01485 if (oldvdevs) 01486 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 01487 01488 if (sav->sav_config == NULL) 01489 goto out; 01490 01491 sav->sav_vdevs = newvdevs; 01492 sav->sav_count = (int)nl2cache; 01493 01494 /* 01495 * Recompute the stashed list of l2cache devices, with status 01496 * information this time. 01497 */ 01498 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 01499 DATA_TYPE_NVLIST_ARRAY) == 0); 01500 01501 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 01502 for (i = 0; i < sav->sav_count; i++) 01503 l2cache[i] = vdev_config_generate(spa, 01504 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 01505 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 01506 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 01507 out: 01508 for (i = 0; i < sav->sav_count; i++) 01509 nvlist_free(l2cache[i]); 01510 if (sav->sav_count) 01511 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 01512 } 01513 01514 static int 01515 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 01516 { 01517 dmu_buf_t *db; 01518 char *packed = NULL; 01519 size_t nvsize = 0; 01520 int error; 01521 *value = NULL; 01522 01523 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 01524 nvsize = *(uint64_t *)db->db_data; 01525 dmu_buf_rele(db, FTAG); 01526 01527 packed = kmem_alloc(nvsize, KM_SLEEP); 01528 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 01529 DMU_READ_PREFETCH); 01530 if (error == 0) 01531 error = nvlist_unpack(packed, nvsize, value, 0); 01532 kmem_free(packed, nvsize); 01533 01534 return (error); 01535 } 01536 01541 static void 01542 spa_check_removed(vdev_t *vd) 01543 { 01544 for (int c = 0; c < vd->vdev_children; c++) 01545 spa_check_removed(vd->vdev_child[c]); 01546 01547 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 01548 zfs_post_autoreplace(vd->vdev_spa, vd); 01549 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 01550 } 01551 } 01552 01556 static boolean_t 01557 spa_config_valid(spa_t *spa, nvlist_t *config) 01558 { 01559 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 01560 nvlist_t *nv; 01561 01562 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 01563 01564 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 01565 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 01566 01567 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 01568 01569 /* 01570 * If 
we're doing a normal import, then build up any additional 01571 * diagnostic information about missing devices in this config. 01572 * We'll pass this up to the user for further processing. 01573 */ 01574 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 01575 nvlist_t **child, *nv; 01576 uint64_t idx = 0; 01577 01578 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 01579 KM_SLEEP); 01580 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 01581 01582 for (int c = 0; c < rvd->vdev_children; c++) { 01583 vdev_t *tvd = rvd->vdev_child[c]; 01584 vdev_t *mtvd = mrvd->vdev_child[c]; 01585 01586 if (tvd->vdev_ops == &vdev_missing_ops && 01587 mtvd->vdev_ops != &vdev_missing_ops && 01588 mtvd->vdev_islog) 01589 child[idx++] = vdev_config_generate(spa, mtvd, 01590 B_FALSE, 0); 01591 } 01592 01593 if (idx) { 01594 VERIFY(nvlist_add_nvlist_array(nv, 01595 ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 01596 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 01597 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 01598 01599 for (int i = 0; i < idx; i++) 01600 nvlist_free(child[i]); 01601 } 01602 nvlist_free(nv); 01603 kmem_free(child, rvd->vdev_children * sizeof (char **)); 01604 } 01605 01606 /* 01607 * Compare the root vdev tree with the information we have 01608 * from the MOS config (mrvd). Check each top-level vdev 01609 * with the corresponding MOS config top-level (mtvd). 01610 */ 01611 for (int c = 0; c < rvd->vdev_children; c++) { 01612 vdev_t *tvd = rvd->vdev_child[c]; 01613 vdev_t *mtvd = mrvd->vdev_child[c]; 01614 01615 /* 01616 * Resolve any "missing" vdevs in the current configuration. 01617 * If we find that the MOS config has more accurate information 01618 * about the top-level vdev then use that vdev instead. 01619 */ 01620 if (tvd->vdev_ops == &vdev_missing_ops && 01621 mtvd->vdev_ops != &vdev_missing_ops) { 01622 01623 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 01624 continue; 01625 01626 /* 01627 * Device specific actions. 01628 */ 01629 if (mtvd->vdev_islog) { 01630 spa_set_log_state(spa, SPA_LOG_CLEAR); 01631 } else { 01632 /* 01633 * XXX - once we have 'readonly' pool 01634 * support we should be able to handle 01635 * missing data devices by transitioning 01636 * the pool to readonly. 01637 */ 01638 continue; 01639 } 01640 01641 /* 01642 * Swap the missing vdev with the data we were 01643 * able to obtain from the MOS config. 01644 */ 01645 vdev_remove_child(rvd, tvd); 01646 vdev_remove_child(mrvd, mtvd); 01647 01648 vdev_add_child(rvd, mtvd); 01649 vdev_add_child(mrvd, tvd); 01650 01651 spa_config_exit(spa, SCL_ALL, FTAG); 01652 vdev_load(mtvd); 01653 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 01654 01655 vdev_reopen(rvd); 01656 } else if (mtvd->vdev_islog) { 01657 /* 01658 * Load the slog device's state from the MOS config 01659 * since it's possible that the label does not 01660 * contain the most up-to-date information. 01661 */ 01662 vdev_load_log_state(tvd, mtvd); 01663 vdev_reopen(tvd); 01664 } 01665 } 01666 vdev_free(mrvd); 01667 spa_config_exit(spa, SCL_ALL, FTAG); 01668 01669 /* 01670 * Ensure we were able to validate the config. 
01671 */ 01672 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 01673 } 01674 01678 static int 01679 spa_check_logs(spa_t *spa) 01680 { 01681 switch (spa->spa_log_state) { 01682 case SPA_LOG_MISSING: 01683 /* need to recheck in case slog has been restored */ 01684 case SPA_LOG_UNKNOWN: 01685 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 01686 DS_FIND_CHILDREN)) { 01687 spa_set_log_state(spa, SPA_LOG_MISSING); 01688 return (1); 01689 } 01690 break; 01691 } 01692 return (0); 01693 } 01694 01695 static boolean_t 01696 spa_passivate_log(spa_t *spa) 01697 { 01698 vdev_t *rvd = spa->spa_root_vdev; 01699 boolean_t slog_found = B_FALSE; 01700 01701 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 01702 01703 if (!spa_has_slogs(spa)) 01704 return (B_FALSE); 01705 01706 for (int c = 0; c < rvd->vdev_children; c++) { 01707 vdev_t *tvd = rvd->vdev_child[c]; 01708 metaslab_group_t *mg = tvd->vdev_mg; 01709 01710 if (tvd->vdev_islog) { 01711 metaslab_group_passivate(mg); 01712 slog_found = B_TRUE; 01713 } 01714 } 01715 01716 return (slog_found); 01717 } 01718 01719 static void 01720 spa_activate_log(spa_t *spa) 01721 { 01722 vdev_t *rvd = spa->spa_root_vdev; 01723 01724 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 01725 01726 for (int c = 0; c < rvd->vdev_children; c++) { 01727 vdev_t *tvd = rvd->vdev_child[c]; 01728 metaslab_group_t *mg = tvd->vdev_mg; 01729 01730 if (tvd->vdev_islog) 01731 metaslab_group_activate(mg); 01732 } 01733 } 01734 01735 int 01736 spa_offline_log(spa_t *spa) 01737 { 01738 int error = 0; 01739 01740 if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 01741 NULL, DS_FIND_CHILDREN)) == 0) { 01742 01743 /* 01744 * We successfully offlined the log device, sync out the 01745 * current txg so that the "stubby" block can be removed 01746 * by zil_sync(). 
01747 */ 01748 txg_wait_synced(spa->spa_dsl_pool, 0); 01749 } 01750 return (error); 01751 } 01752 01753 static void 01754 spa_aux_check_removed(spa_aux_vdev_t *sav) 01755 { 01756 int i; 01757 01758 for (i = 0; i < sav->sav_count; i++) 01759 spa_check_removed(sav->sav_vdevs[i]); 01760 } 01761 01765 void 01766 spa_claim_notify(zio_t *zio) 01767 { 01768 spa_t *spa = zio->io_spa; 01769 01770 if (zio->io_error) 01771 return; 01772 01773 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 01774 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 01775 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 01776 mutex_exit(&spa->spa_props_lock); 01777 } 01778 01779 typedef struct spa_load_error { 01780 uint64_t sle_meta_count; 01781 uint64_t sle_data_count; 01782 } spa_load_error_t; 01783 01784 static void 01785 spa_load_verify_done(zio_t *zio) 01786 { 01787 blkptr_t *bp = zio->io_bp; 01788 spa_load_error_t *sle = zio->io_private; 01789 dmu_object_type_t type = BP_GET_TYPE(bp); 01790 int error = zio->io_error; 01791 01792 if (error) { 01793 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 01794 type != DMU_OT_INTENT_LOG) 01795 atomic_add_64(&sle->sle_meta_count, 1); 01796 else 01797 atomic_add_64(&sle->sle_data_count, 1); 01798 } 01799 zio_data_buf_free(zio->io_data, zio->io_size); 01800 } 01801 01802 /*ARGSUSED*/ 01803 static int 01804 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 01805 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 01806 { 01807 if (bp != NULL) { 01808 zio_t *rio = arg; 01809 size_t size = BP_GET_PSIZE(bp); 01810 void *data = zio_data_buf_alloc(size); 01811 01812 zio_nowait(zio_read(rio, spa, bp, data, size, 01813 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 01814 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 01815 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 01816 } 01817 return (0); 01818 } 01819 01820 static int 01821 spa_load_verify(spa_t *spa) 01822 { 01823 zio_t *rio; 01824 spa_load_error_t sle = { 0 }; 01825 zpool_rewind_policy_t policy; 01826 boolean_t verify_ok = B_FALSE; 01827 int error; 01828 01829 zpool_get_rewind_policy(spa->spa_config, &policy); 01830 01831 if (policy.zrp_request & ZPOOL_NEVER_REWIND) 01832 return (0); 01833 01834 rio = zio_root(spa, NULL, &sle, 01835 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 01836 01837 error = traverse_pool(spa, spa->spa_verify_min_txg, 01838 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 01839 01840 (void) zio_wait(rio); 01841 01842 spa->spa_load_meta_errors = sle.sle_meta_count; 01843 spa->spa_load_data_errors = sle.sle_data_count; 01844 01845 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 01846 sle.sle_data_count <= policy.zrp_maxdata) { 01847 int64_t loss = 0; 01848 01849 verify_ok = B_TRUE; 01850 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 01851 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 01852 01853 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 01854 VERIFY(nvlist_add_uint64(spa->spa_load_info, 01855 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 01856 VERIFY(nvlist_add_int64(spa->spa_load_info, 01857 ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 01858 VERIFY(nvlist_add_uint64(spa->spa_load_info, 01859 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 01860 } else { 01861 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 01862 } 01863 01864 if (error) { 01865 if (error != ENXIO && error != EIO) 01866 error = EIO; 01867 return (error); 01868 } 01869 01870 return (verify_ok ? 
0 : EIO); 01871 } 01872 01876 static void 01877 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 01878 { 01879 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 01880 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 01881 } 01882 01886 static int 01887 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 01888 { 01889 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 01890 name, sizeof (uint64_t), 1, val)); 01891 } 01892 01893 static int 01894 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 01895 { 01896 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 01897 return (err); 01898 } 01899 01916 static void 01917 spa_try_repair(spa_t *spa, nvlist_t *config) 01918 { 01919 uint_t extracted; 01920 uint64_t *glist; 01921 uint_t i, gcount; 01922 nvlist_t *nvl; 01923 vdev_t **vd; 01924 boolean_t attempt_reopen; 01925 01926 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 01927 return; 01928 01929 /* check that the config is complete */ 01930 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 01931 &glist, &gcount) != 0) 01932 return; 01933 01934 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 01935 01936 /* attempt to online all the vdevs & validate */ 01937 attempt_reopen = B_TRUE; 01938 for (i = 0; i < gcount; i++) { 01939 if (glist[i] == 0) /* vdev is hole */ 01940 continue; 01941 01942 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 01943 if (vd[i] == NULL) { 01944 /* 01945 * Don't bother attempting to reopen the disks; 01946 * just do the split. 01947 */ 01948 attempt_reopen = B_FALSE; 01949 } else { 01950 /* attempt to re-online it */ 01951 vd[i]->vdev_offline = B_FALSE; 01952 } 01953 } 01954 01955 if (attempt_reopen) { 01956 vdev_reopen(spa->spa_root_vdev); 01957 01958 /* check each device to see what state it's in */ 01959 for (extracted = 0, i = 0; i < gcount; i++) { 01960 if (vd[i] != NULL && 01961 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 01962 break; 01963 ++extracted; 01964 } 01965 } 01966 01967 /* 01968 * If every disk has been moved to the new pool, or if we never 01969 * even attempted to look at them, then we split them off for 01970 * good. 01971 */ 01972 if (!attempt_reopen || gcount == extracted) { 01973 for (i = 0; i < gcount; i++) 01974 if (vd[i] != NULL) 01975 vdev_split(vd[i]); 01976 vdev_reopen(spa->spa_root_vdev); 01977 } 01978 01979 kmem_free(vd, gcount * sizeof (vdev_t *)); 01980 } 01981 01982 static int 01983 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 01984 boolean_t mosconfig) 01985 { 01986 nvlist_t *config = spa->spa_config; 01987 char *ereport = FM_EREPORT_ZFS_POOL; 01988 char *comment; 01989 int error; 01990 uint64_t pool_guid; 01991 nvlist_t *nvl; 01992 01993 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 01994 return (EINVAL); 01995 01996 ASSERT(spa->spa_comment == NULL); 01997 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 01998 spa->spa_comment = spa_strdup(comment); 01999 02000 /* 02001 * Versioning wasn't explicitly added to the label until later, so if 02002 * it's not present treat it as the initial version. 
02003 */ 02004 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 02005 &spa->spa_ubsync.ub_version) != 0) 02006 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 02007 02008 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 02009 &spa->spa_config_txg); 02010 02011 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 02012 spa_guid_exists(pool_guid, 0)) { 02013 error = EEXIST; 02014 } else { 02015 spa->spa_config_guid = pool_guid; 02016 02017 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 02018 &nvl) == 0) { 02019 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 02020 KM_SLEEP) == 0); 02021 } 02022 02023 nvlist_free(spa->spa_load_info); 02024 spa->spa_load_info = fnvlist_alloc(); 02025 02026 gethrestime(&spa->spa_loaded_ts); 02027 error = spa_load_impl(spa, pool_guid, config, state, type, 02028 mosconfig, &ereport); 02029 } 02030 02031 spa->spa_minref = refcount_count(&spa->spa_refcount); 02032 if (error) { 02033 if (error != EEXIST) { 02034 spa->spa_loaded_ts.tv_sec = 0; 02035 spa->spa_loaded_ts.tv_nsec = 0; 02036 } 02037 if (error != EBADF) { 02038 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 02039 } 02040 } 02041 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 02042 spa->spa_ena = 0; 02043 02044 return (error); 02045 } 02046 02051 static int 02052 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 02053 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 02054 char **ereport) 02055 { 02056 int error = 0; 02057 nvlist_t *nvroot = NULL; 02058 nvlist_t *label; 02059 vdev_t *rvd; 02060 uberblock_t *ub = &spa->spa_uberblock; 02061 uint64_t children, config_cache_txg = spa->spa_config_txg; 02062 int orig_mode = spa->spa_mode; 02063 int parse; 02064 uint64_t obj; 02065 boolean_t missing_feat_write = B_FALSE; 02066 02067 /* 02068 * If this is an untrusted config, access the pool in read-only mode. 02069 * This prevents things like resilvering recently removed devices. 02070 */ 02071 if (!mosconfig) 02072 spa->spa_mode = FREAD; 02073 02074 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 02075 02076 spa->spa_load_state = state; 02077 02078 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 02079 return (EINVAL); 02080 02081 parse = (type == SPA_IMPORT_EXISTING ? 02082 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 02083 02084 /* 02085 * Create "The Godfather" zio to hold all async IOs 02086 */ 02087 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 02088 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 02089 02090 /* 02091 * Parse the configuration into a vdev tree. We explicitly set the 02092 * value that will be returned by spa_version() since parsing the 02093 * configuration requires knowing the version number. 02094 */ 02095 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 02096 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 02097 spa_config_exit(spa, SCL_ALL, FTAG); 02098 02099 if (error != 0) 02100 return (error); 02101 02102 ASSERT(spa->spa_root_vdev == rvd); 02103 02104 if (type != SPA_IMPORT_ASSEMBLE) { 02105 ASSERT(spa_guid(spa) == pool_guid); 02106 } 02107 02108 /* 02109 * Try to open all vdevs, loading each label in the process. 02110 */ 02111 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 02112 error = vdev_open(rvd); 02113 spa_config_exit(spa, SCL_ALL, FTAG); 02114 if (error != 0) 02115 return (error); 02116 02117 /* 02118 * We need to validate the vdev labels against the configuration that 02119 * we have in hand, which is dependent on the setting of mosconfig. 
If 02120 * mosconfig is true then we're validating the vdev labels based on 02121 * that config. Otherwise, we're validating against the cached config 02122 * (zpool.cache) that was read when we loaded the zfs module, and then 02123 * later we will recursively call spa_load() and validate against 02124 * the vdev config. 02125 * 02126 * If we're assembling a new pool that's been split off from an 02127 * existing pool, the labels haven't yet been updated so we skip 02128 * validation for now. 02129 */ 02130 if (type != SPA_IMPORT_ASSEMBLE) { 02131 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 02132 error = vdev_validate(rvd, mosconfig); 02133 spa_config_exit(spa, SCL_ALL, FTAG); 02134 02135 if (error != 0) 02136 return (error); 02137 02138 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 02139 return (ENXIO); 02140 } 02141 02142 /* 02143 * Find the best uberblock. 02144 */ 02145 vdev_uberblock_load(rvd, ub, &label); 02146 02147 /* 02148 * If we weren't able to find a single valid uberblock, return failure. 02149 */ 02150 if (ub->ub_txg == 0) { 02151 nvlist_free(label); 02152 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 02153 } 02154 02155 /* 02156 * If the pool has an unsupported version we can't open it. 02157 */ 02158 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 02159 nvlist_free(label); 02160 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 02161 } 02162 02163 if (ub->ub_version >= SPA_VERSION_FEATURES) { 02164 nvlist_t *features; 02165 02166 /* 02167 * If we weren't able to find what's necessary for reading the 02168 * MOS in the label, return failure. 02169 */ 02170 if (label == NULL || nvlist_lookup_nvlist(label, 02171 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 02172 nvlist_free(label); 02173 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 02174 ENXIO)); 02175 } 02176 02177 /* 02178 * Update our in-core representation with the definitive values 02179 * from the label. 02180 */ 02181 nvlist_free(spa->spa_label_features); 02182 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 02183 } 02184 02185 nvlist_free(label); 02186 02187 /* 02188 * Look through entries in the label nvlist's features_for_read. If 02189 * there is a feature listed there which we don't understand then we 02190 * cannot open a pool. 02191 */ 02192 if (ub->ub_version >= SPA_VERSION_FEATURES) { 02193 nvlist_t *unsup_feat; 02194 02195 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 02196 0); 02197 02198 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 02199 NULL); nvp != NULL; 02200 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 02201 if (!zfeature_is_supported(nvpair_name(nvp))) { 02202 VERIFY(nvlist_add_string(unsup_feat, 02203 nvpair_name(nvp), "") == 0); 02204 } 02205 } 02206 02207 if (!nvlist_empty(unsup_feat)) { 02208 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 02209 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 02210 nvlist_free(unsup_feat); 02211 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 02212 ENOTSUP)); 02213 } 02214 02215 nvlist_free(unsup_feat); 02216 } 02217 02218 /* 02219 * If the vdev guid sum doesn't match the uberblock, we have an 02220 * incomplete configuration. We first check to see if the pool 02221 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 02222 * If it is, defer the vdev_guid_sum check till later so we 02223 * can handle missing vdevs. 
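 *
 * Editorial sketch (not part of the original source): the guid sum is
 * conceptually a wrapping 64-bit sum of the guid of every vdev in the
 * tree, so a device that was present when the uberblock was written but
 * is missing now makes the two totals disagree:
 *
 *	static uint64_t
 *	guid_sum(vdev_t *vd)
 *	{
 *		uint64_t sum = vd->vdev_guid;
 *
 *		for (uint64_t c = 0; c < vd->vdev_children; c++)
 *			sum += guid_sum(vd->vdev_child[c]);
 *		return (sum);
 *	}
 *
 * The check below compares the equivalent of guid_sum(rvd) against
 * ub->ub_guid_sum, and only fails immediately when the pool config does
 * not record ZPOOL_CONFIG_VDEV_CHILDREN.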
02224 */ 02225 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 02226 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 02227 rvd->vdev_guid_sum != ub->ub_guid_sum) 02228 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 02229 02230 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 02231 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 02232 spa_try_repair(spa, config); 02233 spa_config_exit(spa, SCL_ALL, FTAG); 02234 nvlist_free(spa->spa_config_splitting); 02235 spa->spa_config_splitting = NULL; 02236 } 02237 02238 /* 02239 * Initialize internal SPA structures. 02240 */ 02241 spa->spa_state = POOL_STATE_ACTIVE; 02242 spa->spa_ubsync = spa->spa_uberblock; 02243 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 02244 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 02245 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 02246 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 02247 spa->spa_claim_max_txg = spa->spa_first_txg; 02248 spa->spa_prev_software_version = ub->ub_software_version; 02249 02250 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 02251 if (error) 02252 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02253 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 02254 02255 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 02256 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02257 02258 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 02259 boolean_t missing_feat_read = B_FALSE; 02260 nvlist_t *unsup_feat, *enabled_feat; 02261 02262 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 02263 &spa->spa_feat_for_read_obj) != 0) { 02264 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02265 } 02266 02267 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 02268 &spa->spa_feat_for_write_obj) != 0) { 02269 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02270 } 02271 02272 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 02273 &spa->spa_feat_desc_obj) != 0) { 02274 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02275 } 02276 02277 enabled_feat = fnvlist_alloc(); 02278 unsup_feat = fnvlist_alloc(); 02279 02280 if (!feature_is_supported(spa->spa_meta_objset, 02281 spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, 02282 unsup_feat, enabled_feat)) 02283 missing_feat_read = B_TRUE; 02284 02285 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 02286 if (!feature_is_supported(spa->spa_meta_objset, 02287 spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj, 02288 unsup_feat, enabled_feat)) { 02289 missing_feat_write = B_TRUE; 02290 } 02291 } 02292 02293 fnvlist_add_nvlist(spa->spa_load_info, 02294 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 02295 02296 if (!nvlist_empty(unsup_feat)) { 02297 fnvlist_add_nvlist(spa->spa_load_info, 02298 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 02299 } 02300 02301 fnvlist_free(enabled_feat); 02302 fnvlist_free(unsup_feat); 02303 02304 if (!missing_feat_read) { 02305 fnvlist_add_boolean(spa->spa_load_info, 02306 ZPOOL_CONFIG_CAN_RDONLY); 02307 } 02308 02309 /* 02310 * If the state is SPA_LOAD_TRYIMPORT, our objective is 02311 * twofold: to determine whether the pool is available for 02312 * import in read-write mode and (if it is not) whether the 02313 * pool is available for import in read-only mode. 
If the pool 02314 * is available for import in read-write mode, it is displayed 02315 * as available in userland; if it is not available for import 02316 * in read-only mode, it is displayed as unavailable in 02317 * userland. If the pool is available for import in read-only 02318 * mode but not read-write mode, it is displayed as unavailable 02319 * in userland with a special note that the pool is actually 02320 * available for open in read-only mode. 02321 * 02322 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 02323 * missing a feature for write, we must first determine whether 02324 * the pool can be opened read-only before returning to 02325 * userland in order to know whether to display the 02326 * abovementioned note. 02327 */ 02328 if (missing_feat_read || (missing_feat_write && 02329 spa_writeable(spa))) { 02330 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 02331 ENOTSUP)); 02332 } 02333 } 02334 02335 spa->spa_is_initializing = B_TRUE; 02336 error = dsl_pool_open(spa->spa_dsl_pool); 02337 spa->spa_is_initializing = B_FALSE; 02338 if (error != 0) 02339 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02340 02341 if (!mosconfig) { 02342 uint64_t hostid; 02343 nvlist_t *policy = NULL, *nvconfig; 02344 02345 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 02346 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02347 02348 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 02349 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 02350 char *hostname; 02351 unsigned long myhostid = 0; 02352 02353 VERIFY(nvlist_lookup_string(nvconfig, 02354 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 02355 02356 #ifdef _KERNEL 02357 myhostid = zone_get_hostid(NULL); 02358 #else /* _KERNEL */ 02359 /* 02360 * We're emulating the system's hostid in userland, so 02361 * we can't use zone_get_hostid(). 02362 */ 02363 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 02364 #endif /* _KERNEL */ 02365 if (check_hostid && hostid != 0 && myhostid != 0 && 02366 hostid != myhostid) { 02367 nvlist_free(nvconfig); 02368 cmn_err(CE_WARN, "pool '%s' could not be " 02369 "loaded as it was last accessed by " 02370 "another system (host: %s hostid: 0x%lx). " 02371 "See: http://illumos.org/msg/ZFS-8000-EY", 02372 spa_name(spa), hostname, 02373 (unsigned long)hostid); 02374 return (EBADF); 02375 } 02376 } 02377 if (nvlist_lookup_nvlist(spa->spa_config, 02378 ZPOOL_REWIND_POLICY, &policy) == 0) 02379 VERIFY(nvlist_add_nvlist(nvconfig, 02380 ZPOOL_REWIND_POLICY, policy) == 0); 02381 02382 spa_config_set(spa, nvconfig); 02383 spa_unload(spa); 02384 spa_deactivate(spa); 02385 spa_activate(spa, orig_mode); 02386 02387 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 02388 } 02389 02390 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 02391 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02392 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 02393 if (error != 0) 02394 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02395 02396 /* 02397 * Load the bit that tells us to use the new accounting function 02398 * (raid-z deflation). If we have an older pool, this will not 02399 * be present. 
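 *
 * Editorial sketch (hypothetical helper, not part of the original
 * source): every lookup in this stretch follows the same "absent is
 * fine, anything else is corruption" pattern, which could be written
 * once as:
 *
 *	static int
 *	spa_dir_prop_optional(spa_t *spa, const char *name, uint64_t *val)
 *	{
 *		int err = spa_dir_prop(spa, name, val);
 *
 *		return (err == ENOENT ? 0 : err);
 *	}
 *
 * Older pools simply lack these directory entries, so ENOENT must not
 * be treated as a reason to mark the vdevs corrupt.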
02400 */ 02401 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 02402 if (error != 0 && error != ENOENT) 02403 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02404 02405 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 02406 &spa->spa_creation_version); 02407 if (error != 0 && error != ENOENT) 02408 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02409 02410 /* 02411 * Load the persistent error log. If we have an older pool, this will 02412 * not be present. 02413 */ 02414 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 02415 if (error != 0 && error != ENOENT) 02416 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02417 02418 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 02419 &spa->spa_errlog_scrub); 02420 if (error != 0 && error != ENOENT) 02421 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02422 02423 /* 02424 * Load the history object. If we have an older pool, this 02425 * will not be present. 02426 */ 02427 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 02428 if (error != 0 && error != ENOENT) 02429 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02430 02431 /* 02432 * If we're assembling the pool from the split-off vdevs of 02433 * an existing pool, we don't want to attach the spares & cache 02434 * devices. 02435 */ 02436 02437 /* 02438 * Load any hot spares for this pool. 02439 */ 02440 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 02441 if (error != 0 && error != ENOENT) 02442 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02443 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 02444 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 02445 if (load_nvlist(spa, spa->spa_spares.sav_object, 02446 &spa->spa_spares.sav_config) != 0) 02447 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02448 02449 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 02450 spa_load_spares(spa); 02451 spa_config_exit(spa, SCL_ALL, FTAG); 02452 } else if (error == 0) { 02453 spa->spa_spares.sav_sync = B_TRUE; 02454 } 02455 02456 /* 02457 * Load any level 2 ARC devices for this pool. 
02458 */ 02459 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 02460 &spa->spa_l2cache.sav_object); 02461 if (error != 0 && error != ENOENT) 02462 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02463 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 02464 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 02465 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 02466 &spa->spa_l2cache.sav_config) != 0) 02467 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02468 02469 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 02470 spa_load_l2cache(spa); 02471 spa_config_exit(spa, SCL_ALL, FTAG); 02472 } else if (error == 0) { 02473 spa->spa_l2cache.sav_sync = B_TRUE; 02474 } 02475 02476 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 02477 02478 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 02479 if (error && error != ENOENT) 02480 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02481 02482 if (error == 0) { 02483 uint64_t autoreplace; 02484 02485 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 02486 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 02487 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 02488 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 02489 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 02490 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 02491 &spa->spa_dedup_ditto); 02492 02493 spa->spa_autoreplace = (autoreplace != 0); 02494 } 02495 02496 /* 02497 * If the 'autoreplace' property is set, then post a resource notifying 02498 * the ZFS DE that it should not issue any faults for unopenable 02499 * devices. We also iterate over the vdevs, and post a sysevent for any 02500 * unopenable vdevs so that the normal autoreplace handler can take 02501 * over. 02502 */ 02503 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 02504 spa_check_removed(spa->spa_root_vdev); 02505 /* 02506 * For the import case, this is done in spa_import(), because 02507 * at this point we're using the spare definitions from 02508 * the MOS config, not necessarily from the userland config. 02509 */ 02510 if (state != SPA_LOAD_IMPORT) { 02511 spa_aux_check_removed(&spa->spa_spares); 02512 spa_aux_check_removed(&spa->spa_l2cache); 02513 } 02514 } 02515 02516 /* 02517 * Load the vdev state for all toplevel vdevs. 02518 */ 02519 vdev_load(rvd); 02520 02521 /* 02522 * Propagate the leaf DTLs we just loaded all the way up the tree. 02523 */ 02524 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 02525 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 02526 spa_config_exit(spa, SCL_ALL, FTAG); 02527 02528 /* 02529 * Load the DDTs (dedup tables). 02530 */ 02531 error = ddt_load(spa); 02532 if (error != 0) 02533 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02534 02535 spa_update_dspace(spa); 02536 02537 /* 02538 * Validate the config, using the MOS config to fill in any 02539 * information which might be missing. If we fail to validate 02540 * the config then declare the pool unfit for use. If we're 02541 * assembling a pool from a split, the log is not transferred 02542 * over. 
02543 */ 02544 if (type != SPA_IMPORT_ASSEMBLE) { 02545 nvlist_t *nvconfig; 02546 02547 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 02548 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 02549 02550 if (!spa_config_valid(spa, nvconfig)) { 02551 nvlist_free(nvconfig); 02552 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 02553 ENXIO)); 02554 } 02555 nvlist_free(nvconfig); 02556 02557 /* 02558 * Now that we've validated the config, check the state of the 02559 * root vdev. If it can't be opened, it indicates one or 02560 * more toplevel vdevs are faulted. 02561 */ 02562 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 02563 return (ENXIO); 02564 02565 if (spa_check_logs(spa)) { 02566 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 02567 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 02568 } 02569 } 02570 02571 if (missing_feat_write) { 02572 ASSERT(state == SPA_LOAD_TRYIMPORT); 02573 02574 /* 02575 * At this point, we know that we can open the pool in 02576 * read-only mode but not read-write mode. We now have enough 02577 * information and can return to userland. 02578 */ 02579 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 02580 } 02581 02582 /* 02583 * We've successfully opened the pool; verify that we're ready 02584 * to start pushing transactions. 02585 */ 02586 if (state != SPA_LOAD_TRYIMPORT) { 02587 if (error = spa_load_verify(spa)) 02588 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 02589 error)); 02590 } 02591 02592 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 02593 spa->spa_load_max_txg == UINT64_MAX)) { 02594 dmu_tx_t *tx; 02595 int need_update = B_FALSE; 02596 02597 ASSERT(state != SPA_LOAD_TRYIMPORT); 02598 02599 /* 02600 * Claim log blocks that haven't been committed yet. 02601 * This must all happen in a single txg. 02602 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 02603 * invoked from zil_claim_log_block()'s i/o done callback. 02604 * Price of rollback is that we abandon the log. 02605 */ 02606 spa->spa_claiming = B_TRUE; 02607 02608 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 02609 spa_first_txg(spa)); 02610 (void) dmu_objset_find(spa_name(spa), 02611 zil_claim, tx, DS_FIND_CHILDREN); 02612 dmu_tx_commit(tx); 02613 02614 spa->spa_claiming = B_FALSE; 02615 02616 spa_set_log_state(spa, SPA_LOG_GOOD); 02617 spa->spa_sync_on = B_TRUE; 02618 txg_sync_start(spa->spa_dsl_pool); 02619 02620 /* 02621 * Wait for all claims to sync. We sync up to the highest 02622 * claimed log block birth time so that claimed log blocks 02623 * don't appear to be from the future. spa_claim_max_txg 02624 * will have been set for us by either zil_check_log_chain() 02625 * (invoked from spa_check_logs()) or zil_claim() above. 02626 */ 02627 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 02628 02629 /* 02630 * If the config cache is stale, or we have uninitialized 02631 * metaslabs (see spa_vdev_add()), then update the config. 02632 * 02633 * If this is a verbatim import, trust the current 02634 * in-core spa_config and update the disk labels. 02635 */ 02636 if (config_cache_txg != spa->spa_config_txg || 02637 state == SPA_LOAD_IMPORT || 02638 state == SPA_LOAD_RECOVER || 02639 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 02640 need_update = B_TRUE; 02641 02642 for (int c = 0; c < rvd->vdev_children; c++) 02643 if (rvd->vdev_child[c]->vdev_ms_array == 0) 02644 need_update = B_TRUE; 02645 02646 /* 02647 * Update the config cache asynchronously in case we're the 02648 * root pool, in which case the config cache isn't writable yet.
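 *
 * Editorial note (not part of the original source; the field names in
 * this sketch are assumptions about spa_impl.h): spa_async_request()
 * does no I/O itself -- it only records the work, roughly
 *
 *	mutex_enter(&spa->spa_async_lock);
 *	spa->spa_async_tasks |= SPA_ASYNC_CONFIG_UPDATE;
 *	mutex_exit(&spa->spa_async_lock);
 *
 * and the pool's async thread performs the zpool.cache write later,
 * once the filesystem holding the cache file can actually be written.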
02649 */ 02650 if (need_update) 02651 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 02652 02653 /* 02654 * Check all DTLs to see if anything needs resilvering. 02655 */ 02656 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 02657 vdev_resilver_needed(rvd, NULL, NULL)) 02658 spa_async_request(spa, SPA_ASYNC_RESILVER); 02659 02660 /* 02661 * Delete any inconsistent datasets. 02662 */ 02663 (void) dmu_objset_find(spa_name(spa), 02664 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 02665 02666 /* 02667 * Clean up any stale temporary dataset userrefs. 02668 */ 02669 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 02670 } 02671 02672 return (0); 02673 } 02674 02675 static int 02676 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 02677 { 02678 int mode = spa->spa_mode; 02679 02680 spa_unload(spa); 02681 spa_deactivate(spa); 02682 02683 spa->spa_load_max_txg--; 02684 02685 spa_activate(spa, mode); 02686 spa_async_suspend(spa); 02687 02688 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 02689 } 02690 02698 static int 02699 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 02700 uint64_t max_request, int rewind_flags) 02701 { 02702 nvlist_t *loadinfo = NULL; 02703 nvlist_t *config = NULL; 02704 int load_error, rewind_error; 02705 uint64_t safe_rewind_txg; 02706 uint64_t min_txg; 02707 02708 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 02709 spa->spa_load_max_txg = spa->spa_load_txg; 02710 spa_set_log_state(spa, SPA_LOG_CLEAR); 02711 } else { 02712 spa->spa_load_max_txg = max_request; 02713 } 02714 02715 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 02716 mosconfig); 02717 if (load_error == 0) 02718 return (0); 02719 02720 if (spa->spa_root_vdev != NULL) 02721 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 02722 02723 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 02724 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 02725 02726 if (rewind_flags & ZPOOL_NEVER_REWIND) { 02727 nvlist_free(config); 02728 return (load_error); 02729 } 02730 02731 if (state == SPA_LOAD_RECOVER) { 02732 /* Price of rolling back is discarding txgs, including log */ 02733 spa_set_log_state(spa, SPA_LOG_CLEAR); 02734 } else { 02735 /* 02736 * If we aren't rolling back save the load info from our first 02737 * import attempt so that we can restore it after attempting 02738 * to rewind. 02739 */ 02740 loadinfo = spa->spa_load_info; 02741 spa->spa_load_info = fnvlist_alloc(); 02742 } 02743 02744 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 02745 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 02746 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
02747 TXG_INITIAL : safe_rewind_txg; 02748 02749 /* 02750 * Continue as long as we're finding errors, we're still within 02751 * the acceptable rewind range, and we're still finding uberblocks 02752 */ 02753 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 02754 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 02755 if (spa->spa_load_max_txg < safe_rewind_txg) 02756 spa->spa_extreme_rewind = B_TRUE; 02757 rewind_error = spa_load_retry(spa, state, mosconfig); 02758 } 02759 02760 spa->spa_extreme_rewind = B_FALSE; 02761 spa->spa_load_max_txg = UINT64_MAX; 02762 02763 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 02764 spa_config_set(spa, config); 02765 02766 if (state == SPA_LOAD_RECOVER) { 02767 ASSERT3P(loadinfo, ==, NULL); 02768 return (rewind_error); 02769 } else { 02770 /* Store the rewind info as part of the initial load info */ 02771 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 02772 spa->spa_load_info); 02773 02774 /* Restore the initial load info */ 02775 fnvlist_free(spa->spa_load_info); 02776 spa->spa_load_info = loadinfo; 02777 02778 return (load_error); 02779 } 02780 } 02781 02794 static int 02795 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 02796 nvlist_t **config) 02797 { 02798 spa_t *spa; 02799 spa_load_state_t state = SPA_LOAD_OPEN; 02800 int error; 02801 int locked = B_FALSE; 02802 int firstopen = B_FALSE; 02803 02804 *spapp = NULL; 02805 02806 /* 02807 * As disgusting as this is, we need to support recursive calls to this 02808 * function because dsl_dir_open() is called during spa_load(), and ends 02809 * up calling spa_open() again. The real fix is to figure out how to 02810 * avoid dsl_dir_open() calling this in the first place. 02811 */ 02812 if (mutex_owner(&spa_namespace_lock) != curthread) { 02813 mutex_enter(&spa_namespace_lock); 02814 locked = B_TRUE; 02815 } 02816 02817 if ((spa = spa_lookup(pool)) == NULL) { 02818 if (locked) 02819 mutex_exit(&spa_namespace_lock); 02820 return (ENOENT); 02821 } 02822 02823 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 02824 zpool_rewind_policy_t policy; 02825 02826 firstopen = B_TRUE; 02827 02828 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 02829 &policy); 02830 if (policy.zrp_request & ZPOOL_DO_REWIND) 02831 state = SPA_LOAD_RECOVER; 02832 02833 spa_activate(spa, spa_mode_global); 02834 02835 if (state != SPA_LOAD_RECOVER) 02836 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 02837 02838 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 02839 policy.zrp_request); 02840 02841 if (error == EBADF) { 02842 /* 02843 * If vdev_validate() returns failure (indicated by 02844 * EBADF), it indicates that one of the vdevs indicates 02845 * that the pool has been exported or destroyed. If 02846 * this is the case, the config cache is out of sync and 02847 * we should remove the pool from the namespace. 02848 */ 02849 spa_unload(spa); 02850 spa_deactivate(spa); 02851 spa_config_sync(spa, B_TRUE, B_TRUE); 02852 spa_remove(spa); 02853 if (locked) 02854 mutex_exit(&spa_namespace_lock); 02855 return (ENOENT); 02856 } 02857 02858 if (error) { 02859 /* 02860 * We can't open the pool, but we still have useful 02861 * information: the state of each vdev after the 02862 * attempted vdev_open(). Return this to the user. 
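 *
 * Editorial sketch (not part of the original source): a caller that
 * receives this partial config can pull the per-load details back out
 * of it, e.g.
 *
 *	nvlist_t *info;
 *
 *	if (nvlist_lookup_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 *	    &info) == 0) {
 *		... inspect rewind data, unsupported features, etc. ...
 *	}
 *
 * which is how userland can report why the open failed and what a
 * rewind import might still be able to recover.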
02863 */ 02864 if (config != NULL && spa->spa_config) { 02865 VERIFY(nvlist_dup(spa->spa_config, config, 02866 KM_SLEEP) == 0); 02867 VERIFY(nvlist_add_nvlist(*config, 02868 ZPOOL_CONFIG_LOAD_INFO, 02869 spa->spa_load_info) == 0); 02870 } 02871 spa_unload(spa); 02872 spa_deactivate(spa); 02873 spa->spa_last_open_failed = error; 02874 if (locked) 02875 mutex_exit(&spa_namespace_lock); 02876 *spapp = NULL; 02877 return (error); 02878 } 02879 } 02880 02881 spa_open_ref(spa, tag); 02882 02883 if (config != NULL) 02884 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 02885 02886 /* 02887 * If we've recovered the pool, pass back any information we 02888 * gathered while doing the load. 02889 */ 02890 if (state == SPA_LOAD_RECOVER) { 02891 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 02892 spa->spa_load_info) == 0); 02893 } 02894 02895 if (locked) { 02896 spa->spa_last_open_failed = 0; 02897 spa->spa_last_ubsync_txg = 0; 02898 spa->spa_load_txg = 0; 02899 mutex_exit(&spa_namespace_lock); 02900 #ifdef __FreeBSD__ 02901 #ifdef _KERNEL 02902 if (firstopen) 02903 zvol_create_minors(pool); 02904 #endif 02905 #endif 02906 } 02907 02908 *spapp = spa; 02909 02910 return (0); 02911 } 02912 02913 int 02914 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 02915 nvlist_t **config) 02916 { 02917 return (spa_open_common(name, spapp, tag, policy, config)); 02918 } 02919 02920 int 02921 spa_open(const char *name, spa_t **spapp, void *tag) 02922 { 02923 return (spa_open_common(name, spapp, tag, NULL, NULL)); 02924 } 02925 02932 spa_t * 02933 spa_inject_addref(char *name) 02934 { 02935 spa_t *spa; 02936 02937 mutex_enter(&spa_namespace_lock); 02938 if ((spa = spa_lookup(name)) == NULL) { 02939 mutex_exit(&spa_namespace_lock); 02940 return (NULL); 02941 } 02942 spa->spa_inject_ref++; 02943 mutex_exit(&spa_namespace_lock); 02944 02945 return (spa); 02946 } 02947 02948 void 02949 spa_inject_delref(spa_t *spa) 02950 { 02951 mutex_enter(&spa_namespace_lock); 02952 spa->spa_inject_ref--; 02953 mutex_exit(&spa_namespace_lock); 02954 } 02955 02959 static void 02960 spa_add_spares(spa_t *spa, nvlist_t *config) 02961 { 02962 nvlist_t **spares; 02963 uint_t i, nspares; 02964 nvlist_t *nvroot; 02965 uint64_t guid; 02966 vdev_stat_t *vs; 02967 uint_t vsc; 02968 uint64_t pool; 02969 02970 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 02971 02972 if (spa->spa_spares.sav_count == 0) 02973 return; 02974 02975 VERIFY(nvlist_lookup_nvlist(config, 02976 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 02977 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 02978 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 02979 if (nspares != 0) { 02980 VERIFY(nvlist_add_nvlist_array(nvroot, 02981 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 02982 VERIFY(nvlist_lookup_nvlist_array(nvroot, 02983 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 02984 02985 /* 02986 * Go through and find any spares which have since been 02987 * repurposed as an active spare. If this is the case, update 02988 * their status appropriately. 
02989 */ 02990 for (i = 0; i < nspares; i++) { 02991 VERIFY(nvlist_lookup_uint64(spares[i], 02992 ZPOOL_CONFIG_GUID, &guid) == 0); 02993 if (spa_spare_exists(guid, &pool, NULL) && 02994 pool != 0ULL) { 02995 VERIFY(nvlist_lookup_uint64_array( 02996 spares[i], ZPOOL_CONFIG_VDEV_STATS, 02997 (uint64_t **)&vs, &vsc) == 0); 02998 vs->vs_state = VDEV_STATE_CANT_OPEN; 02999 vs->vs_aux = VDEV_AUX_SPARED; 03000 } 03001 } 03002 } 03003 } 03004 03008 static void 03009 spa_add_l2cache(spa_t *spa, nvlist_t *config) 03010 { 03011 nvlist_t **l2cache; 03012 uint_t i, j, nl2cache; 03013 nvlist_t *nvroot; 03014 uint64_t guid; 03015 vdev_t *vd; 03016 vdev_stat_t *vs; 03017 uint_t vsc; 03018 03019 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 03020 03021 if (spa->spa_l2cache.sav_count == 0) 03022 return; 03023 03024 VERIFY(nvlist_lookup_nvlist(config, 03025 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 03026 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 03027 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 03028 if (nl2cache != 0) { 03029 VERIFY(nvlist_add_nvlist_array(nvroot, 03030 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 03031 VERIFY(nvlist_lookup_nvlist_array(nvroot, 03032 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 03033 03034 /* 03035 * Update level 2 cache device stats. 03036 */ 03037 03038 for (i = 0; i < nl2cache; i++) { 03039 VERIFY(nvlist_lookup_uint64(l2cache[i], 03040 ZPOOL_CONFIG_GUID, &guid) == 0); 03041 03042 vd = NULL; 03043 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 03044 if (guid == 03045 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 03046 vd = spa->spa_l2cache.sav_vdevs[j]; 03047 break; 03048 } 03049 } 03050 ASSERT(vd != NULL); 03051 03052 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 03053 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 03054 == 0); 03055 vdev_get_stats(vd, vs); 03056 } 03057 } 03058 } 03059 03060 static void 03061 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 03062 { 03063 nvlist_t *features; 03064 zap_cursor_t zc; 03065 zap_attribute_t za; 03066 03067 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 03068 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 03069 03070 if (spa->spa_feat_for_read_obj != 0) { 03071 for (zap_cursor_init(&zc, spa->spa_meta_objset, 03072 spa->spa_feat_for_read_obj); 03073 zap_cursor_retrieve(&zc, &za) == 0; 03074 zap_cursor_advance(&zc)) { 03075 ASSERT(za.za_integer_length == sizeof (uint64_t) && 03076 za.za_num_integers == 1); 03077 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 03078 za.za_first_integer)); 03079 } 03080 zap_cursor_fini(&zc); 03081 } 03082 03083 if (spa->spa_feat_for_write_obj != 0) { 03084 for (zap_cursor_init(&zc, spa->spa_meta_objset, 03085 spa->spa_feat_for_write_obj); 03086 zap_cursor_retrieve(&zc, &za) == 0; 03087 zap_cursor_advance(&zc)) { 03088 ASSERT(za.za_integer_length == sizeof (uint64_t) && 03089 za.za_num_integers == 1); 03090 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 03091 za.za_first_integer)); 03092 } 03093 zap_cursor_fini(&zc); 03094 } 03095 03096 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 03097 features) == 0); 03098 nvlist_free(features); 03099 } 03100 03101 int 03102 spa_get_stats(const char *name, nvlist_t **config, 03103 char *altroot, size_t buflen) 03104 { 03105 int error; 03106 spa_t *spa; 03107 03108 *config = NULL; 03109 error = spa_open_common(name, &spa, FTAG, NULL, config); 03110 03111 if (spa != NULL) { 03112 /* 03113 * This still leaves a window of inconsistency where the spares 03114 * or 
l2cache devices could change and the config would be 03115 * self-inconsistent. 03116 */ 03117 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 03118 03119 if (*config != NULL) { 03120 uint64_t loadtimes[2]; 03121 03122 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 03123 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 03124 VERIFY(nvlist_add_uint64_array(*config, 03125 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 03126 03127 VERIFY(nvlist_add_uint64(*config, 03128 ZPOOL_CONFIG_ERRCOUNT, 03129 spa_get_errlog_size(spa)) == 0); 03130 03131 if (spa_suspended(spa)) 03132 VERIFY(nvlist_add_uint64(*config, 03133 ZPOOL_CONFIG_SUSPENDED, 03134 spa->spa_failmode) == 0); 03135 03136 spa_add_spares(spa, *config); 03137 spa_add_l2cache(spa, *config); 03138 spa_add_feature_stats(spa, *config); 03139 } 03140 } 03141 03142 /* 03143 * We want to get the alternate root even for faulted pools, so we cheat 03144 * and call spa_lookup() directly. 03145 */ 03146 if (altroot) { 03147 if (spa == NULL) { 03148 mutex_enter(&spa_namespace_lock); 03149 spa = spa_lookup(name); 03150 if (spa) 03151 spa_altroot(spa, altroot, buflen); 03152 else 03153 altroot[0] = '\0'; 03154 spa = NULL; 03155 mutex_exit(&spa_namespace_lock); 03156 } else { 03157 spa_altroot(spa, altroot, buflen); 03158 } 03159 } 03160 03161 if (spa != NULL) { 03162 spa_config_exit(spa, SCL_CONFIG, FTAG); 03163 spa_close(spa, FTAG); 03164 } 03165 03166 return (error); 03167 } 03168 03175 static int 03176 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 03177 spa_aux_vdev_t *sav, const char *config, uint64_t version, 03178 vdev_labeltype_t label) 03179 { 03180 nvlist_t **dev; 03181 uint_t i, ndev; 03182 vdev_t *vd; 03183 int error; 03184 03185 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 03186 03187 /* 03188 * It's acceptable to have no devs specified. 03189 */ 03190 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 03191 return (0); 03192 03193 if (ndev == 0) 03194 return (EINVAL); 03195 03196 /* 03197 * Make sure the pool is formatted with a version that supports this 03198 * device type. 03199 */ 03200 if (spa_version(spa) < version) 03201 return (ENOTSUP); 03202 03203 /* 03204 * Set the pending device list so we correctly handle device in-use 03205 * checking. 03206 */ 03207 sav->sav_pending = dev; 03208 sav->sav_npending = ndev; 03209 03210 for (i = 0; i < ndev; i++) { 03211 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 03212 mode)) != 0) 03213 goto out; 03214 03215 if (!vd->vdev_ops->vdev_op_leaf) { 03216 vdev_free(vd); 03217 error = EINVAL; 03218 goto out; 03219 } 03220 03221 /* 03222 * The L2ARC currently only supports disk devices in 03223 * kernel context. For user-level testing, we allow it. 
03224 */ 03225 #ifdef _KERNEL 03226 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 03227 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 03228 error = ENOTBLK; 03229 vdev_free(vd); 03230 goto out; 03231 } 03232 #endif 03233 vd->vdev_top = vd; 03234 03235 if ((error = vdev_open(vd)) == 0 && 03236 (error = vdev_label_init(vd, crtxg, label)) == 0) { 03237 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 03238 vd->vdev_guid) == 0); 03239 } 03240 03241 vdev_free(vd); 03242 03243 if (error && 03244 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 03245 goto out; 03246 else 03247 error = 0; 03248 } 03249 03250 out: 03251 sav->sav_pending = NULL; 03252 sav->sav_npending = 0; 03253 return (error); 03254 } 03255 03256 static int 03257 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 03258 { 03259 int error; 03260 03261 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 03262 03263 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 03264 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 03265 VDEV_LABEL_SPARE)) != 0) { 03266 return (error); 03267 } 03268 03269 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 03270 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 03271 VDEV_LABEL_L2CACHE)); 03272 } 03273 03274 static void 03275 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 03276 const char *config) 03277 { 03278 int i; 03279 03280 if (sav->sav_config != NULL) { 03281 nvlist_t **olddevs; 03282 uint_t oldndevs; 03283 nvlist_t **newdevs; 03284 03285 /* 03286 * Generate new dev list by concatenating with the 03287 * current dev list. 03288 */ 03289 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 03290 &olddevs, &oldndevs) == 0); 03291 03292 newdevs = kmem_alloc(sizeof (void *) * 03293 (ndevs + oldndevs), KM_SLEEP); 03294 for (i = 0; i < oldndevs; i++) 03295 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 03296 KM_SLEEP) == 0); 03297 for (i = 0; i < ndevs; i++) 03298 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 03299 KM_SLEEP) == 0); 03300 03301 VERIFY(nvlist_remove(sav->sav_config, config, 03302 DATA_TYPE_NVLIST_ARRAY) == 0); 03303 03304 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 03305 config, newdevs, ndevs + oldndevs) == 0); 03306 for (i = 0; i < oldndevs + ndevs; i++) 03307 nvlist_free(newdevs[i]); 03308 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 03309 } else { 03310 /* 03311 * Generate a new dev list.
03312 */ 03313 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 03314 KM_SLEEP) == 0); 03315 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 03316 devs, ndevs) == 0); 03317 } 03318 } 03319 03323 void 03324 spa_l2cache_drop(spa_t *spa) 03325 { 03326 vdev_t *vd; 03327 int i; 03328 spa_aux_vdev_t *sav = &spa->spa_l2cache; 03329 03330 for (i = 0; i < sav->sav_count; i++) { 03331 uint64_t pool; 03332 03333 vd = sav->sav_vdevs[i]; 03334 ASSERT(vd != NULL); 03335 03336 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 03337 pool != 0ULL && l2arc_vdev_present(vd)) 03338 l2arc_remove_vdev(vd); 03339 } 03340 } 03341 03345 int 03346 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 03347 const char *history_str, nvlist_t *zplprops) 03348 { 03349 spa_t *spa; 03350 char *altroot = NULL; 03351 vdev_t *rvd; 03352 dsl_pool_t *dp; 03353 dmu_tx_t *tx; 03354 int error = 0; 03355 uint64_t txg = TXG_INITIAL; 03356 nvlist_t **spares, **l2cache; 03357 uint_t nspares, nl2cache; 03358 uint64_t version, obj; 03359 boolean_t has_features; 03360 03361 /* 03362 * If this pool already exists, return failure. 03363 */ 03364 mutex_enter(&spa_namespace_lock); 03365 if (spa_lookup(pool) != NULL) { 03366 mutex_exit(&spa_namespace_lock); 03367 return (EEXIST); 03368 } 03369 03370 /* 03371 * Allocate a new spa_t structure. 03372 */ 03373 (void) nvlist_lookup_string(props, 03374 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 03375 spa = spa_add(pool, NULL, altroot); 03376 spa_activate(spa, spa_mode_global); 03377 03378 if (props && (error = spa_prop_validate(spa, props))) { 03379 spa_deactivate(spa); 03380 spa_remove(spa); 03381 mutex_exit(&spa_namespace_lock); 03382 return (error); 03383 } 03384 03385 has_features = B_FALSE; 03386 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 03387 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 03388 if (zpool_prop_feature(nvpair_name(elem))) 03389 has_features = B_TRUE; 03390 } 03391 03392 if (has_features || nvlist_lookup_uint64(props, 03393 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 03394 version = SPA_VERSION; 03395 } 03396 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 03397 03398 spa->spa_first_txg = txg; 03399 spa->spa_uberblock.ub_txg = txg - 1; 03400 spa->spa_uberblock.ub_version = version; 03401 spa->spa_ubsync = spa->spa_uberblock; 03402 03403 /* 03404 * Create "The Godfather" zio to hold all async IOs 03405 */ 03406 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 03407 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 03408 03409 /* 03410 * Create the root vdev. 03411 */ 03412 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 03413 03414 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 03415 03416 ASSERT(error != 0 || rvd != NULL); 03417 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 03418 03419 if (error == 0 && !zfs_allocatable_devs(nvroot)) 03420 error = EINVAL; 03421 03422 if (error == 0 && 03423 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 03424 (error = spa_validate_aux(spa, nvroot, txg, 03425 VDEV_ALLOC_ADD)) == 0) { 03426 for (int c = 0; c < rvd->vdev_children; c++) { 03427 vdev_metaslab_set_size(rvd->vdev_child[c]); 03428 vdev_expand(rvd->vdev_child[c], txg); 03429 } 03430 } 03431 03432 spa_config_exit(spa, SCL_ALL, FTAG); 03433 03434 if (error != 0) { 03435 spa_unload(spa); 03436 spa_deactivate(spa); 03437 spa_remove(spa); 03438 mutex_exit(&spa_namespace_lock); 03439 return (error); 03440 } 03441 03442 /* 03443 * Get the list of spares, if specified. 
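 *
 * Editorial sketch (not part of the original source; the device path is
 * a made-up example): the caller-supplied nvroot carries spares as a
 * plain nvlist array built with libnvpair, along the lines of
 *
 *	nvlist_t *sp;
 *
 *	nvlist_alloc(&sp, NV_UNIQUE_NAME, 0);
 *	nvlist_add_string(sp, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
 *	nvlist_add_string(sp, ZPOOL_CONFIG_PATH, "/dev/da4");
 *	nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &sp, 1);
 *
 * and that array is what the lookup below copies into
 * spa_spares.sav_config.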
03444 */ 03445 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 03446 &spares, &nspares) == 0) { 03447 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 03448 KM_SLEEP) == 0); 03449 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 03450 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 03451 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 03452 spa_load_spares(spa); 03453 spa_config_exit(spa, SCL_ALL, FTAG); 03454 spa->spa_spares.sav_sync = B_TRUE; 03455 } 03456 03457 /* 03458 * Get the list of level 2 cache devices, if specified. 03459 */ 03460 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 03461 &l2cache, &nl2cache) == 0) { 03462 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 03463 NV_UNIQUE_NAME, KM_SLEEP) == 0); 03464 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 03465 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 03466 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 03467 spa_load_l2cache(spa); 03468 spa_config_exit(spa, SCL_ALL, FTAG); 03469 spa->spa_l2cache.sav_sync = B_TRUE; 03470 } 03471 03472 spa->spa_is_initializing = B_TRUE; 03473 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 03474 spa->spa_meta_objset = dp->dp_meta_objset; 03475 spa->spa_is_initializing = B_FALSE; 03476 03477 /* 03478 * Create DDTs (dedup tables). 03479 */ 03480 ddt_create(spa); 03481 03482 spa_update_dspace(spa); 03483 03484 tx = dmu_tx_create_assigned(dp, txg); 03485 03486 /* 03487 * Create the pool config object. 03488 */ 03489 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 03490 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 03491 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 03492 03493 if (zap_add(spa->spa_meta_objset, 03494 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 03495 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 03496 cmn_err(CE_PANIC, "failed to add pool config"); 03497 } 03498 03499 if (spa_version(spa) >= SPA_VERSION_FEATURES) 03500 spa_feature_create_zap_objects(spa, tx); 03501 03502 if (zap_add(spa->spa_meta_objset, 03503 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 03504 sizeof (uint64_t), 1, &version, tx) != 0) { 03505 cmn_err(CE_PANIC, "failed to add pool version"); 03506 } 03507 03508 /* Newly created pools with the right version are always deflated. */ 03509 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 03510 spa->spa_deflate = TRUE; 03511 if (zap_add(spa->spa_meta_objset, 03512 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 03513 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 03514 cmn_err(CE_PANIC, "failed to add deflate"); 03515 } 03516 } 03517 03518 /* 03519 * Create the deferred-free bpobj. Turn off compression 03520 * because sync-to-convergence takes longer if the blocksize 03521 * keeps changing. 03522 */ 03523 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 03524 dmu_object_set_compress(spa->spa_meta_objset, obj, 03525 ZIO_COMPRESS_OFF, tx); 03526 if (zap_add(spa->spa_meta_objset, 03527 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 03528 sizeof (uint64_t), 1, &obj, tx) != 0) { 03529 cmn_err(CE_PANIC, "failed to add bpobj"); 03530 } 03531 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 03532 spa->spa_meta_objset, obj)); 03533 03534 /* 03535 * Create the pool's history object. 03536 */ 03537 if (version >= SPA_VERSION_ZPOOL_HISTORY) 03538 spa_history_create_obj(spa, tx); 03539 03540 /* 03541 * Set pool properties. 
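 *
 * Editorial sketch (not part of the original source): the optional
 * 'props' nvlist handled just below uses the text names from
 * zpool_prop_to_name(), so a caller wanting non-default behaviour
 * passes entries such as
 *
 *	nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1);
 *	nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 0);
 *
 * which spa_prop_validate() has already checked and spa_sync_props()
 * will persist on top of the defaults being assigned here.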
03542 */ 03543 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 03544 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 03545 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 03546 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 03547 03548 if (props != NULL) { 03549 spa_configfile_set(spa, props, B_FALSE); 03550 spa_sync_props(spa, props, tx); 03551 } 03552 03553 dmu_tx_commit(tx); 03554 03555 spa->spa_sync_on = B_TRUE; 03556 txg_sync_start(spa->spa_dsl_pool); 03557 03558 /* 03559 * We explicitly wait for the first transaction to complete so that our 03560 * bean counters are appropriately updated. 03561 */ 03562 txg_wait_synced(spa->spa_dsl_pool, txg); 03563 03564 spa_config_sync(spa, B_FALSE, B_TRUE); 03565 03566 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 03567 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 03568 spa_history_log_version(spa, LOG_POOL_CREATE); 03569 03570 spa->spa_minref = refcount_count(&spa->spa_refcount); 03571 03572 mutex_exit(&spa_namespace_lock); 03573 03574 return (0); 03575 } 03576 03577 #ifdef _KERNEL 03578 #if defined(sun) 03579 /* 03580 * Get the root pool information from the root disk, then import the root pool 03581 * during the system boot up time. 03582 */ 03583 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 03584 03585 static nvlist_t * 03586 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 03587 { 03588 nvlist_t *config; 03589 nvlist_t *nvtop, *nvroot; 03590 uint64_t pgid; 03591 03592 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 03593 return (NULL); 03594 03595 /* 03596 * Add this top-level vdev to the child array. 03597 */ 03598 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 03599 &nvtop) == 0); 03600 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 03601 &pgid) == 0); 03602 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 03603 03604 /* 03605 * Put this pool's top-level vdevs into a root vdev. 03606 */ 03607 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 03608 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 03609 VDEV_TYPE_ROOT) == 0); 03610 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 03611 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 03612 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 03613 &nvtop, 1) == 0); 03614 03615 /* 03616 * Replace the existing vdev_tree with the new root vdev in 03617 * this pool's configuration (remove the old, add the new). 03618 */ 03619 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 03620 nvlist_free(nvroot); 03621 return (config); 03622 } 03623 03629 static void 03630 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 03631 { 03632 for (int c = 0; c < vd->vdev_children; c++) 03633 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 03634 03635 if (vd->vdev_ops->vdev_op_leaf) { 03636 nvlist_t *label; 03637 uint64_t label_txg; 03638 03639 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 03640 &label) != 0) 03641 return; 03642 03643 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 03644 &label_txg) == 0); 03645 03646 /* 03647 * Do we have a better boot device? 
03648 */ 03649 if (label_txg > *txg) { 03650 *txg = label_txg; 03651 *avd = vd; 03652 } 03653 nvlist_free(label); 03654 } 03655 } 03656 03669 int 03670 spa_import_rootpool(char *devpath, char *devid) 03671 { 03672 spa_t *spa; 03673 vdev_t *rvd, *bvd, *avd = NULL; 03674 nvlist_t *config, *nvtop; 03675 uint64_t guid, txg; 03676 char *pname; 03677 int error; 03678 03679 /* 03680 * Read the label from the boot device and generate a configuration. 03681 */ 03682 config = spa_generate_rootconf(devpath, devid, &guid); 03683 #if defined(_OBP) && defined(_KERNEL) 03684 if (config == NULL) { 03685 if (strstr(devpath, "/iscsi/ssd") != NULL) { 03686 /* iscsi boot */ 03687 get_iscsi_bootpath_phy(devpath); 03688 config = spa_generate_rootconf(devpath, devid, &guid); 03689 } 03690 } 03691 #endif 03692 if (config == NULL) { 03693 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 03694 devpath); 03695 return (EIO); 03696 } 03697 03698 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 03699 &pname) == 0); 03700 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 03701 03702 mutex_enter(&spa_namespace_lock); 03703 if ((spa = spa_lookup(pname)) != NULL) { 03704 /* 03705 * Remove the existing root pool from the namespace so that we 03706 * can replace it with the correct config we just read in. 03707 */ 03708 spa_remove(spa); 03709 } 03710 03711 spa = spa_add(pname, config, NULL); 03712 spa->spa_is_root = B_TRUE; 03713 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 03714 03715 /* 03716 * Build up a vdev tree based on the boot device's label config. 03717 */ 03718 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 03719 &nvtop) == 0); 03720 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 03721 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 03722 VDEV_ALLOC_ROOTPOOL); 03723 spa_config_exit(spa, SCL_ALL, FTAG); 03724 if (error) { 03725 mutex_exit(&spa_namespace_lock); 03726 nvlist_free(config); 03727 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 03728 pname); 03729 return (error); 03730 } 03731 03732 /* 03733 * Get the boot vdev. 03734 */ 03735 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 03736 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 03737 (u_longlong_t)guid); 03738 error = ENOENT; 03739 goto out; 03740 } 03741 03742 /* 03743 * Determine if there is a better boot device. 03744 */ 03745 avd = bvd; 03746 spa_alt_rootvdev(rvd, &avd, &txg); 03747 if (avd != bvd) { 03748 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 03749 "try booting from '%s'", avd->vdev_path); 03750 error = EINVAL; 03751 goto out; 03752 } 03753 03754 /* 03755 * If the boot device is part of a spare vdev then ensure that 03756 * we're booting off the active spare. 03757 */ 03758 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 03759 !bvd->vdev_isspare) { 03760 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 03761 "try booting from '%s'", 03762 bvd->vdev_parent-> 03763 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 03764 error = EINVAL; 03765 goto out; 03766 } 03767 03768 error = 0; 03769 spa_history_log_version(spa, LOG_POOL_IMPORT); 03770 out: 03771 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 03772 vdev_free(rvd); 03773 spa_config_exit(spa, SCL_ALL, FTAG); 03774 mutex_exit(&spa_namespace_lock); 03775 03776 nvlist_free(config); 03777 return (error); 03778 } 03779 03780 #else 03781 03782 extern int 03783 vdev_geom_read_pool_label(const char *name, nvlist_t **config); 03784 03785 static nvlist_t * 03786 spa_generate_rootconf(const char *name) 03787 { 03788 nvlist_t *config; 03789 nvlist_t *nvtop, *nvroot; 03790 uint64_t pgid; 03791 03792 if (vdev_geom_read_pool_label(name, &config) != 0) 03793 return (NULL); 03794 03795 /* 03796 * Add this top-level vdev to the child array. 03797 */ 03798 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 03799 &nvtop) == 0); 03800 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 03801 &pgid) == 0); 03802 03803 /* 03804 * Put this pool's top-level vdevs into a root vdev. 03805 */ 03806 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 03807 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 03808 VDEV_TYPE_ROOT) == 0); 03809 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 03810 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 03811 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 03812 &nvtop, 1) == 0); 03813 03814 /* 03815 * Replace the existing vdev_tree with the new root vdev in 03816 * this pool's configuration (remove the old, add the new). 03817 */ 03818 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 03819 nvlist_free(nvroot); 03820 return (config); 03821 } 03822 03823 int 03824 spa_import_rootpool(const char *name) 03825 { 03826 spa_t *spa; 03827 vdev_t *rvd, *bvd, *avd = NULL; 03828 nvlist_t *config, *nvtop; 03829 uint64_t txg; 03830 char *pname; 03831 int error; 03832 03833 /* 03834 * Read the label from the boot device and generate a configuration. 03835 */ 03836 config = spa_generate_rootconf(name); 03837 if (config == NULL) { 03838 cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 03839 name); 03840 return (EIO); 03841 } 03842 03843 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 03844 &pname) == 0 && strcmp(name, pname) == 0); 03845 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 03846 03847 mutex_enter(&spa_namespace_lock); 03848 if ((spa = spa_lookup(pname)) != NULL) { 03849 /* 03850 * Remove the existing root pool from the namespace so that we 03851 * can replace it with the correct config we just read in. 03852 */ 03853 spa_remove(spa); 03854 } 03855 spa = spa_add(pname, config, NULL); 03856 spa->spa_is_root = B_TRUE; 03857 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 03858 03859 /* 03860 * Build up a vdev tree based on the boot device's label config. 
03861 */ 03862 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 03863 &nvtop) == 0); 03864 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 03865 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 03866 VDEV_ALLOC_ROOTPOOL); 03867 spa_config_exit(spa, SCL_ALL, FTAG); 03868 if (error) { 03869 mutex_exit(&spa_namespace_lock); 03870 nvlist_free(config); 03871 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 03872 pname); 03873 return (error); 03874 } 03875 03876 error = 0; 03877 spa_history_log_version(spa, LOG_POOL_IMPORT); 03878 out: 03879 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 03880 vdev_free(rvd); 03881 spa_config_exit(spa, SCL_ALL, FTAG); 03882 mutex_exit(&spa_namespace_lock); 03883 03884 return (error); 03885 } 03886 03887 #endif /* sun */ 03888 #endif 03889 03893 int 03894 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 03895 { 03896 spa_t *spa; 03897 char *altroot = NULL; 03898 spa_load_state_t state = SPA_LOAD_IMPORT; 03899 zpool_rewind_policy_t policy; 03900 uint64_t mode = spa_mode_global; 03901 uint64_t readonly = B_FALSE; 03902 int error; 03903 nvlist_t *nvroot; 03904 nvlist_t **spares, **l2cache; 03905 uint_t nspares, nl2cache; 03906 03907 /* 03908 * If a pool with this name exists, return failure. 03909 */ 03910 mutex_enter(&spa_namespace_lock); 03911 if (spa_lookup(pool) != NULL) { 03912 mutex_exit(&spa_namespace_lock); 03913 return (EEXIST); 03914 } 03915 03916 /* 03917 * Create and initialize the spa structure. 03918 */ 03919 (void) nvlist_lookup_string(props, 03920 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 03921 (void) nvlist_lookup_uint64(props, 03922 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 03923 if (readonly) 03924 mode = FREAD; 03925 spa = spa_add(pool, config, altroot); 03926 spa->spa_import_flags = flags; 03927 03928 /* 03929 * Verbatim import - Take a pool and insert it into the namespace 03930 * as if it had been loaded at boot. 03931 */ 03932 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 03933 if (props != NULL) 03934 spa_configfile_set(spa, props, B_FALSE); 03935 03936 spa_config_sync(spa, B_FALSE, B_TRUE); 03937 03938 mutex_exit(&spa_namespace_lock); 03939 spa_history_log_version(spa, LOG_POOL_IMPORT); 03940 03941 return (0); 03942 } 03943 03944 spa_activate(spa, mode); 03945 03946 /* 03947 * Don't start async tasks until we know everything is healthy. 03948 */ 03949 spa_async_suspend(spa); 03950 03951 zpool_get_rewind_policy(config, &policy); 03952 if (policy.zrp_request & ZPOOL_DO_REWIND) 03953 state = SPA_LOAD_RECOVER; 03954 03955 /* 03956 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 03957 * because the user-supplied config is actually the one to trust when 03958 * doing an import. 03959 */ 03960 if (state != SPA_LOAD_RECOVER) 03961 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 03962 03963 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 03964 policy.zrp_request); 03965 03966 /* 03967 * Propagate anything learned while loading the pool and pass it 03968 * back to caller (i.e. rewind info, missing devices, etc). 03969 */ 03970 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 03971 spa->spa_load_info) == 0); 03972 03973 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 03974 /* 03975 * Toss any existing sparelist, as it doesn't have any validity 03976 * anymore, and conflicts with spa_has_spare(). 
03977 */ 03978 if (spa->spa_spares.sav_config) { 03979 nvlist_free(spa->spa_spares.sav_config); 03980 spa->spa_spares.sav_config = NULL; 03981 spa_load_spares(spa); 03982 } 03983 if (spa->spa_l2cache.sav_config) { 03984 nvlist_free(spa->spa_l2cache.sav_config); 03985 spa->spa_l2cache.sav_config = NULL; 03986 spa_load_l2cache(spa); 03987 } 03988 03989 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 03990 &nvroot) == 0); 03991 if (error == 0) 03992 error = spa_validate_aux(spa, nvroot, -1ULL, 03993 VDEV_ALLOC_SPARE); 03994 if (error == 0) 03995 error = spa_validate_aux(spa, nvroot, -1ULL, 03996 VDEV_ALLOC_L2CACHE); 03997 spa_config_exit(spa, SCL_ALL, FTAG); 03998 03999 if (props != NULL) 04000 spa_configfile_set(spa, props, B_FALSE); 04001 04002 if (error != 0 || (props && spa_writeable(spa) && 04003 (error = spa_prop_set(spa, props)))) { 04004 spa_unload(spa); 04005 spa_deactivate(spa); 04006 spa_remove(spa); 04007 mutex_exit(&spa_namespace_lock); 04008 return (error); 04009 } 04010 04011 spa_async_resume(spa); 04012 04013 /* 04014 * Override any spares and level 2 cache devices as specified by 04015 * the user, as these may have correct device names/devids, etc. 04016 */ 04017 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 04018 &spares, &nspares) == 0) { 04019 if (spa->spa_spares.sav_config) 04020 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 04021 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 04022 else 04023 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 04024 NV_UNIQUE_NAME, KM_SLEEP) == 0); 04025 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 04026 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 04027 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 04028 spa_load_spares(spa); 04029 spa_config_exit(spa, SCL_ALL, FTAG); 04030 spa->spa_spares.sav_sync = B_TRUE; 04031 } 04032 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 04033 &l2cache, &nl2cache) == 0) { 04034 if (spa->spa_l2cache.sav_config) 04035 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 04036 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 04037 else 04038 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 04039 NV_UNIQUE_NAME, KM_SLEEP) == 0); 04040 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 04041 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 04042 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 04043 spa_load_l2cache(spa); 04044 spa_config_exit(spa, SCL_ALL, FTAG); 04045 spa->spa_l2cache.sav_sync = B_TRUE; 04046 } 04047 04048 /* 04049 * Check for any removed devices. 04050 */ 04051 if (spa->spa_autoreplace) { 04052 spa_aux_check_removed(&spa->spa_spares); 04053 spa_aux_check_removed(&spa->spa_l2cache); 04054 } 04055 04056 if (spa_writeable(spa)) { 04057 /* 04058 * Update the config cache to include the newly-imported pool. 04059 */ 04060 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 04061 } 04062 04063 /* 04064 * It's possible that the pool was expanded while it was exported. 04065 * We kick off an async task to handle this for us. 
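 *
 * Editorial note (conceptual sketch only, not the actual handler): the
 * expansion itself amounts to letting each top-level vdev claim any
 * space that appeared while the pool was exported, much as spa_create()
 * does above with
 *
 *	for (int c = 0; c < rvd->vdev_children; c++)
 *		vdev_expand(rvd->vdev_child[c], txg);
 *
 * The import path only queues the request; the async thread does the
 * actual work later.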
04066 */ 04067 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 04068 04069 mutex_exit(&spa_namespace_lock); 04070 spa_history_log_version(spa, LOG_POOL_IMPORT); 04071 04072 #ifdef __FreeBSD__ 04073 #ifdef _KERNEL 04074 zvol_create_minors(pool); 04075 #endif 04076 #endif 04077 return (0); 04078 } 04079 04080 nvlist_t * 04081 spa_tryimport(nvlist_t *tryconfig) 04082 { 04083 nvlist_t *config = NULL; 04084 char *poolname; 04085 spa_t *spa; 04086 uint64_t state; 04087 int error; 04088 04089 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 04090 return (NULL); 04091 04092 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 04093 return (NULL); 04094 04095 /* 04096 * Create and initialize the spa structure. 04097 */ 04098 mutex_enter(&spa_namespace_lock); 04099 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 04100 spa_activate(spa, FREAD); 04101 04102 /* 04103 * Pass off the heavy lifting to spa_load(). 04104 * Pass TRUE for mosconfig because the user-supplied config 04105 * is actually the one to trust when doing an import. 04106 */ 04107 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 04108 04109 /* 04110 * If 'tryconfig' was at least parsable, return the current config. 04111 */ 04112 if (spa->spa_root_vdev != NULL) { 04113 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 04114 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 04115 poolname) == 0); 04116 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 04117 state) == 0); 04118 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 04119 spa->spa_uberblock.ub_timestamp) == 0); 04120 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 04121 spa->spa_load_info) == 0); 04122 04123 /* 04124 * If the bootfs property exists on this pool then we 04125 * copy it out so that external consumers can tell which 04126 * pools are bootable. 04127 */ 04128 if ((!error || error == EEXIST) && spa->spa_bootfs) { 04129 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 04130 04131 /* 04132 * We have to play games with the name since the 04133 * pool was opened as TRYIMPORT_NAME. 04134 */ 04135 if (dsl_dsobj_to_dsname(spa_name(spa), 04136 spa->spa_bootfs, tmpname) == 0) { 04137 char *cp; 04138 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 04139 04140 cp = strchr(tmpname, '/'); 04141 if (cp == NULL) { 04142 (void) strlcpy(dsname, tmpname, 04143 MAXPATHLEN); 04144 } else { 04145 (void) snprintf(dsname, MAXPATHLEN, 04146 "%s/%s", poolname, ++cp); 04147 } 04148 VERIFY(nvlist_add_string(config, 04149 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 04150 kmem_free(dsname, MAXPATHLEN); 04151 } 04152 kmem_free(tmpname, MAXPATHLEN); 04153 } 04154 04155 /* 04156 * Add the list of hot spares and level 2 cache devices. 
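 *
 * spa_add_spares() and spa_add_l2cache() only read the aux vdev state, so
 * SCL_CONFIG is taken as reader here. An illustrative, hypothetical caller
 * of spa_tryimport() -- the returned nvlist belongs to the caller:
 *
 *	nvlist_t *cfg = spa_tryimport(tryconfig);
 *	if (cfg != NULL) {
 *		... inspect ZPOOL_CONFIG_POOL_NAME, POOL_STATE, LOAD_INFO ...
 *		nvlist_free(cfg);
 *	}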
04157 */ 04158 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 04159 spa_add_spares(spa, config); 04160 spa_add_l2cache(spa, config); 04161 spa_config_exit(spa, SCL_CONFIG, FTAG); 04162 } 04163 04164 spa_unload(spa); 04165 spa_deactivate(spa); 04166 spa_remove(spa); 04167 mutex_exit(&spa_namespace_lock); 04168 04169 return (config); 04170 } 04171 04181 static int 04182 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 04183 boolean_t force, boolean_t hardforce) 04184 { 04185 spa_t *spa; 04186 04187 if (oldconfig) 04188 *oldconfig = NULL; 04189 04190 if (!(spa_mode_global & FWRITE)) 04191 return (EROFS); 04192 04193 mutex_enter(&spa_namespace_lock); 04194 if ((spa = spa_lookup(pool)) == NULL) { 04195 mutex_exit(&spa_namespace_lock); 04196 return (ENOENT); 04197 } 04198 04199 /* 04200 * Put a hold on the pool, drop the namespace lock, stop async tasks, 04201 * reacquire the namespace lock, and see if we can export. 04202 */ 04203 spa_open_ref(spa, FTAG); 04204 mutex_exit(&spa_namespace_lock); 04205 spa_async_suspend(spa); 04206 mutex_enter(&spa_namespace_lock); 04207 spa_close(spa, FTAG); 04208 04209 /* 04210 * The pool will be in core if it's openable, 04211 * in which case we can modify its state. 04212 */ 04213 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 04214 /* 04215 * Objsets may be open only because they're dirty, so we 04216 * have to force it to sync before checking spa_refcnt. 04217 */ 04218 txg_wait_synced(spa->spa_dsl_pool, 0); 04219 04220 /* 04221 * A pool cannot be exported or destroyed if there are active 04222 * references. If we are resetting a pool, allow references by 04223 * fault injection handlers. 04224 */ 04225 if (!spa_refcount_zero(spa) || 04226 (spa->spa_inject_ref != 0 && 04227 new_state != POOL_STATE_UNINITIALIZED)) { 04228 spa_async_resume(spa); 04229 mutex_exit(&spa_namespace_lock); 04230 return (EBUSY); 04231 } 04232 04233 /* 04234 * A pool cannot be exported if it has an active shared spare. 04235 * This is to prevent other pools stealing the active spare 04236 * from an exported pool. At user's own will, such pool can 04237 * be forcedly exported. 04238 */ 04239 if (!force && new_state == POOL_STATE_EXPORTED && 04240 spa_has_active_shared_spare(spa)) { 04241 spa_async_resume(spa); 04242 mutex_exit(&spa_namespace_lock); 04243 return (EXDEV); 04244 } 04245 04246 /* 04247 * We want this to be reflected on every label, 04248 * so mark them all dirty. spa_unload() will do the 04249 * final sync that pushes these changes out. 
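 *
 * spa_final_txg is set a little past the last synced txg (TXG_DEFER_SIZE + 1
 * txgs out), presumably so the final syncs still have room to push out any
 * deferred frees; dirtying the root vdev's config ensures the EXPORTED or
 * DESTROYED state reaches every label during that final window.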
04250 */ 04251 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 04252 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 04253 spa->spa_state = new_state; 04254 spa->spa_final_txg = spa_last_synced_txg(spa) + 04255 TXG_DEFER_SIZE + 1; 04256 vdev_config_dirty(spa->spa_root_vdev); 04257 spa_config_exit(spa, SCL_ALL, FTAG); 04258 } 04259 } 04260 04261 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 04262 04263 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 04264 spa_unload(spa); 04265 spa_deactivate(spa); 04266 } 04267 04268 if (oldconfig && spa->spa_config) 04269 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 04270 04271 if (new_state != POOL_STATE_UNINITIALIZED) { 04272 if (!hardforce) 04273 spa_config_sync(spa, B_TRUE, B_TRUE); 04274 spa_remove(spa); 04275 } 04276 mutex_exit(&spa_namespace_lock); 04277 04278 return (0); 04279 } 04280 04284 int 04285 spa_destroy(char *pool) 04286 { 04287 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 04288 B_FALSE, B_FALSE)); 04289 } 04290 04294 int 04295 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 04296 boolean_t hardforce) 04297 { 04298 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 04299 force, hardforce)); 04300 } 04301 04308 int 04309 spa_reset(char *pool) 04310 { 04311 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 04312 B_FALSE, B_FALSE)); 04313 } 04314 04315 /* 04316 * ========================================================================== 04317 * Device manipulation 04318 * ========================================================================== 04319 */ 04320 04324 int 04325 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 04326 { 04327 uint64_t txg, id; 04328 int error; 04329 vdev_t *rvd = spa->spa_root_vdev; 04330 vdev_t *vd, *tvd; 04331 nvlist_t **spares, **l2cache; 04332 uint_t nspares, nl2cache; 04333 04334 ASSERT(spa_writeable(spa)); 04335 04336 txg = spa_vdev_enter(spa); 04337 04338 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 04339 VDEV_ALLOC_ADD)) != 0) 04340 return (spa_vdev_exit(spa, NULL, txg, error)); 04341 04342 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 04343 04344 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 04345 &nspares) != 0) 04346 nspares = 0; 04347 04348 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 04349 &nl2cache) != 0) 04350 nl2cache = 0; 04351 04352 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 04353 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 04354 04355 if (vd->vdev_children != 0 && 04356 (error = vdev_create(vd, txg, B_FALSE)) != 0) 04357 return (spa_vdev_exit(spa, vd, txg, error)); 04358 04359 /* 04360 * We must validate the spares and l2cache devices after checking the 04361 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 04362 */ 04363 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 04364 return (spa_vdev_exit(spa, vd, txg, error)); 04365 04366 /* 04367 * Transfer each new top-level vdev from vd to rvd. 04368 */ 04369 for (int c = 0; c < vd->vdev_children; c++) { 04370 04371 /* 04372 * Set the vdev id to the first hole, if one exists. 
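 *
 * A "hole" is a placeholder top-level vdev (vdev_hole_ops) left behind when
 * a removable top-level vdev such as a log device is pulled out, so that the
 * ids of the remaining top-level vdevs stay stable (see
 * spa_vdev_remove_from_namespace() below). Reusing the first hole slot here
 * keeps vdev ids dense instead of always appending at rvd->vdev_children.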
04373 */ 04374 for (id = 0; id < rvd->vdev_children; id++) { 04375 if (rvd->vdev_child[id]->vdev_ishole) { 04376 vdev_free(rvd->vdev_child[id]); 04377 break; 04378 } 04379 } 04380 tvd = vd->vdev_child[c]; 04381 vdev_remove_child(vd, tvd); 04382 tvd->vdev_id = id; 04383 vdev_add_child(rvd, tvd); 04384 vdev_config_dirty(tvd); 04385 } 04386 04387 if (nspares != 0) { 04388 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 04389 ZPOOL_CONFIG_SPARES); 04390 spa_load_spares(spa); 04391 spa->spa_spares.sav_sync = B_TRUE; 04392 } 04393 04394 if (nl2cache != 0) { 04395 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 04396 ZPOOL_CONFIG_L2CACHE); 04397 spa_load_l2cache(spa); 04398 spa->spa_l2cache.sav_sync = B_TRUE; 04399 } 04400 04401 /* 04402 * We have to be careful when adding new vdevs to an existing pool. 04403 * If other threads start allocating from these vdevs before we 04404 * sync the config cache, and we lose power, then upon reboot we may 04405 * fail to open the pool because there are DVAs that the config cache 04406 * can't translate. Therefore, we first add the vdevs without 04407 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 04408 * and then let spa_config_update() initialize the new metaslabs. 04409 * 04410 * spa_load() checks for added-but-not-initialized vdevs, so that 04411 * if we lose power at any point in this sequence, the remaining 04412 * steps will be completed the next time we load the pool. 04413 */ 04414 (void) spa_vdev_exit(spa, vd, txg, 0); 04415 04416 mutex_enter(&spa_namespace_lock); 04417 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 04418 mutex_exit(&spa_namespace_lock); 04419 04420 return (0); 04421 } 04422 04436 int 04437 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 04438 { 04439 uint64_t txg, dtl_max_txg; 04440 vdev_t *rvd = spa->spa_root_vdev; 04441 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 04442 vdev_ops_t *pvops; 04443 char *oldvdpath, *newvdpath; 04444 int newvd_isspare; 04445 int error; 04446 04447 ASSERT(spa_writeable(spa)); 04448 04449 txg = spa_vdev_enter(spa); 04450 04451 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 04452 04453 if (oldvd == NULL) 04454 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 04455 04456 if (!oldvd->vdev_ops->vdev_op_leaf) 04457 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 04458 04459 pvd = oldvd->vdev_parent; 04460 04461 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 04462 VDEV_ALLOC_ATTACH)) != 0) 04463 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 04464 04465 if (newrootvd->vdev_children != 1) 04466 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 04467 04468 newvd = newrootvd->vdev_child[0]; 04469 04470 if (!newvd->vdev_ops->vdev_op_leaf) 04471 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 04472 04473 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 04474 return (spa_vdev_exit(spa, newrootvd, txg, error)); 04475 04476 /* 04477 * Spares can't replace logs 04478 */ 04479 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 04480 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 04481 04482 if (!replacing) { 04483 /* 04484 * For attach, the only allowable parent is a mirror or the root 04485 * vdev. 04486 */ 04487 if (pvd->vdev_ops != &vdev_mirror_ops && 04488 pvd->vdev_ops != &vdev_root_ops) 04489 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 04490 04491 pvops = &vdev_mirror_ops; 04492 } else { 04493 /* 04494 * Active hot spares can only be replaced by inactive hot 04495 * spares. 
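 *
 * As a concrete illustration of the trees built here: replacing disk B in a
 * two-way mirror M(A,B) with disk C first produces M(A, R(B,C)), where R is
 * a replacing vdev; once C has resilvered, spa_vdev_resilver_done() (below)
 * detaches B, leaving M(A,C). If C is a hot spare, a spare vdev S(B,C) is
 * interposed instead of a replacing vdev.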
04496 */ 04497 if (pvd->vdev_ops == &vdev_spare_ops && 04498 oldvd->vdev_isspare && 04499 !spa_has_spare(spa, newvd->vdev_guid)) 04500 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 04501 04502 /* 04503 * If the source is a hot spare, and the parent isn't already a 04504 * spare, then we want to create a new hot spare. Otherwise, we 04505 * want to create a replacing vdev. The user is not allowed to 04506 * attach to a spared vdev child unless the 'isspare' state is 04507 * the same (spare replaces spare, non-spare replaces 04508 * non-spare). 04509 */ 04510 if (pvd->vdev_ops == &vdev_replacing_ops && 04511 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 04512 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 04513 } else if (pvd->vdev_ops == &vdev_spare_ops && 04514 newvd->vdev_isspare != oldvd->vdev_isspare) { 04515 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 04516 } 04517 04518 if (newvd->vdev_isspare) 04519 pvops = &vdev_spare_ops; 04520 else 04521 pvops = &vdev_replacing_ops; 04522 } 04523 04524 /* 04525 * Make sure the new device is big enough. 04526 */ 04527 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 04528 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 04529 04530 /* 04531 * The new device cannot have a higher alignment requirement 04532 * than the top-level vdev. 04533 */ 04534 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 04535 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 04536 04537 /* 04538 * If this is an in-place replacement, update oldvd's path and devid 04539 * to make it distinguishable from newvd, and unopenable from now on. 04540 */ 04541 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 04542 spa_strfree(oldvd->vdev_path); 04543 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 04544 KM_SLEEP); 04545 (void) sprintf(oldvd->vdev_path, "%s/%s", 04546 newvd->vdev_path, "old"); 04547 if (oldvd->vdev_devid != NULL) { 04548 spa_strfree(oldvd->vdev_devid); 04549 oldvd->vdev_devid = NULL; 04550 } 04551 } 04552 04553 /* mark the device being resilvered */ 04554 newvd->vdev_resilvering = B_TRUE; 04555 04556 /* 04557 * If the parent is not a mirror, or if we're replacing, insert the new 04558 * mirror/replacing/spare vdev above oldvd. 04559 */ 04560 if (pvd->vdev_ops != pvops) 04561 pvd = vdev_add_parent(oldvd, pvops); 04562 04563 ASSERT(pvd->vdev_top->vdev_parent == rvd); 04564 ASSERT(pvd->vdev_ops == pvops); 04565 ASSERT(oldvd->vdev_parent == pvd); 04566 04567 /* 04568 * Extract the new device from its root and add it to pvd. 04569 */ 04570 vdev_remove_child(newrootvd, newvd); 04571 newvd->vdev_id = pvd->vdev_children; 04572 newvd->vdev_crtxg = oldvd->vdev_crtxg; 04573 vdev_add_child(pvd, newvd); 04574 04575 tvd = newvd->vdev_top; 04576 ASSERT(pvd->vdev_top == tvd); 04577 ASSERT(tvd->vdev_parent == rvd); 04578 04579 vdev_config_dirty(tvd); 04580 04581 /* 04582 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 04583 * for any dmu_sync-ed blocks. It will propagate upward when 04584 * spa_vdev_exit() calls vdev_dtl_reassess(). 
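 *
 * The DTL is the vdev's "dirty time log": a per-vdev record of the txg
 * ranges for which that vdev is known to be missing data. Marking
 * [TXG_INITIAL, dtl_max_txg) as DTL_MISSING effectively tells the scan code
 * that the whole device needs to be resilvered; dtl_max_txg is also the txg
 * passed to dsl_resilver_restart() below.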
04585 */ 04586 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 04587 04588 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 04589 dtl_max_txg - TXG_INITIAL); 04590 04591 if (newvd->vdev_isspare) { 04592 spa_spare_activate(newvd); 04593 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 04594 } 04595 04596 oldvdpath = spa_strdup(oldvd->vdev_path); 04597 newvdpath = spa_strdup(newvd->vdev_path); 04598 newvd_isspare = newvd->vdev_isspare; 04599 04600 /* 04601 * Mark newvd's DTL dirty in this txg. 04602 */ 04603 vdev_dirty(tvd, VDD_DTL, newvd, txg); 04604 04605 /* 04606 * Restart the resilver 04607 */ 04608 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 04609 04610 /* 04611 * Commit the config 04612 */ 04613 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 04614 04615 spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, 04616 "%s vdev=%s %s vdev=%s", 04617 replacing && newvd_isspare ? "spare in" : 04618 replacing ? "replace" : "attach", newvdpath, 04619 replacing ? "for" : "to", oldvdpath); 04620 04621 spa_strfree(oldvdpath); 04622 spa_strfree(newvdpath); 04623 04624 if (spa->spa_bootfs) 04625 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 04626 04627 return (0); 04628 } 04629 04636 int 04637 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 04638 { 04639 uint64_t txg; 04640 int error; 04641 vdev_t *rvd = spa->spa_root_vdev; 04642 vdev_t *vd, *pvd, *cvd, *tvd; 04643 boolean_t unspare = B_FALSE; 04644 uint64_t unspare_guid; 04645 char *vdpath; 04646 04647 ASSERT(spa_writeable(spa)); 04648 04649 txg = spa_vdev_enter(spa); 04650 04651 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 04652 04653 if (vd == NULL) 04654 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 04655 04656 if (!vd->vdev_ops->vdev_op_leaf) 04657 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 04658 04659 pvd = vd->vdev_parent; 04660 04661 /* 04662 * If the parent/child relationship is not as expected, don't do it. 04663 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 04664 * vdev that's replacing B with C. The user's intent in replacing 04665 * is to go from M(A,B) to M(A,C). If the user decides to cancel 04666 * the replace by detaching C, the expected behavior is to end up 04667 * M(A,B). But suppose that right after deciding to detach C, 04668 * the replacement of B completes. We would have M(A,C), and then 04669 * ask to detach C, which would leave us with just A -- not what 04670 * the user wanted. To prevent this, we make sure that the 04671 * parent/child relationship hasn't changed -- in this example, 04672 * that C's parent is still the replacing vdev R. 04673 */ 04674 if (pvd->vdev_guid != pguid && pguid != 0) 04675 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 04676 04677 /* 04678 * Only 'replacing' or 'spare' vdevs can be replaced. 04679 */ 04680 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 04681 pvd->vdev_ops != &vdev_spare_ops) 04682 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 04683 04684 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 04685 spa_version(spa) >= SPA_VERSION_SPARES); 04686 04687 /* 04688 * Only mirror, replacing, and spare vdevs support detach. 04689 */ 04690 if (pvd->vdev_ops != &vdev_replacing_ops && 04691 pvd->vdev_ops != &vdev_mirror_ops && 04692 pvd->vdev_ops != &vdev_spare_ops) 04693 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 04694 04695 /* 04696 * If this device has the only valid copy of some data, 04697 * we cannot safely detach it. 
04698 */ 04699 if (vdev_dtl_required(vd)) 04700 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 04701 04702 ASSERT(pvd->vdev_children >= 2); 04703 04704 /* 04705 * If we are detaching the second disk from a replacing vdev, then 04706 * check to see if we changed the original vdev's path to have "/old" 04707 * at the end in spa_vdev_attach(). If so, undo that change now. 04708 */ 04709 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 04710 vd->vdev_path != NULL) { 04711 size_t len = strlen(vd->vdev_path); 04712 04713 for (int c = 0; c < pvd->vdev_children; c++) { 04714 cvd = pvd->vdev_child[c]; 04715 04716 if (cvd == vd || cvd->vdev_path == NULL) 04717 continue; 04718 04719 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 04720 strcmp(cvd->vdev_path + len, "/old") == 0) { 04721 spa_strfree(cvd->vdev_path); 04722 cvd->vdev_path = spa_strdup(vd->vdev_path); 04723 break; 04724 } 04725 } 04726 } 04727 04728 /* 04729 * If we are detaching the original disk from a spare, then it implies 04730 * that the spare should become a real disk, and be removed from the 04731 * active spare list for the pool. 04732 */ 04733 if (pvd->vdev_ops == &vdev_spare_ops && 04734 vd->vdev_id == 0 && 04735 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 04736 unspare = B_TRUE; 04737 04738 /* 04739 * Erase the disk labels so the disk can be used for other things. 04740 * This must be done after all other error cases are handled, 04741 * but before we disembowel vd (so we can still do I/O to it). 04742 * But if we can't do it, don't treat the error as fatal -- 04743 * it may be that the unwritability of the disk is the reason 04744 * it's being detached! 04745 */ 04746 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 04747 04748 /* 04749 * Remove vd from its parent and compact the parent's children. 04750 */ 04751 vdev_remove_child(pvd, vd); 04752 vdev_compact_children(pvd); 04753 04754 /* 04755 * Remember one of the remaining children so we can get tvd below. 04756 */ 04757 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 04758 04759 /* 04760 * If we need to remove the remaining child from the list of hot spares, 04761 * do it now, marking the vdev as no longer a spare in the process. 04762 * We must do this before vdev_remove_parent(), because that can 04763 * change the GUID if it creates a new toplevel GUID. For a similar 04764 * reason, we must remove the spare now, in the same txg as the detach; 04765 * otherwise someone could attach a new sibling, change the GUID, and 04766 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 04767 */ 04768 if (unspare) { 04769 ASSERT(cvd->vdev_isspare); 04770 spa_spare_remove(cvd); 04771 unspare_guid = cvd->vdev_guid; 04772 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 04773 cvd->vdev_unspare = B_TRUE; 04774 } 04775 04776 /* 04777 * If the parent mirror/replacing vdev only has one child, 04778 * the parent is no longer needed. Remove it from the tree. 04779 */ 04780 if (pvd->vdev_children == 1) { 04781 if (pvd->vdev_ops == &vdev_spare_ops) 04782 cvd->vdev_unspare = B_FALSE; 04783 vdev_remove_parent(cvd); 04784 cvd->vdev_resilvering = B_FALSE; 04785 } 04786 04787 04788 /* 04789 * We don't set tvd until now because the parent we just removed 04790 * may have been the previous top-level vdev. 04791 */ 04792 tvd = cvd->vdev_top; 04793 ASSERT(tvd->vdev_parent == rvd); 04794 04795 /* 04796 * Reevaluate the parent vdev state. 
04797 */ 04798 vdev_propagate_state(cvd); 04799 04800 /* 04801 * If the 'autoexpand' property is set on the pool then automatically 04802 * try to expand the size of the pool. For example if the device we 04803 * just detached was smaller than the others, it may be possible to 04804 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 04805 * first so that we can obtain the updated sizes of the leaf vdevs. 04806 */ 04807 if (spa->spa_autoexpand) { 04808 vdev_reopen(tvd); 04809 vdev_expand(tvd, txg); 04810 } 04811 04812 vdev_config_dirty(tvd); 04813 04814 /* 04815 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 04816 * vd->vdev_detached is set and free vd's DTL object in syncing context. 04817 * But first make sure we're not on any *other* txg's DTL list, to 04818 * prevent vd from being accessed after it's freed. 04819 */ 04820 vdpath = spa_strdup(vd->vdev_path); 04821 for (int t = 0; t < TXG_SIZE; t++) 04822 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 04823 vd->vdev_detached = B_TRUE; 04824 vdev_dirty(tvd, VDD_DTL, vd, txg); 04825 04826 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 04827 04828 /* hang on to the spa before we release the lock */ 04829 spa_open_ref(spa, FTAG); 04830 04831 error = spa_vdev_exit(spa, vd, txg, 0); 04832 04833 spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, 04834 "vdev=%s", vdpath); 04835 spa_strfree(vdpath); 04836 04837 /* 04838 * If this was the removal of the original device in a hot spare vdev, 04839 * then we want to go through and remove the device from the hot spare 04840 * list of every other pool. 04841 */ 04842 if (unspare) { 04843 spa_t *altspa = NULL; 04844 04845 mutex_enter(&spa_namespace_lock); 04846 while ((altspa = spa_next(altspa)) != NULL) { 04847 if (altspa->spa_state != POOL_STATE_ACTIVE || 04848 altspa == spa) 04849 continue; 04850 04851 spa_open_ref(altspa, FTAG); 04852 mutex_exit(&spa_namespace_lock); 04853 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 04854 mutex_enter(&spa_namespace_lock); 04855 spa_close(altspa, FTAG); 04856 } 04857 mutex_exit(&spa_namespace_lock); 04858 04859 /* search the rest of the vdevs for spares to remove */ 04860 spa_vdev_resilver_done(spa); 04861 } 04862 04863 /* all done with the spa; OK to release */ 04864 mutex_enter(&spa_namespace_lock); 04865 spa_close(spa, FTAG); 04866 mutex_exit(&spa_namespace_lock); 04867 04868 return (error); 04869 } 04870 04874 int 04875 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 04876 nvlist_t *props, boolean_t exp) 04877 { 04878 int error = 0; 04879 uint64_t txg, *glist; 04880 spa_t *newspa; 04881 uint_t c, children, lastlog; 04882 nvlist_t **child, *nvl, *tmp; 04883 dmu_tx_t *tx; 04884 char *altroot = NULL; 04885 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 04886 boolean_t activate_slog; 04887 04888 ASSERT(spa_writeable(spa)); 04889 04890 txg = spa_vdev_enter(spa); 04891 04892 /* clear the log and flush everything up to now */ 04893 activate_slog = spa_passivate_log(spa); 04894 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 04895 error = spa_offline_log(spa); 04896 txg = spa_vdev_config_enter(spa); 04897 04898 if (activate_slog) 04899 spa_activate_log(spa); 04900 04901 if (error != 0) 04902 return (spa_vdev_exit(spa, NULL, txg, error)); 04903 04904 /* check new spa name before going any further */ 04905 if (spa_lookup(newname) != NULL) 04906 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 04907 04908 /* 04909 * scan through all the children to ensure they're all mirrors 04910 */ 
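/*
 * Illustrative note on the expected input: the caller-supplied 'config'
 * carries a ZPOOL_CONFIG_VDEV_TREE whose ZPOOL_CONFIG_CHILDREN array names
 * one leaf per top-level mirror -- the disk that should move to the new
 * pool.  For a hypothetical pool with two mirrors, roughly:
 *
 *	vdev_tree
 *	    children[0] = { GUID of the leaf split from mirror-0 }
 *	    children[1] = { GUID of the leaf split from mirror-1 }
 *
 * The checks below enforce that count (ignoring logs and holes), reject any
 * spare or cache devices in the request, and verify that each named leaf is
 * a healthy, writeable child of a mirror in the matching top-level slot.
 */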
04911 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 04912 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 04913 &children) != 0) 04914 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 04915 04916 /* first, check to ensure we've got the right child count */ 04917 rvd = spa->spa_root_vdev; 04918 lastlog = 0; 04919 for (c = 0; c < rvd->vdev_children; c++) { 04920 vdev_t *vd = rvd->vdev_child[c]; 04921 04922 /* don't count the holes & logs as children */ 04923 if (vd->vdev_islog || vd->vdev_ishole) { 04924 if (lastlog == 0) 04925 lastlog = c; 04926 continue; 04927 } 04928 04929 lastlog = 0; 04930 } 04931 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 04932 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 04933 04934 /* next, ensure no spare or cache devices are part of the split */ 04935 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 04936 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 04937 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 04938 04939 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 04940 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 04941 04942 /* then, loop over each vdev and validate it */ 04943 for (c = 0; c < children; c++) { 04944 uint64_t is_hole = 0; 04945 04946 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 04947 &is_hole); 04948 04949 if (is_hole != 0) { 04950 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 04951 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 04952 continue; 04953 } else { 04954 error = EINVAL; 04955 break; 04956 } 04957 } 04958 04959 /* which disk is going to be split? */ 04960 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 04961 &glist[c]) != 0) { 04962 error = EINVAL; 04963 break; 04964 } 04965 04966 /* look it up in the spa */ 04967 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 04968 if (vml[c] == NULL) { 04969 error = ENODEV; 04970 break; 04971 } 04972 04973 /* make sure there's nothing stopping the split */ 04974 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 04975 vml[c]->vdev_islog || 04976 vml[c]->vdev_ishole || 04977 vml[c]->vdev_isspare || 04978 vml[c]->vdev_isl2cache || 04979 !vdev_writeable(vml[c]) || 04980 vml[c]->vdev_children != 0 || 04981 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 04982 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 04983 error = EINVAL; 04984 break; 04985 } 04986 04987 if (vdev_dtl_required(vml[c])) { 04988 error = EBUSY; 04989 break; 04990 } 04991 04992 /* we need certain info from the top level */ 04993 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 04994 vml[c]->vdev_top->vdev_ms_array) == 0); 04995 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 04996 vml[c]->vdev_top->vdev_ms_shift) == 0); 04997 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 04998 vml[c]->vdev_top->vdev_asize) == 0); 04999 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 05000 vml[c]->vdev_top->vdev_ashift) == 0); 05001 } 05002 05003 if (error != 0) { 05004 kmem_free(vml, children * sizeof (vdev_t *)); 05005 kmem_free(glist, children * sizeof (uint64_t)); 05006 return (spa_vdev_exit(spa, NULL, txg, error)); 05007 } 05008 05009 /* stop writers from using the disks */ 05010 for (c = 0; c < children; c++) { 05011 if (vml[c] != NULL) 05012 vml[c]->vdev_offline = B_TRUE; 05013 } 05014 vdev_reopen(spa->spa_root_vdev); 05015 05016 /* 05017 * Temporarily record the splitting vdevs in the spa config. 
This 05018 * will disappear once the config is regenerated. 05019 */ 05020 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 05021 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 05022 glist, children) == 0); 05023 kmem_free(glist, children * sizeof (uint64_t)); 05024 05025 mutex_enter(&spa->spa_props_lock); 05026 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 05027 nvl) == 0); 05028 mutex_exit(&spa->spa_props_lock); 05029 spa->spa_config_splitting = nvl; 05030 vdev_config_dirty(spa->spa_root_vdev); 05031 05032 /* configure and create the new pool */ 05033 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 05034 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 05035 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 05036 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 05037 spa_version(spa)) == 0); 05038 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 05039 spa->spa_config_txg) == 0); 05040 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 05041 spa_generate_guid(NULL)) == 0); 05042 (void) nvlist_lookup_string(props, 05043 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 05044 05045 /* add the new pool to the namespace */ 05046 newspa = spa_add(newname, config, altroot); 05047 newspa->spa_config_txg = spa->spa_config_txg; 05048 spa_set_log_state(newspa, SPA_LOG_CLEAR); 05049 05050 /* release the spa config lock, retaining the namespace lock */ 05051 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 05052 05053 if (zio_injection_enabled) 05054 zio_handle_panic_injection(spa, FTAG, 1); 05055 05056 spa_activate(newspa, spa_mode_global); 05057 spa_async_suspend(newspa); 05058 05059 #ifndef sun 05060 /* mark that we are creating new spa by splitting */ 05061 newspa->spa_splitting_newspa = B_TRUE; 05062 #endif 05063 /* create the new pool from the disks of the original pool */ 05064 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 05065 #ifndef sun 05066 newspa->spa_splitting_newspa = B_FALSE; 05067 #endif 05068 if (error) 05069 goto out; 05070 05071 /* if that worked, generate a real config for the new pool */ 05072 if (newspa->spa_root_vdev != NULL) { 05073 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 05074 NV_UNIQUE_NAME, KM_SLEEP) == 0); 05075 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 05076 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 05077 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 05078 B_TRUE)); 05079 } 05080 05081 /* set the props */ 05082 if (props != NULL) { 05083 spa_configfile_set(newspa, props, B_FALSE); 05084 error = spa_prop_set(newspa, props); 05085 if (error) 05086 goto out; 05087 } 05088 05089 /* flush everything */ 05090 txg = spa_vdev_config_enter(newspa); 05091 vdev_config_dirty(newspa->spa_root_vdev); 05092 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 05093 05094 if (zio_injection_enabled) 05095 zio_handle_panic_injection(spa, FTAG, 2); 05096 05097 spa_async_resume(newspa); 05098 05099 /* finally, update the original pool's config */ 05100 txg = spa_vdev_config_enter(spa); 05101 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 05102 error = dmu_tx_assign(tx, TXG_WAIT); 05103 if (error != 0) 05104 dmu_tx_abort(tx); 05105 for (c = 0; c < children; c++) { 05106 if (vml[c] != NULL) { 05107 vdev_split(vml[c]); 05108 if (error == 0) 05109 spa_history_log_internal(LOG_POOL_VDEV_DETACH, 05110 spa, tx, "vdev=%s", 05111 vml[c]->vdev_path); 05112 vdev_free(vml[c]); 05113 } 05114 } 05115 
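/*
 * The dmu_tx created above is only needed so the per-vdev detach events can
 * be recorded in the pool history; if dmu_tx_assign() failed, the tx was
 * aborted and the history entries are skipped, but the split itself still
 * proceeds -- the tx is committed below only when error == 0, and the
 * config change goes out through the usual spa_vdev_exit() path.
 */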
vdev_config_dirty(spa->spa_root_vdev); 05116 spa->spa_config_splitting = NULL; 05117 nvlist_free(nvl); 05118 if (error == 0) 05119 dmu_tx_commit(tx); 05120 (void) spa_vdev_exit(spa, NULL, txg, 0); 05121 05122 if (zio_injection_enabled) 05123 zio_handle_panic_injection(spa, FTAG, 3); 05124 05125 /* split is complete; log a history record */ 05126 spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, 05127 "split new pool %s from pool %s", newname, spa_name(spa)); 05128 05129 kmem_free(vml, children * sizeof (vdev_t *)); 05130 05131 /* if we're not going to mount the filesystems in userland, export */ 05132 if (exp) 05133 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 05134 B_FALSE, B_FALSE); 05135 05136 return (error); 05137 05138 out: 05139 spa_unload(newspa); 05140 spa_deactivate(newspa); 05141 spa_remove(newspa); 05142 05143 txg = spa_vdev_config_enter(spa); 05144 05145 /* re-online all offlined disks */ 05146 for (c = 0; c < children; c++) { 05147 if (vml[c] != NULL) 05148 vml[c]->vdev_offline = B_FALSE; 05149 } 05150 vdev_reopen(spa->spa_root_vdev); 05151 05152 nvlist_free(spa->spa_config_splitting); 05153 spa->spa_config_splitting = NULL; 05154 (void) spa_vdev_exit(spa, NULL, txg, error); 05155 05156 kmem_free(vml, children * sizeof (vdev_t *)); 05157 return (error); 05158 } 05159 05160 static nvlist_t * 05161 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 05162 { 05163 for (int i = 0; i < count; i++) { 05164 uint64_t guid; 05165 05166 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 05167 &guid) == 0); 05168 05169 if (guid == target_guid) 05170 return (nvpp[i]); 05171 } 05172 05173 return (NULL); 05174 } 05175 05176 static void 05177 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 05178 nvlist_t *dev_to_remove) 05179 { 05180 nvlist_t **newdev = NULL; 05181 05182 if (count > 1) 05183 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 05184 05185 for (int i = 0, j = 0; i < count; i++) { 05186 if (dev[i] == dev_to_remove) 05187 continue; 05188 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 05189 } 05190 05191 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 05192 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 05193 05194 for (int i = 0; i < count - 1; i++) 05195 nvlist_free(newdev[i]); 05196 05197 if (count > 1) 05198 kmem_free(newdev, (count - 1) * sizeof (void *)); 05199 } 05200 05204 static int 05205 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 05206 { 05207 uint64_t txg; 05208 int error = 0; 05209 05210 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 05211 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 05212 ASSERT(vd == vd->vdev_top); 05213 05214 /* 05215 * Evacuate the device. We don't hold the config lock as writer 05216 * since we need to do I/O but we do keep the 05217 * spa_namespace_lock held. Once this completes the device 05218 * should no longer have any blocks allocated on it. 05219 */ 05220 if (vd->vdev_islog) { 05221 if (vd->vdev_stat.vs_alloc != 0) 05222 error = spa_offline_log(spa); 05223 } else { 05224 error = ENOTSUP; 05225 } 05226 05227 if (error) 05228 return (error); 05229 05230 /* 05231 * The evacuation succeeded. Remove any remaining MOS metadata 05232 * associated with this vdev, and wait for these changes to sync. 
05233 */ 05234 ASSERT0(vd->vdev_stat.vs_alloc); 05235 txg = spa_vdev_config_enter(spa); 05236 vd->vdev_removing = B_TRUE; 05237 vdev_dirty(vd, 0, NULL, txg); 05238 vdev_config_dirty(vd); 05239 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 05240 05241 return (0); 05242 } 05243 05247 static void 05248 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 05249 { 05250 vdev_t *rvd = spa->spa_root_vdev; 05251 uint64_t id = vd->vdev_id; 05252 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 05253 05254 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 05255 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 05256 ASSERT(vd == vd->vdev_top); 05257 05258 /* 05259 * Only remove any devices which are empty. 05260 */ 05261 if (vd->vdev_stat.vs_alloc != 0) 05262 return; 05263 05264 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 05265 05266 if (list_link_active(&vd->vdev_state_dirty_node)) 05267 vdev_state_clean(vd); 05268 if (list_link_active(&vd->vdev_config_dirty_node)) 05269 vdev_config_clean(vd); 05270 05271 vdev_free(vd); 05272 05273 if (last_vdev) { 05274 vdev_compact_children(rvd); 05275 } else { 05276 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 05277 vdev_add_child(rvd, vd); 05278 } 05279 vdev_config_dirty(rvd); 05280 05281 /* 05282 * Reassess the health of our root vdev. 05283 */ 05284 vdev_reopen(rvd); 05285 } 05286 05299 int 05300 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 05301 { 05302 vdev_t *vd; 05303 metaslab_group_t *mg; 05304 nvlist_t **spares, **l2cache, *nv; 05305 uint64_t txg = 0; 05306 uint_t nspares, nl2cache; 05307 int error = 0; 05308 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 05309 05310 ASSERT(spa_writeable(spa)); 05311 05312 if (!locked) 05313 txg = spa_vdev_enter(spa); 05314 05315 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 05316 05317 if (spa->spa_spares.sav_vdevs != NULL && 05318 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 05319 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 05320 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 05321 /* 05322 * Only remove the hot spare if it's not currently in use 05323 * in this pool. 05324 */ 05325 if (vd == NULL || unspare) { 05326 spa_vdev_remove_aux(spa->spa_spares.sav_config, 05327 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 05328 spa_load_spares(spa); 05329 spa->spa_spares.sav_sync = B_TRUE; 05330 } else { 05331 error = EBUSY; 05332 } 05333 } else if (spa->spa_l2cache.sav_vdevs != NULL && 05334 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 05335 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 05336 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 05337 /* 05338 * Cache devices can always be removed. 05339 */ 05340 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 05341 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 05342 spa_load_l2cache(spa); 05343 spa->spa_l2cache.sav_sync = B_TRUE; 05344 } else if (vd != NULL && vd->vdev_islog) { 05345 ASSERT(!locked); 05346 ASSERT(vd == vd->vdev_top); 05347 05348 /* 05349 * XXX - Once we have bp-rewrite this should 05350 * become the common case. 05351 */ 05352 05353 mg = vd->vdev_mg; 05354 05355 /* 05356 * Stop allocating from this vdev. 05357 */ 05358 metaslab_group_passivate(mg); 05359 05360 /* 05361 * Wait for the youngest allocations and frees to sync, 05362 * and then wait for the deferral of those frees to finish. 05363 */ 05364 spa_vdev_config_exit(spa, NULL, 05365 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 05366 05367 /* 05368 * Attempt to evacuate the vdev. 
05369 */ 05370 error = spa_vdev_remove_evacuate(spa, vd); 05371 05372 txg = spa_vdev_config_enter(spa); 05373 05374 /* 05375 * If we couldn't evacuate the vdev, unwind. 05376 */ 05377 if (error) { 05378 metaslab_group_activate(mg); 05379 return (spa_vdev_exit(spa, NULL, txg, error)); 05380 } 05381 05382 /* 05383 * Clean up the vdev namespace. 05384 */ 05385 spa_vdev_remove_from_namespace(spa, vd); 05386 05387 } else if (vd != NULL) { 05388 /* 05389 * Normal vdevs cannot be removed (yet). 05390 */ 05391 error = ENOTSUP; 05392 } else { 05393 /* 05394 * There is no vdev of any kind with the specified guid. 05395 */ 05396 error = ENOENT; 05397 } 05398 05399 if (!locked) 05400 return (spa_vdev_exit(spa, NULL, txg, error)); 05401 05402 return (error); 05403 } 05404 05409 static vdev_t * 05410 spa_vdev_resilver_done_hunt(vdev_t *vd) 05411 { 05412 vdev_t *newvd, *oldvd; 05413 05414 for (int c = 0; c < vd->vdev_children; c++) { 05415 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 05416 if (oldvd != NULL) 05417 return (oldvd); 05418 } 05419 05420 /* 05421 * Check for a completed replacement. We always consider the first 05422 * vdev in the list to be the oldest vdev, and the last one to be 05423 * the newest (see spa_vdev_attach() for how that works). In 05424 * the case where the newest vdev is faulted, we will not automatically 05425 * remove it after a resilver completes. This is OK as it will require 05426 * user intervention to determine which disk the admin wishes to keep. 05427 */ 05428 if (vd->vdev_ops == &vdev_replacing_ops) { 05429 ASSERT(vd->vdev_children > 1); 05430 05431 newvd = vd->vdev_child[vd->vdev_children - 1]; 05432 oldvd = vd->vdev_child[0]; 05433 05434 if (vdev_dtl_empty(newvd, DTL_MISSING) && 05435 vdev_dtl_empty(newvd, DTL_OUTAGE) && 05436 !vdev_dtl_required(oldvd)) 05437 return (oldvd); 05438 } 05439 05440 /* 05441 * Check for a completed resilver with the 'unspare' flag set. 05442 */ 05443 if (vd->vdev_ops == &vdev_spare_ops) { 05444 vdev_t *first = vd->vdev_child[0]; 05445 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 05446 05447 if (last->vdev_unspare) { 05448 oldvd = first; 05449 newvd = last; 05450 } else if (first->vdev_unspare) { 05451 oldvd = last; 05452 newvd = first; 05453 } else { 05454 oldvd = NULL; 05455 } 05456 05457 if (oldvd != NULL && 05458 vdev_dtl_empty(newvd, DTL_MISSING) && 05459 vdev_dtl_empty(newvd, DTL_OUTAGE) && 05460 !vdev_dtl_required(oldvd)) 05461 return (oldvd); 05462 05463 /* 05464 * If there are more than two spares attached to a disk, 05465 * and those spares are not required, then we want to 05466 * attempt to free them up now so that they can be used 05467 * by other pools. Once we're back down to a single 05468 * disk+spare, we stop removing them. 
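 *
 * For example, a spare vdev S(disk, spareA, spareB) that has finished
 * resilvering can hand spareA (child[1]) back to the spare pool as long as
 * the last spare has no missing data and spareA itself is not required; a
 * later pass of spa_vdev_resilver_done() then sees S(disk, spareB) and
 * stops removing children.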
05469 */ 05470 if (vd->vdev_children > 2) { 05471 newvd = vd->vdev_child[1]; 05472 05473 if (newvd->vdev_isspare && last->vdev_isspare && 05474 vdev_dtl_empty(last, DTL_MISSING) && 05475 vdev_dtl_empty(last, DTL_OUTAGE) && 05476 !vdev_dtl_required(newvd)) 05477 return (newvd); 05478 } 05479 } 05480 05481 return (NULL); 05482 } 05483 05484 static void 05485 spa_vdev_resilver_done(spa_t *spa) 05486 { 05487 vdev_t *vd, *pvd, *ppvd; 05488 uint64_t guid, sguid, pguid, ppguid; 05489 05490 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 05491 05492 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 05493 pvd = vd->vdev_parent; 05494 ppvd = pvd->vdev_parent; 05495 guid = vd->vdev_guid; 05496 pguid = pvd->vdev_guid; 05497 ppguid = ppvd->vdev_guid; 05498 sguid = 0; 05499 /* 05500 * If we have just finished replacing a hot spared device, then 05501 * we need to detach the parent's first child (the original hot 05502 * spare) as well. 05503 */ 05504 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 05505 ppvd->vdev_children == 2) { 05506 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 05507 sguid = ppvd->vdev_child[1]->vdev_guid; 05508 } 05509 spa_config_exit(spa, SCL_ALL, FTAG); 05510 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 05511 return; 05512 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 05513 return; 05514 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 05515 } 05516 05517 spa_config_exit(spa, SCL_ALL, FTAG); 05518 } 05519 05523 int 05524 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 05525 boolean_t ispath) 05526 { 05527 vdev_t *vd; 05528 boolean_t sync = B_FALSE; 05529 05530 ASSERT(spa_writeable(spa)); 05531 05532 spa_vdev_state_enter(spa, SCL_ALL); 05533 05534 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 05535 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 05536 05537 if (!vd->vdev_ops->vdev_op_leaf) 05538 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 05539 05540 if (ispath) { 05541 if (strcmp(value, vd->vdev_path) != 0) { 05542 spa_strfree(vd->vdev_path); 05543 vd->vdev_path = spa_strdup(value); 05544 sync = B_TRUE; 05545 } 05546 } else { 05547 if (vd->vdev_fru == NULL) { 05548 vd->vdev_fru = spa_strdup(value); 05549 sync = B_TRUE; 05550 } else if (strcmp(value, vd->vdev_fru) != 0) { 05551 spa_strfree(vd->vdev_fru); 05552 vd->vdev_fru = spa_strdup(value); 05553 sync = B_TRUE; 05554 } 05555 } 05556 05557 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 05558 } 05559 05560 int 05561 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 05562 { 05563 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 05564 } 05565 05566 int 05567 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 05568 { 05569 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 05570 } 05571 05572 /* 05573 * ========================================================================== 05574 * SPA Scanning 05575 * ========================================================================== 05576 */ 05577 05578 int 05579 spa_scan_stop(spa_t *spa) 05580 { 05581 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 05582 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 05583 return (EBUSY); 05584 return (dsl_scan_cancel(spa->spa_dsl_pool)); 05585 } 05586 05587 int 05588 spa_scan(spa_t *spa, pool_scan_func_t func) 05589 { 05590 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 05591 05592 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 05593 return (ENOTSUP); 05594 05595 /* 05596 * If a resilver was requested, but there is no DTL on a 05597 * writeable leaf device, we have nothing to do. 05598 */ 05599 if (func == POOL_SCAN_RESILVER && 05600 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 05601 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 05602 return (0); 05603 } 05604 05605 return (dsl_scan(spa->spa_dsl_pool, func)); 05606 } 05607 05608 /* 05609 * ========================================================================== 05610 * SPA async task processing 05611 * ========================================================================== 05612 */ 05613 05614 static void 05615 spa_async_remove(spa_t *spa, vdev_t *vd) 05616 { 05617 if (vd->vdev_remove_wanted) { 05618 vd->vdev_remove_wanted = B_FALSE; 05619 vd->vdev_delayed_close = B_FALSE; 05620 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 05621 05622 /* 05623 * We want to clear the stats, but we don't want to do a full 05624 * vdev_clear() as that will cause us to throw away 05625 * degraded/faulted state as well as attempt to reopen the 05626 * device, all of which is a waste. 
05627 */ 05628 vd->vdev_stat.vs_read_errors = 0; 05629 vd->vdev_stat.vs_write_errors = 0; 05630 vd->vdev_stat.vs_checksum_errors = 0; 05631 05632 vdev_state_dirty(vd->vdev_top); 05633 } 05634 05635 for (int c = 0; c < vd->vdev_children; c++) 05636 spa_async_remove(spa, vd->vdev_child[c]); 05637 } 05638 05639 static void 05640 spa_async_probe(spa_t *spa, vdev_t *vd) 05641 { 05642 if (vd->vdev_probe_wanted) { 05643 vd->vdev_probe_wanted = B_FALSE; 05644 vdev_reopen(vd); /* vdev_open() does the actual probe */ 05645 } 05646 05647 for (int c = 0; c < vd->vdev_children; c++) 05648 spa_async_probe(spa, vd->vdev_child[c]); 05649 } 05650 05651 static void 05652 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 05653 { 05654 sysevent_id_t eid; 05655 nvlist_t *attr; 05656 05657 if (!spa->spa_autoexpand) 05658 return; 05659 05660 for (int c = 0; c < vd->vdev_children; c++) { 05661 vdev_t *cvd = vd->vdev_child[c]; 05662 spa_async_autoexpand(spa, cvd); 05663 } 05664 05665 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_path == NULL) 05666 return; 05667 05668 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 05669 VERIFY(nvlist_add_string(attr, DEV_PATH, vd->vdev_path) == 0); 05670 05671 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 05672 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 05673 05674 nvlist_free(attr); 05675 } 05676 05677 static void 05678 spa_async_thread(void *arg) 05679 { 05680 spa_t *spa = arg; 05681 int tasks; 05682 05683 ASSERT(spa->spa_sync_on); 05684 05685 mutex_enter(&spa->spa_async_lock); 05686 tasks = spa->spa_async_tasks; 05687 spa->spa_async_tasks = 0; 05688 mutex_exit(&spa->spa_async_lock); 05689 05690 /* 05691 * See if the config needs to be updated. 05692 */ 05693 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 05694 uint64_t old_space, new_space; 05695 05696 mutex_enter(&spa_namespace_lock); 05697 old_space = metaslab_class_get_space(spa_normal_class(spa)); 05698 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 05699 new_space = metaslab_class_get_space(spa_normal_class(spa)); 05700 mutex_exit(&spa_namespace_lock); 05701 05702 /* 05703 * If the pool grew as a result of the config update, 05704 * then log an internal history event. 05705 */ 05706 if (new_space != old_space) { 05707 spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 05708 spa, NULL, 05709 "pool '%s' size: %llu(+%llu)", 05710 spa_name(spa), new_space, new_space - old_space); 05711 } 05712 } 05713 05714 /* 05715 * See if any devices need to be marked REMOVED. 05716 */ 05717 if (tasks & SPA_ASYNC_REMOVE) { 05718 spa_vdev_state_enter(spa, SCL_NONE); 05719 spa_async_remove(spa, spa->spa_root_vdev); 05720 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 05721 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 05722 for (int i = 0; i < spa->spa_spares.sav_count; i++) 05723 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 05724 (void) spa_vdev_state_exit(spa, NULL, 0); 05725 } 05726 05727 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 05728 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 05729 spa_async_autoexpand(spa, spa->spa_root_vdev); 05730 spa_config_exit(spa, SCL_CONFIG, FTAG); 05731 } 05732 05733 /* 05734 * See if any devices need to be probed. 05735 */ 05736 if (tasks & SPA_ASYNC_PROBE) { 05737 spa_vdev_state_enter(spa, SCL_NONE); 05738 spa_async_probe(spa, spa->spa_root_vdev); 05739 (void) spa_vdev_state_exit(spa, NULL, 0); 05740 } 05741 05742 /* 05743 * If any devices are done replacing, detach them. 
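 *
 * SPA_ASYNC_RESILVER_DONE is requested, for instance, by spa_scan() above
 * when a resilver is asked for but no writeable leaf vdev has any DTL to
 * repair; spa_vdev_resilver_done() then tears down whatever replacing or
 * spare vdevs have completed their work.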
05744 */ 05745 if (tasks & SPA_ASYNC_RESILVER_DONE) 05746 spa_vdev_resilver_done(spa); 05747 05748 /* 05749 * Kick off a resilver. 05750 */ 05751 if (tasks & SPA_ASYNC_RESILVER) 05752 dsl_resilver_restart(spa->spa_dsl_pool, 0); 05753 05754 /* 05755 * Let the world know that we're done. 05756 */ 05757 mutex_enter(&spa->spa_async_lock); 05758 spa->spa_async_thread = NULL; 05759 cv_broadcast(&spa->spa_async_cv); 05760 mutex_exit(&spa->spa_async_lock); 05761 thread_exit(); 05762 } 05763 05764 void 05765 spa_async_suspend(spa_t *spa) 05766 { 05767 mutex_enter(&spa->spa_async_lock); 05768 spa->spa_async_suspended++; 05769 while (spa->spa_async_thread != NULL) 05770 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 05771 mutex_exit(&spa->spa_async_lock); 05772 } 05773 05774 void 05775 spa_async_resume(spa_t *spa) 05776 { 05777 mutex_enter(&spa->spa_async_lock); 05778 ASSERT(spa->spa_async_suspended != 0); 05779 spa->spa_async_suspended--; 05780 mutex_exit(&spa->spa_async_lock); 05781 } 05782 05783 static int 05784 spa_async_tasks_pending(spa_t *spa) 05785 { 05786 u_int non_config_tasks; 05787 u_int config_task; 05788 boolean_t config_task_suspended; 05789 05790 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 05791 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 05792 if (spa->spa_ccw_fail_time == 0) { 05793 config_task_suspended = B_FALSE; 05794 } else { 05795 config_task_suspended = 05796 (ddi_get_lbolt64() - spa->spa_ccw_fail_time) 05797 < (zfs_ccw_retry_interval * hz); 05798 } 05799 05800 return (non_config_tasks || (config_task && !config_task_suspended)); 05801 } 05802 05803 static void 05804 spa_async_dispatch(spa_t *spa) 05805 { 05806 mutex_enter(&spa->spa_async_lock); 05807 if (spa_async_tasks_pending(spa) && 05808 !spa->spa_async_suspended && 05809 spa->spa_async_thread == NULL && 05810 rootdir != NULL) 05811 spa->spa_async_thread = thread_create(NULL, 0, 05812 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 05813 mutex_exit(&spa->spa_async_lock); 05814 } 05815 05816 void 05817 spa_async_request(spa_t *spa, int task) 05818 { 05819 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 05820 mutex_enter(&spa->spa_async_lock); 05821 spa->spa_async_tasks |= task; 05822 mutex_exit(&spa->spa_async_lock); 05823 } 05824 05825 /* 05826 * ========================================================================== 05827 * SPA syncing routines 05828 * ========================================================================== 05829 */ 05830 05831 static int 05832 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 05833 { 05834 bpobj_t *bpo = arg; 05835 bpobj_enqueue(bpo, bp, tx); 05836 return (0); 05837 } 05838 05839 static int 05840 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 05841 { 05842 zio_t *zio = arg; 05843 05844 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 05845 BP_GET_PSIZE(bp), zio->io_flags)); 05846 return (0); 05847 } 05848 05849 static void 05850 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 05851 { 05852 char *packed = NULL; 05853 size_t bufsize; 05854 size_t nvsize = 0; 05855 dmu_buf_t *db; 05856 05857 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 05858 05859 /* 05860 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 05861 * information. This avoids the dbuf_will_dirty() path and 05862 * saves us a pre-read to get data we don't actually care about. 
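 *
 * The resulting on-disk layout is: the object's data blocks hold the
 * XDR-packed nvlist, zero-padded up to a multiple of SPA_CONFIG_BLOCKSIZE,
 * while the object's bonus buffer holds a single uint64_t recording the
 * true packed size so a reader knows how many bytes to unpack.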
05863 */ 05864 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 05865 packed = kmem_alloc(bufsize, KM_SLEEP); 05866 05867 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 05868 KM_SLEEP) == 0); 05869 bzero(packed + nvsize, bufsize - nvsize); 05870 05871 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 05872 05873 kmem_free(packed, bufsize); 05874 05875 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 05876 dmu_buf_will_dirty(db, tx); 05877 *(uint64_t *)db->db_data = nvsize; 05878 dmu_buf_rele(db, FTAG); 05879 } 05880 05881 static void 05882 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 05883 const char *config, const char *entry) 05884 { 05885 nvlist_t *nvroot; 05886 nvlist_t **list; 05887 int i; 05888 05889 if (!sav->sav_sync) 05890 return; 05891 05892 /* 05893 * Update the MOS nvlist describing the list of available devices. 05894 * spa_validate_aux() will have already made sure this nvlist is 05895 * valid and the vdevs are labeled appropriately. 05896 */ 05897 if (sav->sav_object == 0) { 05898 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 05899 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 05900 sizeof (uint64_t), tx); 05901 VERIFY(zap_update(spa->spa_meta_objset, 05902 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 05903 &sav->sav_object, tx) == 0); 05904 } 05905 05906 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 05907 if (sav->sav_count == 0) { 05908 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 05909 } else { 05910 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 05911 for (i = 0; i < sav->sav_count; i++) 05912 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 05913 B_FALSE, VDEV_CONFIG_L2CACHE); 05914 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 05915 sav->sav_count) == 0); 05916 for (i = 0; i < sav->sav_count; i++) 05917 nvlist_free(list[i]); 05918 kmem_free(list, sav->sav_count * sizeof (void *)); 05919 } 05920 05921 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 05922 nvlist_free(nvroot); 05923 05924 sav->sav_sync = B_FALSE; 05925 } 05926 05927 static void 05928 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 05929 { 05930 nvlist_t *config; 05931 05932 if (list_is_empty(&spa->spa_config_dirty_list)) 05933 return; 05934 05935 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 05936 05937 config = spa_config_generate(spa, spa->spa_root_vdev, 05938 dmu_tx_get_txg(tx), B_FALSE); 05939 05940 spa_config_exit(spa, SCL_STATE, FTAG); 05941 05942 if (spa->spa_config_syncing) 05943 nvlist_free(spa->spa_config_syncing); 05944 spa->spa_config_syncing = config; 05945 05946 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 05947 } 05948 05949 static void 05950 spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) 05951 { 05952 spa_t *spa = arg1; 05953 uint64_t version = *(uint64_t *)arg2; 05954 05955 /* 05956 * Setting the version is special cased when first creating the pool. 
05957 */ 05958 ASSERT(tx->tx_txg != TXG_INITIAL); 05959 05960 ASSERT(version <= SPA_VERSION); 05961 ASSERT(version >= spa_version(spa)); 05962 05963 spa->spa_uberblock.ub_version = version; 05964 vdev_config_dirty(spa->spa_root_vdev); 05965 } 05966 05970 static void 05971 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 05972 { 05973 spa_t *spa = arg1; 05974 objset_t *mos = spa->spa_meta_objset; 05975 nvlist_t *nvp = arg2; 05976 nvpair_t *elem = NULL; 05977 05978 mutex_enter(&spa->spa_props_lock); 05979 05980 while ((elem = nvlist_next_nvpair(nvp, elem))) { 05981 uint64_t intval; 05982 char *strval, *fname; 05983 zpool_prop_t prop; 05984 const char *propname; 05985 zprop_type_t proptype; 05986 zfeature_info_t *feature; 05987 05988 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 05989 case ZPROP_INVAL: 05990 /* 05991 * We checked this earlier in spa_prop_validate(). 05992 */ 05993 ASSERT(zpool_prop_feature(nvpair_name(elem))); 05994 05995 fname = strchr(nvpair_name(elem), '@') + 1; 05996 VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature)); 05997 05998 spa_feature_enable(spa, feature, tx); 05999 break; 06000 06001 case ZPOOL_PROP_VERSION: 06002 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 06003 /* 06004 * The version is synced separately before other 06005 * properties and should be correct by now. 06006 */ 06007 ASSERT3U(spa_version(spa), >=, intval); 06008 break; 06009 06010 case ZPOOL_PROP_ALTROOT: 06011 /* 06012 * 'altroot' is a non-persistent property. It should 06013 * have been set temporarily at creation or import time. 06014 */ 06015 ASSERT(spa->spa_root != NULL); 06016 break; 06017 06018 case ZPOOL_PROP_READONLY: 06019 case ZPOOL_PROP_CACHEFILE: 06020 /* 06021 * 'readonly' and 'cachefile' are also non-persistent 06022 * properties. 06023 */ 06024 break; 06025 case ZPOOL_PROP_COMMENT: 06026 VERIFY(nvpair_value_string(elem, &strval) == 0); 06027 if (spa->spa_comment != NULL) 06028 spa_strfree(spa->spa_comment); 06029 spa->spa_comment = spa_strdup(strval); 06030 /* 06031 * We need to dirty the configuration on all the vdevs 06032 * so that their labels get updated. It's unnecessary 06033 * to do this for pool creation since the vdev's 06034 * configuration has already been dirtied. 06035 */ 06036 if (tx->tx_txg != TXG_INITIAL) 06037 vdev_config_dirty(spa->spa_root_vdev); 06038 break; 06039 default: 06040 /* 06041 * Set pool property values in the poolprops mos object.
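 *
 * Within this ZAP object, string-valued properties are stored as
 * NUL-terminated strings and numeric (including index-typed) properties as
 * a single uint64_t, in both cases keyed by the canonical name returned by
 * zpool_prop_to_name().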
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
    dsl_pool_t *dp = spa->spa_dsl_pool;

    ASSERT(spa->spa_sync_pass == 1);

    if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
        spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
        dsl_pool_create_origin(dp, tx);

        /* Keeping the origin open increases spa_minref */
        spa->spa_minref += 3;
    }

    if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
        spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
        dsl_pool_upgrade_clones(dp, tx);
    }

    if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
        spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
        dsl_pool_upgrade_dir_clones(dp, tx);

        /* Keeping the freedir open increases spa_minref */
        spa->spa_minref += 3;
    }

    if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
        spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
        spa_feature_create_zap_objects(spa, tx);
    }
}

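/*
 * Sync the specified transaction group.  New blocks may be dirtied as part
 * of the process, so we iterate until the on-disk state converges.  This is
 * the entry point called once per txg by the pool's txg sync thread; code
 * elsewhere that needs a txg on stable storage does not call spa_sync()
 * directly, it blocks on the sync thread instead, e.g. (the same pattern
 * used by spa_sync_allpools() below):
 *
 *      txg_wait_synced(spa_get_dsl(spa), 0);
 */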
void
spa_sync(spa_t *spa, uint64_t txg)
{
    dsl_pool_t *dp = spa->spa_dsl_pool;
    objset_t *mos = spa->spa_meta_objset;
    bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
    bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
    vdev_t *rvd = spa->spa_root_vdev;
    vdev_t *vd;
    dmu_tx_t *tx;
    int error;

    VERIFY(spa_writeable(spa));

    /*
     * Lock out configuration changes.
     */
    spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

    spa->spa_syncing_txg = txg;
    spa->spa_sync_pass = 0;

    /*
     * If there are any pending vdev state changes, convert them
     * into config changes that go out with this transaction group.
     */
    spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
    while (list_head(&spa->spa_state_dirty_list) != NULL) {
        /*
         * We need the write lock here because, for aux vdevs,
         * calling vdev_config_dirty() modifies sav_config.
         * This is ugly and will become unnecessary when we
         * eliminate the aux vdev wart by integrating all vdevs
         * into the root vdev tree.
         */
        spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
        spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
        while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
            vdev_state_clean(vd);
            vdev_config_dirty(vd);
        }
        spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
        spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
    }
    spa_config_exit(spa, SCL_STATE, FTAG);

    tx = dmu_tx_create_assigned(dp, txg);

    /*
     * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
     * set spa_deflate if we have no raid-z vdevs.
     */
    if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
        spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
        int i;

        for (i = 0; i < rvd->vdev_children; i++) {
            vd = rvd->vdev_child[i];
            if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
                break;
        }
        if (i == rvd->vdev_children) {
            spa->spa_deflate = TRUE;
            VERIFY(0 == zap_add(spa->spa_meta_objset,
                DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
                sizeof (uint64_t), 1, &spa->spa_deflate, tx));
        }
    }

    /*
     * If anything has changed in this txg, or if someone is waiting
     * for this txg to sync (e.g., spa_vdev_remove()), push the
     * deferred frees from the previous txg. If not, leave them
     * alone so that we don't generate work on an otherwise idle
     * system.
     */
    if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
        !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
        !txg_list_empty(&dp->dp_sync_tasks, txg) ||
        ((dsl_scan_active(dp->dp_scan) ||
        txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
        zio_t *zio = zio_root(spa, NULL, NULL, 0);
        VERIFY3U(bpobj_iterate(defer_bpo,
            spa_free_sync_cb, zio, tx), ==, 0);
        VERIFY0(zio_wait(zio));
    }

    /*
     * Iterate to convergence.
     */
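    /*
     * Each iteration of the loop below is one "sync pass".  Syncing the
     * config, the aux vdev lists, the error log, the DSL pool, the DDT
     * and the scan state can itself dirty the MOS, so we keep going until
     * dmu_objset_is_dirty() reports that the MOS has settled.  To bound
     * the number of passes, frees are only executed for the first
     * SYNC_PASS_DEFERRED_FREE passes; after that they are enqueued on the
     * deferred bpobj and processed at the start of the next txg (see the
     * bpobj_iterate() call above).
     */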
    do {
        int pass = ++spa->spa_sync_pass;

        spa_sync_config_object(spa, tx);
        spa_sync_aux_dev(spa, &spa->spa_spares, tx,
            ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
        spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
            ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
        spa_errlog_sync(spa, txg);
        dsl_pool_sync(dp, txg);

        if (pass <= SYNC_PASS_DEFERRED_FREE) {
            zio_t *zio = zio_root(spa, NULL, NULL, 0);
            bplist_iterate(free_bpl, spa_free_sync_cb,
                zio, tx);
            VERIFY(zio_wait(zio) == 0);
        } else {
            bplist_iterate(free_bpl, bpobj_enqueue_cb,
                defer_bpo, tx);
        }

        ddt_sync(spa, txg);
        dsl_scan_sync(dp, tx);

        while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
            vdev_sync(vd, txg);

        if (pass == 1)
            spa_sync_upgrades(spa, tx);

    } while (dmu_objset_is_dirty(mos, txg));

    /*
     * Rewrite the vdev configuration (which includes the uberblock)
     * to commit the transaction group.
     *
     * If there are no dirty vdevs, we sync the uberblock to a few
     * random top-level vdevs that are known to be visible in the
     * config cache (see spa_vdev_add() for a complete description).
     * If there *are* dirty vdevs, sync the uberblock to all vdevs.
     */
    for (;;) {
        /*
         * We hold SCL_STATE to prevent vdev open/close/etc.
         * while we're attempting to write the vdev labels.
         */
        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

        if (list_is_empty(&spa->spa_config_dirty_list)) {
            vdev_t *svd[SPA_DVAS_PER_BP];
            int svdcount = 0;
            int children = rvd->vdev_children;
            int c0 = spa_get_random(children);

            for (int c = 0; c < children; c++) {
                vd = rvd->vdev_child[(c0 + c) % children];
                if (vd->vdev_ms_array == 0 || vd->vdev_islog)
                    continue;
                svd[svdcount++] = vd;
                if (svdcount == SPA_DVAS_PER_BP)
                    break;
            }
            error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
            if (error != 0)
                error = vdev_config_sync(svd, svdcount, txg,
                    B_TRUE);
        } else {
            error = vdev_config_sync(rvd->vdev_child,
                rvd->vdev_children, txg, B_FALSE);
            if (error != 0)
                error = vdev_config_sync(rvd->vdev_child,
                    rvd->vdev_children, txg, B_TRUE);
        }

        if (error == 0)
            spa->spa_last_synced_guid = rvd->vdev_guid;

        spa_config_exit(spa, SCL_STATE, FTAG);

        if (error == 0)
            break;
        zio_suspend(spa, NULL);
        zio_resume_wait(spa);
    }
    dmu_tx_commit(tx);

    /*
     * Clear the dirty config list.
     */
    while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
        vdev_config_clean(vd);

    /*
     * Now that the new config has synced transactionally,
     * let it become visible to the config cache.
     */
    if (spa->spa_config_syncing != NULL) {
        spa_config_set(spa, spa->spa_config_syncing);
        spa->spa_config_txg = txg;
        spa->spa_config_syncing = NULL;
    }

    spa->spa_ubsync = spa->spa_uberblock;

    dsl_pool_sync_done(dp, txg);

    /*
     * Update usable space statistics.
     */
    while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
        vdev_sync_done(vd, txg);

    spa_update_dspace(spa);

    /*
     * It had better be the case that we didn't dirty anything
     * since vdev_config_sync().
     */
    ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
    ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
    ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

    spa->spa_sync_pass = 0;

    spa_config_exit(spa, SCL_CONFIG, FTAG);

    spa_handle_ignored_writes(spa);

    /*
     * If any async tasks have been requested, kick them off.
     */
    spa_async_dispatch(spa);
}

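/*
 * Sync all pools.  We don't want to hold the namespace lock across the
 * txg_wait_synced() calls, so for each active, writable, non-suspended
 * pool we take a reference on the spa_t and drop the lock while waiting.
 */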
void
spa_sync_allpools(void)
{
    spa_t *spa = NULL;
    mutex_enter(&spa_namespace_lock);
    while ((spa = spa_next(spa)) != NULL) {
        if (spa_state(spa) != POOL_STATE_ACTIVE ||
            !spa_writeable(spa) || spa_suspended(spa))
            continue;
        spa_open_ref(spa, FTAG);
        mutex_exit(&spa_namespace_lock);
        txg_wait_synced(spa_get_dsl(spa), 0);
        mutex_enter(&spa_namespace_lock);
        spa_close(spa, FTAG);
    }
    mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

void
spa_evict_all(void)
{
    spa_t *spa;

    /*
     * Remove all cached state. All pools should be closed now,
     * so every spa in the AVL tree should be unreferenced.
     */
    mutex_enter(&spa_namespace_lock);
    while ((spa = spa_next(NULL)) != NULL) {
        /*
         * Stop async tasks. The async thread may need to detach
         * a device that's been replaced, which requires grabbing
         * spa_namespace_lock, so we must drop it here.
         */
        spa_open_ref(spa, FTAG);
        mutex_exit(&spa_namespace_lock);
        spa_async_suspend(spa);
        mutex_enter(&spa_namespace_lock);
        spa_close(spa, FTAG);

        if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
            spa_unload(spa);
            spa_deactivate(spa);
        }
        spa_remove(spa);
    }
    mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
    vdev_t *vd;
    int i;

    if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
        return (vd);

    if (aux) {
        for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
            vd = spa->spa_l2cache.sav_vdevs[i];
            if (vd->vdev_guid == guid)
                return (vd);
        }

        for (i = 0; i < spa->spa_spares.sav_count; i++) {
            vd = spa->spa_spares.sav_vdevs[i];
            if (vd->vdev_guid == guid)
                return (vd);
        }
    }

    return (NULL);
}

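/*
 * Raise the on-disk SPA version.  This is what a userland "zpool upgrade"
 * request ultimately invokes through the ZFS ioctl path: it bumps the
 * in-core uberblock version, dirties the vdev configuration so new labels
 * are written, and the txg_wait_synced() at the end ensures the upgraded
 * version is on stable storage before we return.
 */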
void
spa_upgrade(spa_t *spa, uint64_t version)
{
    ASSERT(spa_writeable(spa));

    spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

    /*
     * This should only be called for a non-faulted pool, and since a
     * future version would result in an unopenable pool, this shouldn't be
     * possible.
     */
    ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
    ASSERT(version >= spa->spa_uberblock.ub_version);

    spa->spa_uberblock.ub_version = version;
    vdev_config_dirty(spa->spa_root_vdev);

    spa_config_exit(spa, SCL_ALL, FTAG);

    txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
    int i;
    uint64_t spareguid;
    spa_aux_vdev_t *sav = &spa->spa_spares;

    for (i = 0; i < sav->sav_count; i++)
        if (sav->sav_vdevs[i]->vdev_guid == guid)
            return (B_TRUE);

    for (i = 0; i < sav->sav_npending; i++) {
        if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
            &spareguid) == 0 && spareguid == guid)
            return (B_TRUE);
    }

    return (B_FALSE);
}

static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
    int i, refcnt;
    uint64_t pool;
    spa_aux_vdev_t *sav = &spa->spa_spares;

    for (i = 0; i < sav->sav_count; i++) {
        if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
            &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
            refcnt > 2)
            return (B_TRUE);
    }

    return (B_FALSE);
}

void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
    sysevent_t *ev;
    sysevent_attr_list_t *attr = NULL;
    sysevent_value_t value;
    sysevent_id_t eid;

    ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
        SE_SLEEP);

    value.value_type = SE_DATA_TYPE_STRING;
    value.value.sv_string = spa_name(spa);
    if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
        goto done;

    value.value_type = SE_DATA_TYPE_UINT64;
    value.value.sv_uint64 = spa_guid(spa);
    if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
        goto done;

    if (vd) {
        value.value_type = SE_DATA_TYPE_UINT64;
        value.value.sv_uint64 = vd->vdev_guid;
        if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
            SE_SLEEP) != 0)
            goto done;

        if (vd->vdev_path) {
            value.value_type = SE_DATA_TYPE_STRING;
            value.value.sv_string = vd->vdev_path;
            if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
                &value, SE_SLEEP) != 0)
                goto done;
        }
    }

    if (sysevent_attach_attributes(ev, attr) != 0)
        goto done;
    attr = NULL;

    (void) log_sysevent(ev, SE_SLEEP, &eid);

done:
    if (attr)
        sysevent_free_attr(attr);
    sysevent_free(ev);
#endif
}
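
/*
 * Note on event delivery: callers typically pass one of the ESC_ZFS_*
 * event names from sys/sysevent/eventdefs.h (for example resilver
 * start/finish or vdev removal).  On FreeBSD the Solaris sysevent
 * interface used above is supplied by the OpenSolaris compatibility
 * layer, so these notifications ultimately reach userland consumers
 * such as devd(8).
 */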