/*
 * FreeBSD ZFS
 * The Zettabyte File System
 */
00001 /* 00002 * CDDL HEADER START 00003 * 00004 * The contents of this file are subject to the terms of the 00005 * Common Development and Distribution License (the "License"). 00006 * You may not use this file except in compliance with the License. 00007 * 00008 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 00009 * or http://www.opensolaris.org/os/licensing. 00010 * See the License for the specific language governing permissions 00011 * and limitations under the License. 00012 * 00013 * When distributing Covered Code, include this CDDL HEADER in each 00014 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 00015 * If applicable, add the following below this CDDL HEADER, with the 00016 * fields enclosed by brackets "[]" replaced with your own identifying 00017 * information: Portions Copyright [yyyy] [name of copyright owner] 00018 * 00019 * CDDL HEADER END 00020 */ 00021 00022 /* 00023 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 00024 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 00025 * Copyright (c) 2012 by Delphix. All rights reserved. 00026 */ 00027 00028 #include <sys/zfs_context.h> 00029 #include <sys/fm/fs/zfs.h> 00030 #include <sys/spa.h> 00031 #include <sys/spa_impl.h> 00032 #include <sys/nvpair.h> 00033 #include <sys/uio.h> 00034 #include <sys/fs/zfs.h> 00035 #include <sys/vdev_impl.h> 00036 #include <sys/zfs_ioctl.h> 00037 #include <sys/utsname.h> 00038 #include <sys/sunddi.h> 00039 #include <sys/zfeature.h> 00040 #ifdef _KERNEL 00041 #include <sys/kobj.h> 00042 #include <sys/zone.h> 00043 #endif 00044 00063 static uint64_t spa_config_generation = 1; 00064 00069 const char *spa_config_path = ZPOOL_CACHE; 00070 00076 void 00077 spa_config_load(void) 00078 { 00079 void *buf = NULL; 00080 nvlist_t *nvlist, *child; 00081 nvpair_t *nvpair; 00082 char *pathname; 00083 struct _buf *file; 00084 uint64_t fsize; 00085 00086 /* 00087 * Open the configuration file. 
00088 */ 00089 pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 00090 00091 (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); 00092 00093 file = kobj_open_file(pathname); 00094 00095 kmem_free(pathname, MAXPATHLEN); 00096 00097 if (file == (struct _buf *)-1) 00098 return; 00099 00100 if (kobj_get_filesize(file, &fsize) != 0) 00101 goto out; 00102 00103 buf = kmem_alloc(fsize, KM_SLEEP); 00104 00105 /* 00106 * Read the nvlist from the file. 00107 */ 00108 if (kobj_read_file(file, buf, fsize, 0) < 0) 00109 goto out; 00110 00111 /* 00112 * Unpack the nvlist. 00113 */ 00114 if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) 00115 goto out; 00116 00117 /* 00118 * Iterate over all elements in the nvlist, creating a new spa_t for 00119 * each one with the specified configuration. 00120 */ 00121 mutex_enter(&spa_namespace_lock); 00122 nvpair = NULL; 00123 while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { 00124 if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) 00125 continue; 00126 00127 VERIFY(nvpair_value_nvlist(nvpair, &child) == 0); 00128 00129 if (spa_lookup(nvpair_name(nvpair)) != NULL) 00130 continue; 00131 (void) spa_add(nvpair_name(nvpair), child, NULL); 00132 } 00133 mutex_exit(&spa_namespace_lock); 00134 00135 nvlist_free(nvlist); 00136 00137 out: 00138 if (buf != NULL) 00139 kmem_free(buf, fsize); 00140 00141 kobj_close_file(file); 00142 } 00143 00144 static int 00145 spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) 00146 { 00147 size_t buflen; 00148 char *buf; 00149 vnode_t *vp; 00150 int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; 00151 char *temp; 00152 int err; 00153 00154 /* 00155 * If the nvlist is empty (NULL), then remove the old cachefile. 00156 */ 00157 if (nvl == NULL) { 00158 err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); 00159 return (err); 00160 } 00161 00162 /* 00163 * Pack the configuration into a buffer. 
00164 */ 00165 VERIFY(nvlist_size(nvl, &buflen, NV_ENCODE_XDR) == 0); 00166 00167 buf = kmem_alloc(buflen, KM_SLEEP); 00168 temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 00169 00170 VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR, 00171 KM_SLEEP) == 0); 00172 00173 /* 00174 * Write the configuration to disk. We need to do the traditional 00175 * 'write to temporary file, sync, move over original' to make sure we 00176 * always have a consistent view of the data. 00177 */ 00178 (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); 00179 00180 err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0); 00181 if (err == 0) { 00182 if ((err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, 00183 0, RLIM64_INFINITY, kcred, NULL)) == 0 && 00184 (err = VOP_FSYNC(vp, FSYNC, kcred, NULL)) == 0) { 00185 err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE); 00186 } 00187 (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); 00188 } 00189 00190 (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); 00191 00192 kmem_free(buf, buflen); 00193 kmem_free(temp, MAXPATHLEN); 00194 return (err); 00195 } 00196 00201 void 00202 spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) 00203 { 00204 spa_config_dirent_t *dp, *tdp; 00205 nvlist_t *nvl; 00206 boolean_t ccw_failure; 00207 int error; 00208 00209 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 00210 00211 if (rootdir == NULL || !(spa_mode_global & FWRITE)) 00212 return; 00213 00214 /* 00215 * Iterate over all cachefiles for the pool, past or present. When the 00216 * cachefile is changed, the new one is pushed onto this list, allowing 00217 * us to update previous cachefiles that no longer contain this pool. 00218 */ 00219 ccw_failure = B_FALSE; 00220 for (dp = list_head(&target->spa_config_list); dp != NULL; 00221 dp = list_next(&target->spa_config_list, dp)) { 00222 spa_t *spa = NULL; 00223 if (dp->scd_path == NULL) 00224 continue; 00225 00226 /* 00227 * Iterate over all pools, adding any matching pools to 'nvl'. 
00228 */ 00229 nvl = NULL; 00230 while ((spa = spa_next(spa)) != NULL) { 00231 if (spa == target && removing) 00232 continue; 00233 00234 mutex_enter(&spa->spa_props_lock); 00235 tdp = list_head(&spa->spa_config_list); 00236 if (spa->spa_config == NULL || 00237 tdp->scd_path == NULL || 00238 strcmp(tdp->scd_path, dp->scd_path) != 0) { 00239 mutex_exit(&spa->spa_props_lock); 00240 continue; 00241 } 00242 00243 if (nvl == NULL) 00244 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, 00245 KM_SLEEP) == 0); 00246 00247 VERIFY(nvlist_add_nvlist(nvl, spa->spa_name, 00248 spa->spa_config) == 0); 00249 mutex_exit(&spa->spa_props_lock); 00250 } 00251 00252 error = spa_config_write(dp, nvl); 00253 if (error != 0) { 00254 00255 printf("ZFS ERROR: Update of cache file %s failed: " 00256 "Errno %d\n", dp->scd_path, error); 00257 ccw_failure = B_TRUE; 00258 } 00259 00260 nvlist_free(nvl); 00261 } 00262 00263 if (ccw_failure) { 00264 /* 00265 * Keep trying so that configuration data is 00266 * written if/when any temporary filesystem 00267 * resource issues are resolved. 00268 */ 00269 target->spa_ccw_fail_time = ddi_get_lbolt64(); 00270 spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); 00271 zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, 00272 target, NULL, NULL, 0, 0); 00273 } else { 00274 /* 00275 * Do not rate limit future attempts to update 00276 * the config cache. 00277 */ 00278 target->spa_ccw_fail_time = 0; 00279 } 00280 00281 /* 00282 * Remove any config entries older than the current one. 
00283 */ 00284 dp = list_head(&target->spa_config_list); 00285 while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { 00286 list_remove(&target->spa_config_list, tdp); 00287 if (tdp->scd_path != NULL) 00288 spa_strfree(tdp->scd_path); 00289 kmem_free(tdp, sizeof (spa_config_dirent_t)); 00290 } 00291 00292 spa_config_generation++; 00293 00294 if (postsysevent) 00295 spa_event_notify(target, NULL, ESC_ZFS_CONFIG_SYNC); 00296 } 00297 00304 nvlist_t * 00305 spa_all_configs(uint64_t *generation) 00306 { 00307 nvlist_t *pools; 00308 spa_t *spa = NULL; 00309 00310 if (*generation == spa_config_generation) 00311 return (NULL); 00312 00313 VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0); 00314 00315 mutex_enter(&spa_namespace_lock); 00316 while ((spa = spa_next(spa)) != NULL) { 00317 if (INGLOBALZONE(curthread) || 00318 zone_dataset_visible(spa_name(spa), NULL)) { 00319 mutex_enter(&spa->spa_props_lock); 00320 VERIFY(nvlist_add_nvlist(pools, spa_name(spa), 00321 spa->spa_config) == 0); 00322 mutex_exit(&spa->spa_props_lock); 00323 } 00324 } 00325 *generation = spa_config_generation; 00326 mutex_exit(&spa_namespace_lock); 00327 00328 return (pools); 00329 } 00330 00331 void 00332 spa_config_set(spa_t *spa, nvlist_t *config) 00333 { 00334 mutex_enter(&spa->spa_props_lock); 00335 if (spa->spa_config != NULL) 00336 nvlist_free(spa->spa_config); 00337 spa->spa_config = config; 00338 mutex_exit(&spa->spa_props_lock); 00339 } 00340 00347 nvlist_t * 00348 spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) 00349 { 00350 nvlist_t *config, *nvroot; 00351 vdev_t *rvd = spa->spa_root_vdev; 00352 unsigned long hostid = 0; 00353 boolean_t locked = B_FALSE; 00354 uint64_t split_guid; 00355 00356 if (vd == NULL) { 00357 vd = rvd; 00358 locked = B_TRUE; 00359 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 00360 } 00361 00362 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == 00363 (SCL_CONFIG | SCL_STATE)); 00364 
00365 /* 00366 * If txg is -1, report the current value of spa->spa_config_txg. 00367 */ 00368 if (txg == -1ULL) 00369 txg = spa->spa_config_txg; 00370 00371 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); 00372 00373 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 00374 spa_version(spa)) == 0); 00375 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 00376 spa_name(spa)) == 0); 00377 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 00378 spa_state(spa)) == 0); 00379 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 00380 txg) == 0); 00381 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 00382 spa_guid(spa)) == 0); 00383 VERIFY(spa->spa_comment == NULL || nvlist_add_string(config, 00384 ZPOOL_CONFIG_COMMENT, spa->spa_comment) == 0); 00385 00386 00387 #ifdef _KERNEL 00388 hostid = zone_get_hostid(NULL); 00389 #else /* _KERNEL */ 00390 /* 00391 * We're emulating the system's hostid in userland, so we can't use 00392 * zone_get_hostid(). 00393 */ 00394 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 00395 #endif /* _KERNEL */ 00396 if (hostid != 0) { 00397 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, 00398 hostid) == 0); 00399 } 00400 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, 00401 utsname.nodename) == 0); 00402 00403 if (vd != rvd) { 00404 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, 00405 vd->vdev_top->vdev_guid) == 0); 00406 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID, 00407 vd->vdev_guid) == 0); 00408 if (vd->vdev_isspare) 00409 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 00410 1ULL) == 0); 00411 if (vd->vdev_islog) 00412 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 00413 1ULL) == 0); 00414 vd = vd->vdev_top; /* label contains top config */ 00415 } else { 00416 /* 00417 * Only add the (potentially large) split information 00418 * in the mos config, and not in the vdev labels 00419 */ 00420 if (spa->spa_config_splitting != NULL) 00421 
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, 00422 spa->spa_config_splitting) == 0); 00423 } 00424 00425 /* 00426 * Add the top-level config. We even add this on pools which 00427 * don't support holes in the namespace. 00428 */ 00429 vdev_top_config_generate(spa, config); 00430 00431 /* 00432 * If we're splitting, record the original pool's guid. 00433 */ 00434 if (spa->spa_config_splitting != NULL && 00435 nvlist_lookup_uint64(spa->spa_config_splitting, 00436 ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { 00437 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, 00438 split_guid) == 0); 00439 } 00440 00441 nvroot = vdev_config_generate(spa, vd, getstats, 0); 00442 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 00443 nvlist_free(nvroot); 00444 00445 /* 00446 * Store what's necessary for reading the MOS in the label. 00447 */ 00448 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, 00449 spa->spa_label_features) == 0); 00450 00451 if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { 00452 ddt_histogram_t *ddh; 00453 ddt_stat_t *dds; 00454 ddt_object_t *ddo; 00455 00456 ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); 00457 ddt_get_dedup_histogram(spa, ddh); 00458 VERIFY(nvlist_add_uint64_array(config, 00459 ZPOOL_CONFIG_DDT_HISTOGRAM, 00460 (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0); 00461 kmem_free(ddh, sizeof (ddt_histogram_t)); 00462 00463 ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); 00464 ddt_get_dedup_object_stats(spa, ddo); 00465 VERIFY(nvlist_add_uint64_array(config, 00466 ZPOOL_CONFIG_DDT_OBJ_STATS, 00467 (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0); 00468 kmem_free(ddo, sizeof (ddt_object_t)); 00469 00470 dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); 00471 ddt_get_dedup_stats(spa, dds); 00472 VERIFY(nvlist_add_uint64_array(config, 00473 ZPOOL_CONFIG_DDT_STATS, 00474 (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0); 00475 kmem_free(dds, sizeof 
(ddt_stat_t)); 00476 } 00477 00478 if (locked) 00479 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 00480 00481 return (config); 00482 } 00483 00489 void 00490 spa_config_update(spa_t *spa, int what) 00491 { 00492 vdev_t *rvd = spa->spa_root_vdev; 00493 uint64_t txg; 00494 int c; 00495 00496 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 00497 00498 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 00499 txg = spa_last_synced_txg(spa) + 1; 00500 if (what == SPA_CONFIG_UPDATE_POOL) { 00501 vdev_config_dirty(rvd); 00502 } else { 00503 /* 00504 * If we have top-level vdevs that were added but have 00505 * not yet been prepared for allocation, do that now. 00506 * (It's safe now because the config cache is up to date, 00507 * so it will be able to translate the new DVAs.) 00508 * See comments in spa_vdev_add() for full details. 00509 */ 00510 for (c = 0; c < rvd->vdev_children; c++) { 00511 vdev_t *tvd = rvd->vdev_child[c]; 00512 if (tvd->vdev_ms_array == 0) 00513 vdev_metaslab_set_size(tvd); 00514 vdev_expand(tvd, txg); 00515 } 00516 } 00517 spa_config_exit(spa, SCL_ALL, FTAG); 00518 00519 /* 00520 * Wait for the mosconfig to be regenerated and synced. 00521 */ 00522 txg_wait_synced(spa->spa_dsl_pool, txg); 00523 00524 /* 00525 * Update the global config cache to reflect the new mosconfig. 00526 */ 00527 if (!spa->spa_is_root) 00528 spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); 00529 00530 if (what == SPA_CONFIG_UPDATE_POOL) 00531 spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); 00532 }