commit 4a6c300fc72ee46515d0b8c07982d496050e9fe3 Author: Andriy Gapon Date: Mon Sep 10 21:54:42 2012 +0300 [wip] zfs_mount: taste geom providers for root pool config This should allow to mount a dataset as a root filesystem even if it belongs to a pool that is not described in zpool.cache. This adds overhead to the boot process though... To do: cleanup diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 7ee082a..0fa81a4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -3533,8 +3533,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, return (0); } -#if defined(sun) #ifdef _KERNEL +#if defined(sun) /* * Get the root pool information from the root disk, then import the root pool * during the system boot up time. @@ -3736,8 +3736,115 @@ out: return (error); } -#endif +#else + +extern int +vdev_geom_read_pool_label(const char *name, nvlist_t **config); + +static nvlist_t * +spa_generate_rootconf(const char *name) +{ + nvlist_t *config; + nvlist_t *nvtop, *nvroot; + uint64_t pgid; + + if (vdev_geom_read_pool_label(name, &config) != 0) + return (NULL); + + /* + * Add this top-level vdev to the child array. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); + + /* + * Put this pool's top-level vdevs into a root vdev. + */ + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &nvtop, 1) == 0); + + /* + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvlist_free(nvroot); + return (config); +} + +int +spa_import_rootpool(const char *name) +{ + spa_t *spa; + vdev_t *rvd, *bvd, *avd = NULL; + nvlist_t *config, *nvtop; + uint64_t txg; + char *pname; + int error; + + /* + * Read the label from the boot device and generate a configuration. + */ + config = spa_generate_rootconf(name); + if (config == NULL) { + cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", + name); + return (EIO); + } + + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0 && strcmp(name, pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pname)) != NULL) { + /* + * Remove the existing root pool from the namespace so that we + * can replace it with the correct config we just read in. + */ + spa_remove(spa); + } + spa = spa_add(pname, config, NULL); + spa->spa_is_root = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; + + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } + + error = 0; + spa_history_log_version(spa, LOG_POOL_IMPORT); +out: + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&spa_namespace_lock); + + return (error); +} + #endif /* sun */ +#endif /* * Import a non-root pool into the system. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index ec6914e..0211ed5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -419,7 +419,11 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, const char *history_str, nvlist_t *zplprops); +#if defined(sun) extern int spa_import_rootpool(char *devpath, char *devid); +#else +extern int spa_import_rootpool(const char *name); +#endif extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index b2aada7..7d146ff 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -175,17 +175,11 @@ vdev_geom_detach(void *arg, int flag __unused) static uint64_t nvlist_get_guid(nvlist_t *list) { - nvpair_t *elem = NULL; uint64_t value; - while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { - if (nvpair_type(elem) == DATA_TYPE_UINT64 && - strcmp(nvpair_name(elem), "guid") == 0) { - VERIFY(nvpair_value_uint64(elem, &value) == 0); - return (value); - } - } - return (0); + value = 0; + nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, &value); + return (value); } static int @@ -223,8 +217,16 @@ vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t siz return (error); } -static uint64_t -vdev_geom_read_guid(struct g_consumer *cp) +static void +vdev_geom_taste_orphan(struct g_consumer *cp) +{ + + KASSERT(1 == 0, ("%s called while tasting %s.", __func__, + cp->provider->name)); +} + +static int +vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) { struct g_provider *pp; vdev_label_t *label; @@ -232,13 +234,13 @@ vdev_geom_read_guid(struct g_consumer *cp) size_t buflen; uint64_t psize; off_t offset, size; - uint64_t guid; + uint64_t guid, state, txg; int error, l, len; g_topology_assert_not(); pp = cp->provider; - ZFS_LOG(1, "Reading guid from %s...", pp->name); + ZFS_LOG(1, "Reading config from %s...", pp->name); psize = pp->mediasize; psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); @@ -250,8 +252,8 @@ vdev_geom_read_guid(struct g_consumer *cp) label = kmem_alloc(size, KM_SLEEP); buflen = sizeof(label->vl_vdev_phys.vp_nvlist); + *config = NULL; for (l = 0; l < VDEV_LABELS; l++) { - nvlist_t *config = NULL; offset = vdev_label_offset(psize, l, 0); if ((offset % pp->sectorsize) != 0) @@ -261,27 +263,149 @@ vdev_geom_read_guid(struct g_consumer *cp) continue; buf = label->vl_vdev_phys.vp_nvlist; - if (nvlist_unpack(buf, buflen, &config, 0) != 0) + if (nvlist_unpack(buf, buflen, config, 0) != 0) continue; - guid = nvlist_get_guid(config); - nvlist_free(config); - if (guid != 0) - break; + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state >= POOL_STATE_DESTROYED) { + nvlist_free(*config); + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(*config); + *config = NULL; + continue; + } + + break; } kmem_free(label, size); - if (guid != 0) - ZFS_LOG(1, "guid for %s is %ju", pp->name, (uintmax_t)guid); - return (guid); + return (*config == NULL ? ENOENT : 0); +} + +static int +vdev_geom_check_config(nvlist_t *config, const char *name, uint64_t *best_txg) +{ + uint64_t vdev_guid; + uint64_t txg; + char *pname; + + if (nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || + strcmp(pname, name) != 0) + return (ENOENT); + + ZFS_LOG(1, "found pool: %s", pname); + + txg = 0; + nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg); + if (txg <= *best_txg) + return (ENOENT); + *best_txg = txg; + ZFS_LOG(1, "txg: %ju", (uintmax_t)*best_txg); + + return (0); +} + +static int +vdev_geom_attach_taster(struct g_consumer *cp, struct g_provider *pp) +{ + int error; + + if (pp->flags & G_PF_WITHER) + return (EINVAL); + if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) + return (EINVAL); + g_attach(cp, pp); + error = g_access(cp, 1, 0, 0); + if (error != 0) + g_detach(cp); + return (error); } static void -vdev_geom_taste_orphan(struct g_consumer *cp) +vdev_geom_dettach_taster(struct g_consumer *cp) { + g_access(cp, -1, 0, 0); + g_detach(cp); +} - KASSERT(1 == 0, ("%s called while tasting %s.", __func__, - cp->provider->name)); +int +vdev_geom_read_pool_label(const char *name, nvlist_t **config) +{ + struct g_class *mp; + struct g_geom *gp, *zgp; + struct g_provider *pp; + struct g_consumer *zcp; + nvlist_t *vdev_cfg; + uint64_t best_txg; + int error; + + DROP_GIANT(); + g_topology_lock(); + + zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste"); + /* This orphan function should be never called. */ + zgp->orphan = vdev_geom_taste_orphan; + zcp = g_new_consumer(zgp); + + best_txg = 0; + *config = NULL; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + if (pp->flags & G_PF_WITHER) + continue; + if (vdev_geom_attach_taster(zcp, pp) != 0) + continue; + g_topology_unlock(); + error = vdev_geom_read_config(zcp, &vdev_cfg); + g_topology_lock(); + vdev_geom_dettach_taster(zcp); + if (error) + continue; + ZFS_LOG(1, "successfully read vdev config"); + + error = vdev_geom_check_config(vdev_cfg, name, + &best_txg); + if (error != 0) { + nvlist_free(vdev_cfg); + continue; + } + nvlist_free(*config); + *config = vdev_cfg; + } + } + } + + g_destroy_consumer(zcp); + g_destroy_geom(zgp); + g_topology_unlock(); + PICKUP_GIANT(); + return (*config == NULL ? ENOENT : 0); +} + +static uint64_t +vdev_geom_read_guid(struct g_consumer *cp) +{ + nvlist_t *config; + uint64_t guid; + + g_topology_assert_not(); + + guid = 0; + if (vdev_geom_read_config(cp, &config) == 0) { + guid = nvlist_get_guid(config); + nvlist_free(config); + } + return (guid); } static struct g_consumer * @@ -308,18 +432,12 @@ vdev_geom_attach_by_guid(uint64_t guid) if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { - if (pp->flags & G_PF_WITHER) - continue; - g_attach(zcp, pp); - if (g_access(zcp, 1, 0, 0) != 0) { - g_detach(zcp); + if (vdev_geom_attach_taster(zcp, pp) != 0) continue; - } g_topology_unlock(); pguid = vdev_geom_read_guid(zcp); g_topology_lock(); - g_access(zcp, -1, 0, 0); - g_detach(zcp); + vdev_geom_dettach_taster(zcp); if (pguid != guid) continue; cp = vdev_geom_attach(pp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 212faa0..334f444 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -1523,6 +1523,25 @@ out: } #endif /* OPENSOLARIS_MOUNTROOT */ +static int +getpoolname(const char *osname, char *poolname) +{ + char *p; + + p = strchr(osname, '/'); + if (p == NULL) { + if (strlen(osname) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strcpy(poolname, osname); + } else { + if (p - osname >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(poolname, osname, p - osname); + poolname[p - osname] = '\0'; + } + return (0); +} + /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp) @@ -1616,6 +1635,29 @@ zfs_mount(vfs_t *vfsp) goto out; } + /* Initial root mount: try hard to import the requested root pool. */ + if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && + (vfsp->vfs_flag & MNT_UPDATE) == 0) { + char pname[MAXNAMELEN]; + spa_t *spa; + int prefer_cache; + + error = getpoolname(osname, pname); + if (error) + goto out; + + prefer_cache = 1; + TUNABLE_INT_FETCH("vfs.zfs.rootpool.prefer_cached_config", + &prefer_cache); + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(pname); + mutex_exit(&spa_namespace_lock); + if (!prefer_cache || spa == NULL) { + error = spa_import_rootpool(pname); + if (error) + goto out; + } + } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT();