FreeBSD ZFS
The Zettabyte File System

zfeature.c

Go to the documentation of this file.
00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 
00022 /*
00023  * Copyright (c) 2012 by Delphix. All rights reserved.
00024  */
00025 
00026 #include <sys/zfs_context.h>
00027 #include <sys/zfeature.h>
00028 #include <sys/dmu.h>
00029 #include <sys/nvpair.h>
00030 #include <sys/zap.h>
00031 #include <sys/dmu_tx.h>
00032 #include "zfeature_common.h"
00033 #include <sys/spa_impl.h>
00034 
00035 /*
00036  * ZFS Feature Flags
00037  * -----------------
00038  *
00039  * ZFS feature flags are used to provide fine-grained versioning to the ZFS
00040  * on-disk format. Once enabled on a pool feature flags replace the old
00041  * spa_version() number.
00042  *
00043  * Each new on-disk format change will be given a uniquely identifying string
00044  * guid rather than a version number. This avoids the problem of different
00045  * organizations creating new on-disk formats with the same version number. To
00046  * keep feature guids unique they should consist of the reverse dns name of the
00047  * organization which implemented the feature and a short name for the feature,
00048  * separated by a colon (e.g. com.delphix:async_destroy).
00049  *
00050  * Reference Counts
00051  * ----------------
00052  *
00053  * Within each pool features can be in one of three states: disabled, enabled,
00054  * or active. These states are differentiated by a reference count stored on
00055  * disk for each feature:
00056  *
00057  *   1) If there is no reference count stored on disk the feature is disabled.
00058  *   2) If the reference count is 0 a system administrator has enabled the
00059  *      feature, but the feature has not been used yet, so no on-disk
00060  *      format changes have been made.
00061  *   3) If the reference count is greater than 0 the feature is active.
00062  *      The format changes required by the feature are currently on disk.
00063  *      Note that if the feature's format changes are reversed the feature
00064  *      may choose to set its reference count back to 0.
00065  *
00066  * Feature flags make no differentiation between non-zero reference counts
00067  * for an active feature (e.g. a reference count of 1 means the same thing as a
00068  * reference count of 27834721), but feature implementations may choose to use
00069  * the reference count to store meaningful information. For example, a new RAID
00070  * implementation might set the reference count to the number of vdevs using
00071  * it. If all those disks are removed from the pool the feature goes back to
00072  * having a reference count of 0.
00073  *
00074  * It is the responsibility of the individual features to maintain a non-zero
00075  * reference count as long as the feature's format changes are present on disk.
00076  *
00077  * Dependencies
00078  * ------------
00079  *
00080  * Each feature may depend on other features. The only effect of this
00081  * relationship is that when a feature is enabled all of its dependencies are
00082  * automatically enabled as well. Any future work to support disabling of
00083  * features would need to ensure that features cannot be disabled if other
00084  * enabled features depend on them.
00085  *
00086  * On-disk Format
00087  * --------------
00088  *
00089  * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES
00090  * (5000). In order for this to work the pool is automatically upgraded to
00091  * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk
00092  * format changes will be in use.
00093  *
00094  * Information about features is stored in 3 ZAP objects in the pool's MOS.
00095  * These objects are linked to by the following names in the pool directory
00096  * object:
00097  *
00098  * 1) features_for_read: feature guid -> reference count
00099  *    Features needed to open the pool for reading.
00100  * 2) features_for_write: feature guid -> reference count
00101  *    Features needed to open the pool for writing.
00102  * 3) feature_descriptions: feature guid -> descriptive string
00103  *    A human readable string.
00104  *
00105  * All enabled features appear in either features_for_read or
00106  * features_for_write, but not both.
00107  *
00108  * To open a pool in read-only mode only the features listed in
00109  * features_for_read need to be supported.
00110  *
00111  * To open the pool in read-write mode features in both features_for_read and
00112  * features_for_write need to be supported.
00113  *
00114  * Some features may be required to read the ZAP objects containing feature
00115  * information. To allow software to check for compatibility with these features
00116  * before the pool is opened their names must be stored in the label in a
00117  * new "features_for_read" entry (note that features that are only required
00118  * to write to a pool never need to be stored in the label since the
00119  * features_for_write ZAP object can be read before the pool is written to).
00120  * To save space in the label features must be explicitly marked as needing to
00121  * be written to the label. Also, reference counts are not stored in the label,
00122  * instead any feature whose reference count drops to 0 is removed from the
00123  * label.
00124  *
00125  * Adding New Features
00126  * -------------------
00127  *
00128  * Features must be registered in the zpool_feature_init() function in
00129  * zfeature_common.c using the zfeature_register() function. This function
00130  * has arguments to specify if the feature should be stored in the
00131  * features_for_read or features_for_write ZAP object and if it needs to be
00132  * written to the label when active.
00133  *
00134  * Once a feature is registered it will appear as a "feature@<feature name>"
00135  * property which can be set by an administrator. Feature implementors should
00136  * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
00137  * query the state of a feature and the spa_feature_incr() and
00138  * spa_feature_decr() functions to change an enabled feature's reference count.
00139  * Reference counts may only be updated in the syncing context.
00140  *
00141  * Features may not perform enable-time initialization. Instead, any such
00142  * initialization should occur when the feature is first used. This design
00143  * enforces that on-disk changes be made only when features are used. Code
00144  * should only check if a feature is enabled using spa_feature_is_enabled(),
00145  * not by relying on any feature specific metadata existing. If a feature is
00146  * enabled, but the feature's metadata is not on disk yet then it should be
00147  * created as needed.
00148  *
00149  * As an example, consider the com.delphix:async_destroy feature. This feature
00150  * relies on the existence of a bptree in the MOS that stores blocks for
00151  * asynchronous freeing. This bptree is not created when async_destroy is
00152  * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is
00153  * called to check if async_destroy is enabled. If it is and the bptree object
00154  * does not exist yet, the bptree object is created as part of the dataset
00155  * destroy and async_destroy's reference count is incremented to indicate it
00156  * has made an on-disk format change. Later, after the destroyed dataset's
00157  * blocks have all been asynchronously freed there is no longer any use for the
00158  * bptree object, so it is destroyed and async_destroy's reference count is
00159  * decremented back to 0 to indicate that it has undone its on-disk format
00160  * changes.
00161  */
00162 
/*
 * Operations that feature_do_action() can apply to a feature's on-disk
 * reference count entry.
 */
typedef enum {
	FEATURE_ACTION_ENABLE = 0,	/* create the entry with refcount 0 */
	FEATURE_ACTION_INCR = 1,	/* add one to an existing refcount */
	FEATURE_ACTION_DECR = 2		/* subtract one from an existing refcount */
} feature_action_t;
00168 
00169 /*
00170  * Checks that the features active in the specified object are supported by
00171  * this software.  Adds each unsupported feature (name -> description) to
00172  * the supplied nvlist.
00173  */
00174 boolean_t
00175 feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj,
00176     nvlist_t *unsup_feat, nvlist_t *enabled_feat)
00177 {
00178         boolean_t supported;
00179         zap_cursor_t zc;
00180         zap_attribute_t za;
00181 
00182         supported = B_TRUE;
00183         for (zap_cursor_init(&zc, os, obj);
00184             zap_cursor_retrieve(&zc, &za) == 0;
00185             zap_cursor_advance(&zc)) {
00186                 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
00187                     za.za_num_integers == 1);
00188 
00189                 if (NULL != enabled_feat) {
00190                         fnvlist_add_uint64(enabled_feat, za.za_name,
00191                             za.za_first_integer);
00192                 }
00193 
00194                 if (za.za_first_integer != 0 &&
00195                     !zfeature_is_supported(za.za_name)) {
00196                         supported = B_FALSE;
00197 
00198                         if (NULL != unsup_feat) {
00199                                 char *desc = "";
00200                                 char buf[MAXPATHLEN];
00201 
00202                                 if (zap_lookup(os, desc_obj, za.za_name,
00203                                     1, sizeof (buf), buf) == 0)
00204                                         desc = buf;
00205 
00206                                 VERIFY(nvlist_add_string(unsup_feat, za.za_name,
00207                                     desc) == 0);
00208                         }
00209                 }
00210         }
00211         zap_cursor_fini(&zc);
00212 
00213         return (supported);
00214 }
00215 
00216 static int
00217 feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj,
00218     zfeature_info_t *feature, uint64_t *res)
00219 {
00220         int err;
00221         uint64_t refcount;
00222         uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
00223 
00224         /*
00225          * If the pool is currently being created, the feature objects may not
00226          * have been allocated yet.  Act as though all features are disabled.
00227          */
00228         if (zapobj == 0)
00229                 return (ENOTSUP);
00230 
00231         err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
00232             &refcount);
00233         if (err != 0) {
00234                 if (err == ENOENT)
00235                         return (ENOTSUP);
00236                 else
00237                         return (err);
00238         }
00239         *res = refcount;
00240         return (0);
00241 }
00242 
00243 static int
00244 feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj,
00245     uint64_t desc_obj, zfeature_info_t *feature, feature_action_t action,
00246     dmu_tx_t *tx)
00247 {
00248         int error;
00249         uint64_t refcount;
00250         uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
00251 
00252         ASSERT(0 != zapobj);
00253         ASSERT(zfeature_is_valid_guid(feature->fi_guid));
00254 
00255         error = zap_lookup(os, zapobj, feature->fi_guid,
00256             sizeof (uint64_t), 1, &refcount);
00257 
00258         /*
00259          * If we can't ascertain the status of the specified feature, an I/O
00260          * error occurred.
00261          */
00262         if (error != 0 && error != ENOENT)
00263                 return (error);
00264 
00265         switch (action) {
00266         case FEATURE_ACTION_ENABLE:
00267                 /*
00268                  * If the feature is already enabled, ignore the request.
00269                  */
00270                 if (error == 0)
00271                         return (0);
00272                 refcount = 0;
00273                 break;
00274         case FEATURE_ACTION_INCR:
00275                 if (error == ENOENT)
00276                         return (ENOTSUP);
00277                 if (refcount == UINT64_MAX)
00278                         return (EOVERFLOW);
00279                 refcount++;
00280                 break;
00281         case FEATURE_ACTION_DECR:
00282                 if (error == ENOENT)
00283                         return (ENOTSUP);
00284                 if (refcount == 0)
00285                         return (EOVERFLOW);
00286                 refcount--;
00287                 break;
00288         default:
00289                 ASSERT(0);
00290                 break;
00291         }
00292 
00293         if (action == FEATURE_ACTION_ENABLE) {
00294                 int i;
00295 
00296                 for (i = 0; feature->fi_depends[i] != NULL; i++) {
00297                         zfeature_info_t *dep = feature->fi_depends[i];
00298 
00299                         error = feature_do_action(os, read_obj, write_obj,
00300                             desc_obj, dep, FEATURE_ACTION_ENABLE, tx);
00301                         if (error != 0)
00302                                 return (error);
00303                 }
00304         }
00305 
00306         error = zap_update(os, zapobj, feature->fi_guid,
00307             sizeof (uint64_t), 1, &refcount, tx);
00308         if (error != 0)
00309                 return (error);
00310 
00311         if (action == FEATURE_ACTION_ENABLE) {
00312                 error = zap_update(os, desc_obj,
00313                     feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
00314                     feature->fi_desc, tx);
00315                 if (error != 0)
00316                         return (error);
00317         }
00318 
00319         if (action == FEATURE_ACTION_INCR && refcount == 1 && feature->fi_mos) {
00320                 spa_activate_mos_feature(dmu_objset_spa(os), feature->fi_guid);
00321         }
00322 
00323         if (action == FEATURE_ACTION_DECR && refcount == 0) {
00324                 spa_deactivate_mos_feature(dmu_objset_spa(os),
00325                     feature->fi_guid);
00326         }
00327 
00328         return (0);
00329 }
00330 
00331 void
00332 spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
00333 {
00334         /*
00335          * We create feature flags ZAP objects in two instances: during pool
00336          * creation and during pool upgrade.
00337          */
00338         ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on &&
00339             tx->tx_txg == TXG_INITIAL));
00340 
00341         spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
00342             DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
00343             DMU_POOL_FEATURES_FOR_READ, tx);
00344         spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset,
00345             DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
00346             DMU_POOL_FEATURES_FOR_WRITE, tx);
00347         spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset,
00348             DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
00349             DMU_POOL_FEATURE_DESCRIPTIONS, tx);
00350 }
00351 
00352 /*
00353  * Enable any required dependencies, then enable the requested feature.
00354  */
00355 void
00356 spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
00357 {
00358         ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
00359         VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
00360             spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
00361             spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx));
00362 }
00363 
00364 /*
00365  * If the specified feature has not yet been enabled, this function returns
00366  * ENOTSUP; otherwise, this function increments the feature's refcount (or
00367  * returns EOVERFLOW if the refcount cannot be incremented). This function must
00368  * be called from syncing context.
00369  */
00370 void
00371 spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
00372 {
00373         ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
00374         VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
00375             spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
00376             spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx));
00377 }
00378 
00379 /*
00380  * If the specified feature has not yet been enabled, this function returns
00381  * ENOTSUP; otherwise, this function decrements the feature's refcount (or
00382  * returns EOVERFLOW if the refcount is already 0). This function must
00383  * be called from syncing context.
00384  */
00385 void
00386 spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
00387 {
00388         ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
00389         VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
00390             spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
00391             spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx));
00392 }
00393 
00394 boolean_t
00395 spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature)
00396 {
00397         int err;
00398         uint64_t refcount;
00399 
00400         if (spa_version(spa) < SPA_VERSION_FEATURES)
00401                 return (B_FALSE);
00402 
00403         err = feature_get_refcount(spa->spa_meta_objset,
00404             spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
00405             feature, &refcount);
00406         ASSERT(err == 0 || err == ENOTSUP);
00407         return (err == 0);
00408 }
00409 
00410 boolean_t
00411 spa_feature_is_active(spa_t *spa, zfeature_info_t *feature)
00412 {
00413         int err;
00414         uint64_t refcount;
00415 
00416         if (spa_version(spa) < SPA_VERSION_FEATURES)
00417                 return (B_FALSE);
00418 
00419         err = feature_get_refcount(spa->spa_meta_objset,
00420             spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
00421             feature, &refcount);
00422         ASSERT(err == 0 || err == ENOTSUP);
00423         return (err == 0 && refcount > 0);
00424 }
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines