FreeBSD ZFS
The Zettabyte File System
|
Virtual Device Labels. More...
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/zio.h>
#include <sys/dsl_scan.h>
#include <sys/trim_map.h>
#include <sys/fs/zfs.h>
Go to the source code of this file.
Data Structures | |
struct | ubl_cbdata |
Functions | |
uint64_t | vdev_label_offset (uint64_t psize, int l, uint64_t offset) |
Basic routines to read and write from a vdev label. | |
int | vdev_label_number (uint64_t psize, uint64_t offset) |
Returns the vdev label associated with the passed-in offset. | |
static void | vdev_label_read (zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) |
static void | vdev_label_write (zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) |
nvlist_t * | vdev_config_generate (spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) |
Generate the nvlist representing this vdev's config. | |
void | vdev_top_config_generate (spa_t *spa, nvlist_t *config) |
Generate a view of the top-level vdevs. | |
nvlist_t * | vdev_label_read_config (vdev_t *vd, uint64_t txg) |
Returns the configuration from the label of the given vdev. | |
static boolean_t | vdev_inuse (vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, uint64_t *spare_guid, uint64_t *l2cache_guid) |
Determine if a device is in use. | |
int | vdev_label_init (vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) |
Initialize a vdev label. | |
static int | vdev_uberblock_compare (uberblock_t *ub1, uberblock_t *ub2) |
Consider the following situation: txg is safely synced to disk. | |
static void | vdev_uberblock_load_done (zio_t *zio) |
static void | vdev_uberblock_load_impl (zio_t *zio, vdev_t *vd, int flags, struct ubl_cbdata *cbp) |
void | vdev_uberblock_load (vdev_t *rvd, uberblock_t *ub, nvlist_t **config) |
Reads the 'best' uberblock from disk along with its associated configuration. | |
static void | vdev_uberblock_sync_done (zio_t *zio) |
On success, increment root zio's count of good writes. | |
static void | vdev_uberblock_sync (zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) |
Write the uberblock to all labels of all leaves of the specified vdev. | |
int | vdev_uberblock_sync_list (vdev_t **svd, int svdcount, uberblock_t *ub, int flags) |
Sync the uberblocks to all vdevs in svd[]. | |
static void | vdev_label_sync_done (zio_t *zio) |
On success, increment the count of good writes for our top-level vdev. | |
static void | vdev_label_sync_top_done (zio_t *zio) |
If there weren't enough good writes, indicate failure to the parent. | |
static void | vdev_label_sync_ignore_done (zio_t *zio) |
We ignore errors for log and cache devices, simply free the private data. | |
static void | vdev_label_sync (zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) |
Write all even or odd labels to all leaves of the specified vdev. | |
int | vdev_label_sync_list (spa_t *spa, int l, uint64_t txg, int flags) |
int | vdev_config_sync (vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) |
Sync the uberblock and any changes to the vdev configuration. |
Virtual Device Labels.
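The vdev label serves several distinct purposes:
1. Uniquely identify this device as part of a ZFS pool and confirm its identity within the pool.
2. Verify that all the devices given in a configuration are present within the pool.
3. Determine the uberblock for the pool.
4. In case of an import operation, determine the configuration of the top-level vdev of which it is a part.
5. If an import operation cannot find all the devices in the pool, provide enough information to the administrator to determine which devices are missing.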
It is important to note that while the kernel is responsible for writing the label, it only consumes the information in the first three cases. The latter information is only consumed in userland when determining the configuration to import a pool.
Before describing the contents of the label, it's important to understand how the labels are written and updated with respect to the uberblock.
When the pool configuration is altered, either because it was newly created or a device was added, we want to update all the labels such that we can deal with fatal failure at any point. To this end, each disk has two labels which are updated before and after the uberblock is synced. Assuming we have labels and an uberblock with the following transaction groups:
       L1          UB          L2
    +------+    +------+    +------+
    |      |    |      |    |      |
    | t10  |    | t10  |    | t10  |
    |      |    |      |    |      |
    +------+    +------+    +------+
In this stable state, the labels and the uberblock were all updated within the same transaction group (10). Each label is mirrored and checksummed, so that we can detect when we fail partway through writing the label.
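In order to identify which labels are valid, the labels are written in the following manner:
1. For each vdev, update 'L1' to the new label.
2. Update the uberblock.
3. For each vdev, update 'L2' to the new label.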
Given arbitrary failure, we can determine the correct label to use based on the transaction group. If we fail after updating L1 but before updating the UB, we will notice that L1's transaction group is greater than the uberblock, so L2 must be valid. If we fail after writing the uberblock but before writing L2, we will notice that L2's transaction group is less than L1, and therefore L1 is valid.
Another added complexity is that not every label is updated when the config is synced. If we add a single device, we do not want to have to re-write every label for every device in the pool. This means that both L1 and L2 may be older than the pool uberblock, because the necessary information is stored on another vdev.
The vdev label consists of two distinct parts, and is wrapped within the vdev_label_t structure. The label includes 8k of padding to permit legacy VTOC disk labels; this padding is otherwise ignored.
The first half of the label is a packed nvlist which contains pool wide properties, per-vdev properties, and configuration information. It is described in more detail below.
The latter half of the label consists of a redundant array of uberblocks. These uberblocks are updated whenever a transaction group is committed, or when the configuration is updated. When a pool is loaded, we scan each vdev for the 'best' uberblock.
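The nvlist describing the pool and vdev contains the following elements:
- version: ZFS on-disk version
- name: Pool name
- state: Pool state
- txg: Transaction group in which this label was written
- pool_guid: Unique identifier for this pool
- vdev_tree: An nvlist describing the vdev tree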
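Each leaf device label also contains the following:
- top_guid: Unique ID for the top-level vdev in which this leaf is contained
- guid: Unique ID for the leaf vdev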
The 'vs' configuration follows the format described in 'spa_config.c'.
Definition in file vdev_label.c.
nvlist_t* vdev_config_generate | ( | spa_t * | spa, |
vdev_t * | vd, | ||
boolean_t | getstats, | ||
vdev_config_flag_t | flags | ||
) |
Generate the nvlist representing this vdev's config.
Definition at line 212 of file vdev_label.c.
int vdev_config_sync | ( | vdev_t ** | svd, |
int | svdcount, | ||
uint64_t | txg, | ||
boolean_t | tryhard | ||
) |
Sync the uberblock and any changes to the vdev configuration.
The order of operations is carefully crafted to ensure that if the system panics or loses power at any time, the state on disk is still transactionally consistent. The in-line comments below describe the failure semantics at each stage.
Moreover, vdev_config_sync() is designed to be idempotent: if it fails at any time, you can just call it again, and it will resume its work.
Definition at line 1205 of file vdev_label.c.
static boolean_t vdev_inuse | ( | vdev_t * | vd, |
uint64_t | crtxg, | ||
vdev_labeltype_t | reason, | ||
uint64_t * | spare_guid, | ||
uint64_t * | l2cache_guid | ||
) | [static] |
Determine if a device is in use.
The 'spare_guid' parameter will be filled in with the device guid if this spare is active elsewhere on the system.
Definition at line 515 of file vdev_label.c.
int vdev_label_init | ( | vdev_t * | vd, |
uint64_t | crtxg, | ||
vdev_labeltype_t | reason | ||
) |
Initialize a vdev label.
We check to make sure each leaf device is not in use, and writable. We put down an initial label which we will later overwrite with a complete label. Note that it's important to do this sequentially, not in parallel, so that we catch cases of multiple use of the same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with itself.
Definition at line 633 of file vdev_label.c.
int vdev_label_number | ( | uint64_t | psize, |
uint64_t | offset | ||
) |
Returns the vdev label associated with the passed-in offset.
Definition at line 166 of file vdev_label.c.
uint64_t vdev_label_offset | ( | uint64_t | psize, |
int | l, | ||
uint64_t | offset | ||
) |
Basic routines to read and write from a vdev label.
Used throughout the rest of this file.
Definition at line 153 of file vdev_label.c.
static void vdev_label_read | ( | zio_t * | zio, |
vdev_t * | vd, | ||
int | l, | ||
void * | buf, | ||
uint64_t | offset, | ||
uint64_t | size, | ||
zio_done_func_t * | done, | ||
void * | private, | ||
int | flags | ||
) | [static] |
Definition at line 179 of file vdev_label.c.
nvlist_t* vdev_label_read_config | ( | vdev_t * | vd, |
uint64_t | txg | ||
) |
Returns the configuration from the label of the given vdev.
For vdevs which don't have a txg value stored on their label (i.e. spares/cache) or have not been completely initialized (txg = 0) just return the configuration from the first valid label we find. Otherwise, find the most up-to-date label that does not exceed the specified 'txg' value.
Definition at line 442 of file vdev_label.c.
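static void vdev_label_sync | ( | zio_t * | zio, |
vdev_t * | vd, | ||
int | l, | ||
uint64_t | txg, | ||
int | flags | ||
) | [static] |
Write all even or odd labels to all leaves of the specified vdev.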
Definition at line 1110 of file vdev_label.c.
static void vdev_label_sync_done | ( | zio_t * | zio | ) | [static] |
On success, increment the count of good writes for our top-level vdev.
Definition at line 1075 of file vdev_label.c.
static void vdev_label_sync_ignore_done | ( | zio_t * | zio | ) | [static] |
We ignore errors for log and cache devices, simply free the private data.
Definition at line 1101 of file vdev_label.c.
int vdev_label_sync_list | ( | spa_t * | spa, |
int | l, | ||
uint64_t | txg, | ||
int | flags | ||
) |
Definition at line 1152 of file vdev_label.c.
static void vdev_label_sync_top_done | ( | zio_t * | zio | ) | [static] |
If there weren't enough good writes, indicate failure to the parent.
Definition at line 1087 of file vdev_label.c.
static void vdev_label_write | ( | zio_t * | zio, |
vdev_t * | vd, | ||
int | l, | ||
void * | buf, | ||
uint64_t | offset, | ||
uint64_t | size, | ||
zio_done_func_t * | done, | ||
void * | private, | ||
int | flags | ||
) | [static] |
Definition at line 193 of file vdev_label.c.
void vdev_top_config_generate | ( | spa_t * | spa, |
nvlist_t * | config | ||
) |
Generate a view of the top-level vdevs.
If we currently have holes in the namespace, then generate an array which contains a list of holey vdevs. Additionally, add the number of top-level children that currently exist.
Definition at line 407 of file vdev_label.c.
static int vdev_uberblock_compare | ( | uberblock_t * | ub1, |
uberblock_t * | ub2 | ||
) | [static] |
Consider the following situation: txg is safely synced to disk.
We've written the first uberblock for txg + 1, and then we lose power. When we come back up, we fail to see the uberblock for txg + 1 because, say, it was on a mirrored device and the replica to which we wrote txg + 1 is now offline. If we then make some changes and sync txg + 1, and then the missing replica comes back, then for a few seconds we'll have two conflicting uberblocks on disk with the same txg. The solution is simple: among uberblocks with equal txg, choose the one with the latest timestamp.
Definition at line 883 of file vdev_label.c.
void vdev_uberblock_load | ( | vdev_t * | rvd, |
uberblock_t * | ub, | ||
nvlist_t ** | config | ||
) |
Reads the 'best' uberblock from disk along with its associated configuration.
First, we read the uberblock array of each label of each vdev, keeping track of the uberblock with the highest txg in each array. Then, we read the configuration from the same vdev as the best uberblock.
Definition at line 960 of file vdev_label.c.
static void vdev_uberblock_load_done | ( | zio_t * | zio | ) | [static] |
Definition at line 904 of file vdev_label.c.
static void vdev_uberblock_load_impl | ( | zio_t * | zio, |
vdev_t * | vd, | ||
int | flags, | ||
struct ubl_cbdata * | cbp | ||
) | [static] |
Definition at line 934 of file vdev_label.c.
static void vdev_uberblock_sync | ( | zio_t * | zio, |
uberblock_t * | ub, | ||
vdev_t * | vd, | ||
int | flags | ||
) | [static] |
Write the uberblock to all labels of all leaves of the specified vdev.
Definition at line 1010 of file vdev_label.c.
static void vdev_uberblock_sync_done | ( | zio_t * | zio | ) | [static] |
On success, increment root zio's count of good writes.
We only get credit for writes to known-visible vdevs; see spa_vdev_add().
Definition at line 998 of file vdev_label.c.
int vdev_uberblock_sync_list | ( | vdev_t ** | svd, |
int | svdcount, | ||
uberblock_t * | ub, | ||
int | flags | ||
) |
Sync the uberblocks to all vdevs in svd[].
Definition at line 1043 of file vdev_label.c.