Defines |
#define | IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) |
| An allocating zio is one that either currently has the DVA allocate stage set or will have it later in its lifetime.
|
Functions |
| SYSCTL_DECL (_vfs_zfs) |
| SYSCTL_NODE (_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0,"ZFS ZIO") |
| TUNABLE_INT ("vfs.zfs.zio.use_uma",&zio_use_uma) |
| SYSCTL_INT (_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN,&zio_use_uma, 0,"Use uma(9) for ZIO allocations") |
| TUNABLE_INT ("vfs.zfs.zio.exclude_metadata",&zio_exclude_metadata) |
| SYSCTL_INT (_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN,&zio_exclude_metadata, 0,"Exclude metadata buffers from dumps as well") |
void | zio_init (void) |
void | zio_fini (void) |
void * | zio_buf_alloc (size_t size) |
| Use zio_buf_alloc to allocate ZFS metadata.
|
void * | zio_data_buf_alloc (size_t size) |
| Use zio_data_buf_alloc to allocate data.
|
void | zio_buf_free (void *buf, size_t size) |
void | zio_data_buf_free (void *buf, size_t size) |
static void | zio_push_transform (zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) |
static void | zio_pop_transforms (zio_t *zio) |
static void | zio_subblock (zio_t *zio, void *data, uint64_t size) |
static void | zio_decompress (zio_t *zio, void *data, uint64_t size) |
zio_t * | zio_walk_parents (zio_t *cio) |
zio_t * | zio_walk_children (zio_t *pio) |
zio_t * | zio_unique_parent (zio_t *cio) |
void | zio_add_child (zio_t *pio, zio_t *cio) |
static void | zio_remove_child (zio_t *pio, zio_t *cio, zio_link_t *zl) |
static boolean_t | zio_wait_for_children (zio_t *zio, enum zio_child child, enum zio_wait_type wait) |
static void | zio_notify_parent (zio_t *pio, zio_t *zio, enum zio_wait_type wait) |
static void | zio_inherit_child_errors (zio_t *zio, enum zio_child c) |
static zio_t * | zio_create (zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_type_t type, int priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_t *zb, enum zio_stage stage, enum zio_stage pipeline) |
| Create the various types of I/O (read, write, free, etc.).
|
static void | zio_destroy (zio_t *zio) |
zio_t * | zio_null (zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags) |
zio_t * | zio_root (spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) |
zio_t * | zio_read (zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, int priority, enum zio_flag flags, const zbookmark_t *zb) |
zio_t * | zio_write (zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, enum zio_flag flags, const zbookmark_t *zb) |
zio_t * | zio_rewrite (zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, int priority, enum zio_flag flags, zbookmark_t *zb) |
void | zio_write_override (zio_t *zio, blkptr_t *bp, int copies) |
void | zio_free (spa_t *spa, uint64_t txg, const blkptr_t *bp) |
zio_t * | zio_free_sync (zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, uint64_t size, enum zio_flag flags) |
zio_t * | zio_claim (zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags) |
zio_t * | zio_ioctl (zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int priority, enum zio_flag flags) |
zio_t * | zio_read_phys (zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, int priority, enum zio_flag flags, boolean_t labels) |
zio_t * | zio_write_phys (zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, int priority, enum zio_flag flags, boolean_t labels) |
zio_t * | zio_vdev_child_io (zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, enum zio_flag flags, zio_done_func_t *done, void *private) |
| Create a child I/O to do some work for us.
|
zio_t * | zio_vdev_delegated_io (vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, enum zio_flag flags, zio_done_func_t *done, void *private) |
void | zio_flush (zio_t *zio, vdev_t *vd) |
zio_t * | zio_trim (zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) |
void | zio_shrink (zio_t *zio, uint64_t size) |
static int | zio_read_bp_init (zio_t *zio) |
| Prepare to read and write logical blocks.
|
static int | zio_write_bp_init (zio_t *zio) |
static int | zio_free_bp_init (zio_t *zio) |
static void | zio_taskq_dispatch (zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) |
static boolean_t | zio_taskq_member (zio_t *zio, enum zio_taskq_type q) |
static int | zio_issue_async (zio_t *zio) |
void | zio_interrupt (zio_t *zio) |
void | zio_execute (zio_t *zio) |
| Execute the I/O pipeline until one of the following occurs:
- the I/O completes
- the pipeline stalls waiting for dependent child I/Os
- the I/O issues, so we're waiting for an I/O completion interrupt
- the I/O is delegated by vdev-level caching or aggregation
- the I/O is deferred due to vdev-level queueing
- the I/O is handed off to another thread.
|
int | zio_wait (zio_t *zio) |
| Initiate I/O, either sync or async.
|
void | zio_nowait (zio_t *zio) |
static void | zio_reexecute (zio_t *pio) |
| Reexecute or suspend/resume failed I/O.
|
void | zio_suspend (spa_t *spa, zio_t *zio) |
int | zio_resume (spa_t *spa) |
void | zio_resume_wait (spa_t *spa) |
static zio_t * | zio_read_gang (zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) |
zio_t * | zio_rewrite_gang (zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) |
zio_t * | zio_free_gang (zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) |
zio_t * | zio_claim_gang (zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) |
static void | zio_gang_tree_assemble_done (zio_t *zio) |
static zio_gang_node_t * | zio_gang_node_alloc (zio_gang_node_t **gnpp) |
static void | zio_gang_node_free (zio_gang_node_t **gnpp) |
static void | zio_gang_tree_free (zio_gang_node_t **gnpp) |
static void | zio_gang_tree_assemble (zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) |
static void | zio_gang_tree_issue (zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) |
static int | zio_gang_assemble (zio_t *zio) |
static int | zio_gang_issue (zio_t *zio) |
static void | zio_write_gang_member_ready (zio_t *zio) |
static int | zio_write_gang_block (zio_t *pio) |
static void | zio_ddt_child_read_done (zio_t *zio) |
static int | zio_ddt_read_start (zio_t *zio) |
static int | zio_ddt_read_done (zio_t *zio) |
static boolean_t | zio_ddt_collision (zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) |
static void | zio_ddt_child_write_ready (zio_t *zio) |
static void | zio_ddt_child_write_done (zio_t *zio) |
static void | zio_ddt_ditto_write_done (zio_t *zio) |
static int | zio_ddt_write (zio_t *zio) |
static int | zio_ddt_free (zio_t *zio) |
static int | zio_dva_allocate (zio_t *zio) |
static int | zio_dva_free (zio_t *zio) |
static int | zio_dva_claim (zio_t *zio) |
static void | zio_dva_unallocate (zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) |
| Undo an allocation.
|
int | zio_alloc_zil (spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t use_slog) |
| Try to allocate an intent log block.
|
void | zio_free_zil (spa_t *spa, uint64_t txg, blkptr_t *bp) |
| Free an intent log block.
|
static int | zio_vdev_io_start (zio_t *zio) |
static int | zio_vdev_io_done (zio_t *zio) |
static void | zio_vsd_default_cksum_finish (zio_cksum_report_t *zcr, const void *good_buf) |
| For non-raidz ZIOs, we can just copy aside the bad data read from the disk, and use that to finish the checksum ereport later.
|
void | zio_vsd_default_cksum_report (zio_t *zio, zio_cksum_report_t *zcr, void *ignored) |
static int | zio_vdev_io_assess (zio_t *zio) |
void | zio_vdev_io_reissue (zio_t *zio) |
void | zio_vdev_io_redone (zio_t *zio) |
void | zio_vdev_io_bypass (zio_t *zio) |
static int | zio_checksum_generate (zio_t *zio) |
static int | zio_checksum_verify (zio_t *zio) |
void | zio_checksum_verified (zio_t *zio) |
| Called by RAID-Z to ensure we don't compute the checksum twice.
|
int | zio_worst_error (int e1, int e2) |
| Compare the severity of errors.
|
static int | zio_ready (zio_t *zio) |
static int | zio_done (zio_t *zio) |
boolean_t | zbookmark_is_before (const dnode_phys_t *dnp, const zbookmark_t *zb1, const zbookmark_t *zb2) |
Variables |
int | zio_use_uma = 0 |
static int | zio_exclude_metadata = 0 |
zio_trim_stats_t | zio_trim_stats |
| See zio.h for more information about these fields.
|
static kstat_t * | zio_trim_ksp |
uint8_t | zio_priority_table [ZIO_PRIORITY_TABLE_SIZE] |
| I/O priority table.
|
char * | zio_type_name [ZIO_TYPES] |
| I/O type descriptions.
|
kmem_cache_t * | zio_cache |
kmem_cache_t * | zio_link_cache |
kmem_cache_t * | zio_buf_cache [SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT] |
kmem_cache_t * | zio_data_buf_cache [SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT] |
vmem_t * | zio_alloc_arena |
int | zfs_mg_alloc_failures |
| This value defines the number of allowed allocation failures per vdev.
|
boolean_t | zio_requeue_io_start_cut_in_line = B_TRUE |
int | zio_buf_debug_limit = 0 |
static zio_pipe_stage_t * | zio_pipeline [] |
static zio_gang_issue_func_t * | zio_gang_issue_func [ZIO_TYPES] |
ddt_entry_t * | freedde |
Gang blocks
A gang block is a collection of small blocks that looks to the DMU like one large block. When zio_dva_allocate() cannot find a block of the requested size, due to either severe fragmentation or the pool being nearly full, it calls zio_write_gang_block() to construct the block from smaller fragments.
A gang block consists of a gang header (zio_gbh_phys_t) and up to three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like an indirect block: it's an array of block pointers. It consumes only one sector and hence is allocatable regardless of fragmentation. The gang header's bps point to its gang members, which hold the data.
Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> as the verifier to ensure uniqueness of the SHA256 checksum. Critically, the gang block bp's blk_cksum is the checksum of the data, not the gang header. This ensures that data block signatures (needed for deduplication) are independent of how the block is physically stored.
Gang blocks can be nested: a gang member may itself be a gang block. Thus every gang block is a tree in which the root and all interior nodes are gang headers, and the leaves are normal blocks that contain user data. The root of the gang tree is called the gang leader.
To perform any operation (read, rewrite, free, claim) on a gang block, zio_gang_assemble() first assembles the gang tree (minus data leaves) in the io_gang_tree field of the original logical I/O by recursively reading the gang leader and all gang headers below it. This yields an in-core tree containing the contents of every gang header and the bps for every constituent of the gang block.
With the gang tree now assembled, zio_gang_issue() just walks the gang tree and invokes a callback on each bp. To free a gang block, zio_gang_issue() calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). zio_read_gang() is a wrapper around zio_read() that omits reading gang headers, since we already have those in io_gang_tree. zio_rewrite_gang() performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() of the gang header plus zio_checksum_compute() of the data to update the gang header's blk_cksum as described above.
The two-phase assemble/issue model solves the problem of partial failure -- what if you'd freed part of a gang block but then couldn't read the gang header for another part? Assembling the entire gang tree first ensures that all the necessary gang header I/O has succeeded before starting the actual work of free, claim, or write. Once the gang tree is assembled, free and claim are in-memory operations that cannot fail.
In the event that a gang write fails, zio_dva_unallocate() walks the gang tree to immediately free (i.e. insert back into the space map) everything we've allocated. This ensures that we don't get ENOSPC errors during repeated suspend/resume cycles due to a flaky device.
Gang rewrites only happen during sync-to-convergence. If we can't assemble the gang tree, we won't modify the block, so we can safely defer the free (knowing that the block is still intact). If we *can* assemble the gang tree, then even if some of the rewrites fail, zio_dva_unallocate() will free each constituent bp and we can allocate a new block on the next sync pass.
In all cases, the gang tree allows complete recovery from partial failure.
Definition in file zio.c.