--- cddl/lib/libzpool/Makefile.orig +++ cddl/lib/libzpool/Makefile @@ -26,7 +26,7 @@ LIB= zpool -ZFS_COMMON_SRCS= ${ZFS_COMMON_OBJS:C/.o$/.c/} vdev_file.c +ZFS_COMMON_SRCS= ${ZFS_COMMON_OBJS:C/.o$/.c/} vdev_file.c trim_map.c ZFS_SHARED_SRCS= ${ZFS_SHARED_OBJS:C/.o$/.c/} KERNEL_SRCS= kernel.c taskq.c util.c LIST_SRCS= list.c --- sys/cddl/compat/opensolaris/sys/dkio.h.orig +++ sys/cddl/compat/opensolaris/sys/dkio.h @@ -75,6 +75,8 @@ */ #define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ +#define DKIOCTRIM (DKIOC|35) /* TRIM a block */ + struct dk_callback { void (*dkc_callback)(void *dkc_cookie, int error); void *dkc_cookie; --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -361,7 +361,8 @@ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); - zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); + zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), + pio->io_flags)); } int @@ -1411,7 +1412,7 @@ return (ERESTART); zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, - dmu_tx_get_txg(tx), bp, 0)); + dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -61,6 +61,7 @@ #include #include #include +#include #ifdef _KERNEL #include @@ -880,6 +881,11 @@ spa_create_zio_taskqs(spa); } + /* + * Start TRIM thread. 
+ */ + trim_thread_create(spa); + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), @@ -908,6 +914,12 @@ ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); + /* + * Stop TRIM thread in case spa_unload() wasn't called directly + * before spa_deactivate(). + */ + trim_thread_destroy(spa); + txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); @@ -1024,6 +1036,11 @@ ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* + * Stop TRIM thread. + */ + trim_thread_destroy(spa); + + /* * Stop async tasks. */ spa_async_suspend(spa); @@ -5309,7 +5326,7 @@ zio_t *zio = arg; zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, - zio->io_flags)); + BP_GET_PSIZE(bp), zio->io_flags)); return (0); } --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -217,6 +217,9 @@ spa_proc_state_t spa_proc_state; /* see definition */ struct proc *spa_proc; /* "zpool-poolname" process */ uint64_t spa_did; /* if procp != p0, did of t1 */ + kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */ + kmutex_t spa_trim_lock; /* protects spa_trim_cv */ + kcondvar_t spa_trim_cv; /* used to notify TRIM thread */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ --- /dev/null +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. + * All rights reserved. + */ + +#ifndef _SYS_TRIM_MAP_H +#define _SYS_TRIM_MAP_H + +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/spa.h> + +#ifdef __cplusplus extern "C" { #endif + +extern void trim_map_create(vdev_t *vd); +extern void trim_map_destroy(vdev_t *vd); +extern void trim_map_free(zio_t *zio); +extern boolean_t trim_map_write_start(zio_t *zio); +extern void trim_map_write_done(zio_t *zio); + +extern void trim_thread_create(spa_t *spa); +extern void trim_thread_destroy(spa_t *spa); +extern void trim_thread_wakeup(spa_t *spa); + +#ifdef __cplusplus } #endif + +#endif /* _SYS_TRIM_MAP_H */ --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -45,6 +45,7 @@ } vdev_dtl_type_t; extern boolean_t zfs_nocacheflush; +extern boolean_t zfs_notrim; extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -182,6 +182,7 @@ uint64_t vdev_unspare; /* unspare when resilvering done */ hrtime_t vdev_last_try; /* last reopen time */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + boolean_t vdev_notrim; /* true if trim failed */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ boolean_t vdev_splitting; /* split or repair in progress */ @@ -197,6 +198,7 @@ 
spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ + struct trim_map *vdev_trimmap; /* * For DTrace to work in userland (libzpool) context, these fields must --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -132,7 +133,8 @@ #define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) #define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) #define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11]) -#define ZIO_PRIORITY_TABLE_SIZE 12 +#define ZIO_PRIORITY_TRIM (zio_priority_table[12]) +#define ZIO_PRIORITY_TABLE_SIZE 13 #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 @@ -350,6 +352,39 @@ list_node_t zl_child_node; } zio_link_t; +/* + * Used for TRIM kstat. + */ +typedef struct zio_trim_stats { + /* + * Number of bytes successfully TRIMmed. + */ + kstat_named_t zio_trim_bytes; + + /* + * Number of successful TRIM requests. + */ + kstat_named_t zio_trim_success; + + /* + * Number of TRIM requests that failed because TRIM is not + * supported. + */ + kstat_named_t zio_trim_unsupported; + + /* + * Number of TRIM requests that failed for other reasons. + */ + kstat_named_t zio_trim_failed; +} zio_trim_stats_t; + +extern zio_trim_stats_t zio_trim_stats; + +#define ZIO_TRIM_STAT_INCR(stat, val) \ + atomic_add_64(&zio_trim_stats.stat.value.ui64, (val)); +#define ZIO_TRIM_STAT_BUMP(stat) \ + ZIO_TRIM_STAT_INCR(stat, 1); + struct zio { /* Core information about this I/O */ zbookmark_t io_bookmark; @@ -423,6 +458,8 @@ /* FreeBSD only. 
*/ struct ostask io_task; #endif + avl_node_t io_trim_node; + list_node_t io_trim_link; }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, @@ -453,7 +490,8 @@ zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, enum zio_flag flags); + uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, + int priority, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, @@ -466,12 +504,14 @@ boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, - const blkptr_t *bp, enum zio_flag flags); + const blkptr_t *bp, uint64_t size, enum zio_flag flags); extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t use_slog); extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); +extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, + uint64_t size); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h @@ -60,9 +60,9 @@ ZIO_STAGE_READY = 1 << 15, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RWF-I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RWF-- */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RWF-I */ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */ @@ -143,7 +143,9 @@ #define ZIO_FREE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_FREE_BP_INIT | \ - ZIO_STAGE_DVA_FREE) + ZIO_STAGE_DVA_FREE | \ + ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_ASSESS) #define ZIO_DDT_FREE_PIPELINE \ 
(ZIO_INTERLOCK_STAGES | \ --- /dev/null +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c @@ -0,0 +1,541 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. + * All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/trim_map.h> + +typedef struct trim_map { + list_t tm_head; /* List of segments sorted by txg. */ + avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ + avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ + avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ + list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ + kmutex_t tm_lock; +} trim_map_t; + +typedef struct trim_seg { + avl_node_t ts_node; /* AVL node. */ + list_node_t ts_next; /* List element. */ + uint64_t ts_start; /* Starting offset of this segment. */ + uint64_t ts_end; /* Ending offset (non-inclusive). */ + uint64_t ts_txg; /* Segment creation txg. */ +} trim_seg_t; + +extern boolean_t zfs_notrim; + +SYSCTL_DECL(_vfs_zfs); +/* Delay TRIMs by that many TXGs. 
*/ +static int trim_txg_limit = 64; +TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit); +SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0, + "Delay TRIMs by that many TXGs."); + +static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); + +static int +trim_map_seg_compare(const void *x1, const void *x2) +{ + const trim_seg_t *s1 = x1; + const trim_seg_t *s2 = x2; + + if (s1->ts_start < s2->ts_start) { + if (s1->ts_end > s2->ts_start) + return (0); + return (-1); + } + if (s1->ts_start > s2->ts_start) { + if (s1->ts_start < s2->ts_end) + return (0); + return (1); + } + return (0); +} + +static int +trim_map_zio_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_offset < z2->io_offset) { + if (z1->io_offset + z1->io_size > z2->io_offset) + return (0); + return (-1); + } + if (z1->io_offset > z2->io_offset) { + if (z1->io_offset < z2->io_offset + z2->io_size) + return (0); + return (1); + } + return (0); +} + +void +trim_map_create(vdev_t *vd) +{ + trim_map_t *tm; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (zfs_notrim) + return; + + tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); + mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&tm->tm_head, sizeof (trim_seg_t), + offsetof(trim_seg_t, ts_next)); + list_create(&tm->tm_pending_writes, sizeof (zio_t), + offsetof(zio_t, io_trim_link)); + avl_create(&tm->tm_queued_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, + sizeof (zio_t), offsetof(zio_t, io_trim_node)); + vd->vdev_trimmap = tm; +} + +void +trim_map_destroy(vdev_t *vd) +{ + trim_map_t *tm; + trim_seg_t *ts; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (zfs_notrim) + return; + + tm = vd->vdev_trimmap; + if (tm == NULL) + return; + + /* + * We may 
have been called before trim_map_vdev_commit_done() + * had a chance to run, so do it now to prune the remaining + * inflight frees. + */ + trim_map_vdev_commit_done(vd->vdev_spa, vd); + + mutex_enter(&tm->tm_lock); + while ((ts = list_head(&tm->tm_head)) != NULL) { + avl_remove(&tm->tm_queued_frees, ts); + list_remove(&tm->tm_head, ts); + kmem_free(ts, sizeof (*ts)); + } + mutex_exit(&tm->tm_lock); + + avl_destroy(&tm->tm_queued_frees); + avl_destroy(&tm->tm_inflight_frees); + avl_destroy(&tm->tm_inflight_writes); + list_destroy(&tm->tm_pending_writes); + list_destroy(&tm->tm_head); + mutex_destroy(&tm->tm_lock); + kmem_free(tm, sizeof (*tm)); + vd->vdev_trimmap = NULL; +} + +static void +trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + avl_index_t where; + trim_seg_t tsearch, *ts_before, *ts_after, *ts; + boolean_t merge_before, merge_after; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + VERIFY(start < end); + + tsearch.ts_start = start; + tsearch.ts_end = end; + + ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); + if (ts != NULL) { + if (start < ts->ts_start) + trim_map_segment_add(tm, start, ts->ts_start, txg); + if (end > ts->ts_end) + trim_map_segment_add(tm, ts->ts_end, end, txg); + return; + } + + ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); + ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); + + merge_before = (ts_before != NULL && ts_before->ts_end == start && + ts_before->ts_txg == txg); + merge_after = (ts_after != NULL && ts_after->ts_start == end && + ts_after->ts_txg == txg); + + if (merge_before && merge_after) { + avl_remove(&tm->tm_queued_frees, ts_before); + list_remove(&tm->tm_head, ts_before); + ts_after->ts_start = ts_before->ts_start; + kmem_free(ts_before, sizeof (*ts_before)); + } else if (merge_before) { + ts_before->ts_end = end; + } else if (merge_after) { + ts_after->ts_start = start; + } else { + ts = kmem_alloc(sizeof (*ts), KM_SLEEP); + ts->ts_start = start; + 
ts->ts_end = end; + ts->ts_txg = txg; + avl_insert(&tm->tm_queued_frees, ts, where); + list_insert_tail(&tm->tm_head, ts); + } +} + +static void +trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, + uint64_t end) +{ + trim_seg_t *nts; + boolean_t left_over, right_over; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + left_over = (ts->ts_start < start); + right_over = (ts->ts_end > end); + + if (left_over && right_over) { + nts = kmem_alloc(sizeof (*nts), KM_SLEEP); + nts->ts_start = end; + nts->ts_end = ts->ts_end; + nts->ts_txg = ts->ts_txg; + ts->ts_end = start; + avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); + list_insert_after(&tm->tm_head, ts, nts); + } else if (left_over) { + ts->ts_end = start; + } else if (right_over) { + ts->ts_start = end; + } else { + avl_remove(&tm->tm_queued_frees, ts); + list_remove(&tm->tm_head, ts); + kmem_free(ts, sizeof (*ts)); + } +} + +static void +trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + zio_t zsearch, *zs; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + zsearch.io_offset = start; + zsearch.io_size = end - start; + + zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); + if (zs == NULL) { + trim_map_segment_add(tm, start, end, txg); + return; + } + if (start < zs->io_offset) + trim_map_free_locked(tm, start, zs->io_offset, txg); + if (zs->io_offset + zs->io_size < end) + trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); +} + +void +trim_map_free(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + + if (zfs_notrim || vd->vdev_notrim || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + trim_map_free_locked(tm, zio->io_offset, zio->io_offset + zio->io_size, + vd->vdev_spa->spa_syncing_txg); + mutex_exit(&tm->tm_lock); +} + +boolean_t +trim_map_write_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t tsearch, *ts; + boolean_t left_over, right_over; + uint64_t start, 
end; + + if (zfs_notrim || vd->vdev_notrim || tm == NULL) + return (B_TRUE); + + start = zio->io_offset; + end = start + zio->io_size; + tsearch.ts_start = start; + tsearch.ts_end = end; + + mutex_enter(&tm->tm_lock); + + /* + * Checking for colliding in-flight frees. + */ + ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); + if (ts != NULL) { + list_insert_tail(&tm->tm_pending_writes, zio); + mutex_exit(&tm->tm_lock); + return (B_FALSE); + } + + ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); + if (ts != NULL) { + /* + * Loop until all overlapping segments are removed. + */ + do { + trim_map_segment_remove(tm, ts, start, end); + ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); + } while (ts != NULL); + } + avl_add(&tm->tm_inflight_writes, zio); + + mutex_exit(&tm->tm_lock); + + return (B_TRUE); +} + +void +trim_map_write_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + + /* + * Don't check for vdev_notrim, since the write could have + * started before vdev_notrim was set. + */ + if (zfs_notrim || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + /* + * Don't fail if the write isn't in the tree, since the write + * could have started after vdev_notrim was set. + */ + if (zio->io_trim_node.avl_child[0] || + zio->io_trim_node.avl_child[1] || + AVL_XPARENT(&zio->io_trim_node) || + tm->tm_inflight_writes.avl_root == &zio->io_trim_node) + avl_remove(&tm->tm_inflight_writes, zio); + mutex_exit(&tm->tm_lock); +} + +/* + * Return the oldest segment (the one with the lowest txg) or false if + * the list is empty or the first element's txg is greater than txg given + * as function argument. 
+ */ +static trim_seg_t * +trim_map_first(trim_map_t *tm, uint64_t txg) +{ + trim_seg_t *ts; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + ts = list_head(&tm->tm_head); + if (ts != NULL && ts->ts_txg <= txg) + return (ts); + return (NULL); +} + +static void +trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + uint64_t start, size, txglimit; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + txglimit = MIN(spa->spa_syncing_txg, spa_freeze_txg(spa)) - + trim_txg_limit; + + mutex_enter(&tm->tm_lock); + /* + * Loop until we send all frees up to the txglimit. + */ + while ((ts = trim_map_first(tm, txglimit)) != NULL) { + list_remove(&tm->tm_head, ts); + avl_remove(&tm->tm_queued_frees, ts); + avl_add(&tm->tm_inflight_frees, ts); + zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, + ts->ts_end - ts->ts_start)); + } + mutex_exit(&tm->tm_lock); +} + +static void +trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + list_t pending_writes; + zio_t *zio; + uint64_t start, size; + void *cookie; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + if (!avl_is_empty(&tm->tm_inflight_frees)) { + cookie = NULL; + while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, + &cookie)) != NULL) { + kmem_free(ts, sizeof (*ts)); + } + } + list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, + io_trim_link)); + list_move_tail(&pending_writes, &tm->tm_pending_writes); + mutex_exit(&tm->tm_lock); + + while ((zio = list_remove_head(&pending_writes)) != NULL) { + zio_vdev_io_reissue(zio); + zio_execute(zio); + } + list_destroy(&pending_writes); +} + +static void +trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + int c; + + if (vd == NULL || spa->spa_syncing_txg <= trim_txg_limit) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit(spa, zio, vd); + } else { + for (c = 0; c < 
vd->vdev_children; c++) + trim_map_commit(spa, zio, vd->vdev_child[c]); + } +} + +static void +trim_map_commit_done(spa_t *spa, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit_done(spa, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit_done(spa, vd->vdev_child[c]); + } +} + +static void +trim_thread(void *arg) +{ + spa_t *spa = arg; + zio_t *zio; + + for (;;) { + mutex_enter(&spa->spa_trim_lock); + if (spa->spa_trim_thread == NULL) { + spa->spa_trim_thread = curthread; + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); + thread_exit(); + } + cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); + mutex_exit(&spa->spa_trim_lock); + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + trim_map_commit(spa, zio, spa->spa_root_vdev); + (void) zio_wait(zio); + trim_map_commit_done(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_STATE, FTAG); + } +} + +void +trim_thread_create(spa_t *spa) +{ + + if (zfs_notrim) + return; + + mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); + mutex_enter(&spa->spa_trim_lock); + spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, + TS_RUN, minclsyspri); + mutex_exit(&spa->spa_trim_lock); +} + +void +trim_thread_destroy(spa_t *spa) +{ + + if (zfs_notrim) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + /* Setting spa_trim_thread to NULL tells the thread to stop. */ + spa->spa_trim_thread = NULL; + cv_signal(&spa->spa_trim_cv); + /* The thread will set it back to != NULL on exit. 
*/ + while (spa->spa_trim_thread == NULL) + cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); + spa->spa_trim_thread = NULL; + mutex_exit(&spa->spa_trim_lock); + + cv_destroy(&spa->spa_trim_cv); + mutex_destroy(&spa->spa_trim_lock); +} + +void +trim_thread_wakeup(spa_t *spa) +{ + + if (zfs_notrim) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); +} --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -42,6 +42,7 @@ #include #include #include +#include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); @@ -1195,6 +1196,11 @@ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); + if (vd->vdev_ops->vdev_op_leaf) { + vd->vdev_notrim = B_FALSE; + trim_map_create(vd); + } + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, @@ -1438,6 +1444,9 @@ vdev_cache_purge(vd); + if (vd->vdev_ops->vdev_op_leaf) + trim_map_destroy(vd); + /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -49,14 +49,17 @@ DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); -/* - * Don't send BIO_FLUSH. - */ +SYSCTL_DECL(_vfs_zfs_vdev); +/* Don't send BIO_FLUSH. */ static int vdev_geom_bio_flush_disable = 0; TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable); -SYSCTL_DECL(_vfs_zfs_vdev); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW, &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); +/* Don't send BIO_DELETE. 
*/ +static int vdev_geom_bio_delete_disable = 0; +TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW, + &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); static void vdev_geom_orphan(struct g_consumer *cp) @@ -499,8 +502,8 @@ *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; /* - * Clear the nowritecache bit, so that on a vdev_reopen() we will - * try again. + * Clear the nowritecache settings, so that on a vdev_reopen() + * we will try again. */ vd->vdev_nowritecache = B_FALSE; @@ -546,6 +549,15 @@ */ vd->vdev_nowritecache = B_TRUE; } + if (bp->bio_cmd == BIO_DELETE && bp->bio_error == ENOTSUP) { + /* + * If we get ENOTSUP, we know that no future + * attempts will ever succeed. In this case we + * set a persistent bit so that we don't bother + * with the ioctl in the future. + */ + vd->vdev_notrim = B_TRUE; + } if (zio->io_error == EIO && !vd->vdev_remove_wanted) { /* * If provider's error is set we assume it is being @@ -588,17 +600,21 @@ } switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - if (zfs_nocacheflush || vdev_geom_bio_flush_disable) break; - if (vd->vdev_nowritecache) { zio->io_error = ENOTSUP; break; } - + goto sendreq; + case DKIOCTRIM: + if (vdev_geom_bio_delete_disable) + break; + if (vd->vdev_notrim) { + zio->io_error = ENOTSUP; + break; + } goto sendreq; default: zio->io_error = ENOTSUP; @@ -623,11 +639,21 @@ bp->bio_length = zio->io_size; break; case ZIO_TYPE_IOCTL: - bp->bio_cmd = BIO_FLUSH; - bp->bio_flags |= BIO_ORDERED; - bp->bio_data = NULL; - bp->bio_offset = cp->provider->mediasize; - bp->bio_length = 0; + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + bp->bio_cmd = BIO_FLUSH; + bp->bio_flags |= BIO_ORDERED; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; + break; + case DKIOCTRIM: + bp->bio_cmd = BIO_DELETE; + bp->bio_data = NULL; + bp->bio_offset = zio->io_offset; + 
bp->bio_length = zio->io_size; + break; + } break; } bp->bio_done = vdev_geom_io_intr; --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -141,6 +141,7 @@ #include #include #include +#include #include /* @@ -684,6 +685,16 @@ } /* + * TRIM the whole thing so that we start with a clean slate. + * It's just an optimization, so we don't care if it fails. + * Don't TRIM if removing so that we don't interfere with zpool + * disaster recovery. + */ + if (!zfs_notrim && (reason == VDEV_LABEL_CREATE || + reason == VDEV_LABEL_SPARE || reason == VDEV_LABEL_L2CACHE)) + zio_wait(zio_trim(NULL, spa, vd, 0, vd->vdev_psize)); + + /* * Initialize its label. */ vp = zio_buf_alloc(sizeof (vdev_phys_t)); @@ -1212,5 +1223,10 @@ * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. */ - return (vdev_label_sync_list(spa, 1, txg, flags)); + if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) + return (error); + + trim_thread_wakeup(spa); + + return (0); } --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -293,10 +293,11 @@ c = vdev_mirror_child_select(zio); children = (c >= 0); } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FREE); /* - * Writes go to all children. + * Writes and frees go to all children. 
*/ c = 0; children = mm->mm_children; @@ -377,6 +378,8 @@ zio->io_error = vdev_mirror_worst_error(mm); } return; + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; } ASSERT(zio->io_type == ZIO_TYPE_READ); --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -259,7 +259,9 @@ size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + if (rm->rm_col[c].rc_data != NULL) + zio_buf_free(rm->rm_col[c].rc_data, + rm->rm_col[c].rc_size); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, @@ -504,14 +506,20 @@ ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); ASSERT3U(rm->rm_nskip, <=, nparity); - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + if (zio->io_type != ZIO_TYPE_FREE) { + for (c = 0; c < rm->rm_firstdatacol; c++) { + rm->rm_col[c].rc_data = + zio_buf_alloc(rm->rm_col[c].rc_size); + } - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = zio->io_data; - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_data = + (char *)rm->rm_col[c - 1].rc_data + + rm->rm_col[c - 1].rc_size; + } + } /* * If all data stored spans all columns, there's a danger that parity @@ -1535,6 +1543,18 @@ ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + if (zio->io_type == ZIO_TYPE_FREE) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + return (ZIO_PIPELINE_CONTINUE); + } + if (zio->io_type == ZIO_TYPE_WRITE) { vdev_raidz_generate_parity(rm); @@ -1917,6 +1937,8 @@ zio->io_error = 
vdev_raidz_worst_error(rm); return; + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; } ASSERT(zio->io_type == ZIO_TYPE_READ); --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -83,6 +83,10 @@ TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush); SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN, &zfs_nocacheflush, 0, "Disable cache flush"); +boolean_t zfs_notrim = B_FALSE; +TUNABLE_INT("vfs.zfs.trim_disable", &zfs_notrim); +SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_disable, CTLFLAG_RDTUN, &zfs_notrim, 0, + "Disable trim"); static kmem_cache_t *zil_lwb_cache; --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c.orig +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -35,6 +35,7 @@ #include #include #include +#include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); @@ -44,6 +45,18 @@ "Use uma(9) for ZIO allocations"); /* + * See zio.h for more information about these fields. 
+ */ +zio_trim_stats_t zio_trim_stats = { + { "zio_trim_bytes", KSTAT_DATA_UINT64 }, + { "zio_trim_success", KSTAT_DATA_UINT64 }, + { "zio_trim_unsupported", KSTAT_DATA_UINT64 }, + { "zio_trim_failed", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *zio_trim_ksp; + +/* * ========================================================================== * I/O priority table * ========================================================================== @@ -61,6 +74,7 @@ 10, /* ZIO_PRIORITY_RESILVER */ 20, /* ZIO_PRIORITY_SCRUB */ 2, /* ZIO_PRIORITY_DDT_PREFETCH */ + 30, /* ZIO_PRIORITY_TRIM */ }; /* @@ -172,6 +186,16 @@ zfs_mg_alloc_failures = 8; zio_inject_init(); + + zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", + KSTAT_TYPE_NAMED, + sizeof(zio_trim_stats) / sizeof(kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zio_trim_ksp != NULL) { + zio_trim_ksp->ks_data = &zio_trim_stats; + kstat_install(zio_trim_ksp); + } } void @@ -199,6 +223,11 @@ kmem_cache_destroy(zio_cache); zio_inject_fini(); + + if (zio_trim_ksp != NULL) { + kstat_delete(zio_trim_ksp); + zio_trim_ksp = NULL; + } } /* @@ -506,7 +535,7 @@ { zio_t *zio; - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); @@ -687,7 +716,7 @@ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - enum zio_flag flags) + uint64_t size, enum zio_flag flags) { zio_t *zio; @@ -698,7 +727,7 @@ ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); - zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + zio = zio_create(pio, spa, txg, bp, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); @@ -735,15 +764,16 @@ } zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, enum zio_flag flags) 
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, int priority, + enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, + zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, + ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; @@ -752,7 +782,7 @@ for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, priority, flags)); + offset, size, done, private, priority, flags)); } return (zio); @@ -877,11 +907,22 @@ void zio_flush(zio_t *zio, vdev_t *vd) { - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, + zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } +zio_t * +zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) +{ + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size, + NULL, NULL, ZIO_PRIORITY_TRIM, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); +} + void zio_shrink(zio_t *zio, uint64_t size) { @@ -1485,6 +1526,7 @@ zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, + BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), ZIO_GANG_CHILD_FLAGS(pio))); } @@ -1617,7 +1659,7 @@ } } - if (gn == gio->io_gang_tree) + if (gn == gio->io_gang_tree && gio->io_data != NULL) ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) @@ -2305,6 +2347,11 @@ return (vdev_mirror_ops.vdev_op_io_start(zio)); } + if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { + trim_map_free(zio); + return (ZIO_PIPELINE_CONTINUE); + } + /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care @@ -2329,18 +2376,22 @@ if (P2PHASE(zio->io_size, align) != 0) { uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + char *abuf = NULL; + if (zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE) + abuf = zio_buf_alloc(asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { bcopy(zio->io_data, abuf, zio->io_size); bzero(abuf + zio->io_size, asize - zio->io_size); } - zio_push_transform(zio, abuf, asize, asize, zio_subblock); + zio_push_transform(zio, abuf, asize, abuf ? 
asize : 0, + zio_subblock); } ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); + VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- @@ -2380,6 +2431,11 @@ } } + if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE) { + if (!trim_map_write_start(zio)) + return (ZIO_PIPELINE_STOP); + } + return (vd->vdev_ops->vdev_op_io_start(zio)); } @@ -2393,9 +2449,16 @@ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); + + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + zio->io_type == ZIO_TYPE_WRITE) { + trim_map_write_done(zio); + } - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { vdev_queue_io_done(zio); @@ -2471,6 +2534,20 @@ if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); + if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) + switch (zio->io_error) { + case 0: + ZIO_TRIM_STAT_INCR(zio_trim_bytes, zio->io_size); + ZIO_TRIM_STAT_BUMP(zio_trim_success); + break; + case EOPNOTSUPP: + ZIO_TRIM_STAT_BUMP(zio_trim_unsupported); + break; + default: + ZIO_TRIM_STAT_BUMP(zio_trim_failed); + break; + } + /* * If the I/O failed, determine whether we should attempt to retry it. * --- sys/modules/zfs/Makefile.orig +++ sys/modules/zfs/Makefile @@ -68,6 +68,7 @@ ZFS_SRCS= ${ZFS_OBJS:C/.o$/.c/} SRCS+= ${ZFS_SRCS} SRCS+= vdev_geom.c +SRCS+= trim_map.c # Use FreeBSD's namecache. CFLAGS+=-DFREEBSD_NAMECACHE