r240868 | pjd | 2012-09-23 21:40:58 +0200 (nie) | 25 linii Add TRIM support. The code builds a map of regions that were freed. On every write the code consults the map and eventually removes ranges that were freed before, but are now overwritten. Freed blocks are not TRIMed immediately. There is a tunable that defines how many txg we should wait with TRIMming freed blocks (64 by default). There is a low priority thread that TRIMs ranges when the time comes. During TRIM we keep in-flight ranges on a list to detect colliding writes - we have to delay writes that collide with in-flight TRIMs in case something will be reordered and write will reached the disk before the TRIM. We don't have to do the same for in-flight writes, as colliding writes just remove ranges to TRIM. Sponsored by: multiplay.co.uk This work includes some important fixes and some improvements obtained from the zfsonlinux project, including TRIMming entire vdevs on pool create/add/attach and on pool import for spare and cache vdevs. Obtained from: zfsonlinux Submitted by: Etienne Dechamps Index: cddl/lib/libzpool/Makefile =================================================================== --- cddl/lib/libzpool/Makefile (wersja 240867) +++ cddl/lib/libzpool/Makefile (wersja 240868) @@ -26,7 +26,7 @@ LIB= zpool -ZFS_COMMON_SRCS= ${ZFS_COMMON_OBJS:C/.o$/.c/} vdev_file.c +ZFS_COMMON_SRCS= ${ZFS_COMMON_OBJS:C/.o$/.c/} vdev_file.c trim_map.c ZFS_SHARED_SRCS= ${ZFS_SHARED_OBJS:C/.o$/.c/} KERNEL_SRCS= kernel.c taskq.c util.c LIST_SRCS= list.c Index: sys/modules/zfs/Makefile =================================================================== --- sys/modules/zfs/Makefile (wersja 240867) +++ sys/modules/zfs/Makefile (wersja 240868) @@ -72,6 +72,7 @@ ZFS_SRCS= ${ZFS_OBJS:C/.o$/.c/} SRCS+= ${ZFS_SRCS} SRCS+= vdev_geom.c +SRCS+= trim_map.c # Use FreeBSD's namecache. CFLAGS+=-DFREEBSD_NAMECACHE Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (wersja 240868) @@ -83,6 +83,10 @@ TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush); SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN, &zfs_nocacheflush, 0, "Disable cache flush"); +boolean_t zfs_notrim = B_TRUE; +TUNABLE_INT("vfs.zfs.trim_disable", &zfs_notrim); +SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_disable, CTLFLAG_RDTUN, &zfs_notrim, 0, + "Disable trim"); static kmem_cache_t *zil_lwb_cache; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c (wersja 240868) @@ -293,10 +293,11 @@ c = vdev_mirror_child_select(zio); children = (c >= 0); } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FREE); /* - * Writes go to all children. + * Writes and frees go to all children. */ c = 0; children = mm->mm_children; @@ -377,6 +378,8 @@ zio->io_error = vdev_mirror_worst_error(mm); } return; + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; } ASSERT(zio->io_type == ZIO_TYPE_READ); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c (wersja 0) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c (wersja 240868) @@ -0,0 +1,541 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 Pawel Jakub Dawidek . + * All rights reserved. + */ + +#include +#include +#include +#include + +typedef struct trim_map { + list_t tm_head; /* List of segments sorted by txg. */ + avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ + avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ + avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ + list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ + kmutex_t tm_lock; +} trim_map_t; + +typedef struct trim_seg { + avl_node_t ts_node; /* AVL node. */ + list_node_t ts_next; /* List element. */ + uint64_t ts_start; /* Starting offset of this segment. */ + uint64_t ts_end; /* Ending offset (non-inclusive). */ + uint64_t ts_txg; /* Segment creation txg. */ +} trim_seg_t; + +extern boolean_t zfs_notrim; + +SYSCTL_DECL(_vfs_zfs); +/* Delay TRIMs by that many TXGs. */ +static int trim_txg_limit = 64; +TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit); +SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0, + "Delay TRIMs by that many TXGs."); + +static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); + +static int +trim_map_seg_compare(const void *x1, const void *x2) +{ + const trim_seg_t *s1 = x1; + const trim_seg_t *s2 = x2; + + if (s1->ts_start < s2->ts_start) { + if (s1->ts_end > s2->ts_start) + return (0); + return (-1); + } + if (s1->ts_start > s2->ts_start) { + if (s1->ts_start < s2->ts_end) + return (0); + return (1); + } + return (0); +} + +static int +trim_map_zio_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_offset < z2->io_offset) { + if (z1->io_offset + z1->io_size > z2->io_offset) + return (0); + return (-1); + } + if (z1->io_offset > z2->io_offset) { + if (z1->io_offset < z2->io_offset + z2->io_size) + return (0); + return (1); + } + return (0); +} + +void +trim_map_create(vdev_t *vd) +{ + trim_map_t *tm; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (zfs_notrim) + return; + + tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); + mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&tm->tm_head, sizeof (trim_seg_t), + offsetof(trim_seg_t, ts_next)); + list_create(&tm->tm_pending_writes, sizeof (zio_t), + offsetof(zio_t, io_trim_link)); + avl_create(&tm->tm_queued_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, + sizeof (zio_t), offsetof(zio_t, io_trim_node)); + vd->vdev_trimmap = tm; +} + +void +trim_map_destroy(vdev_t *vd) +{ + trim_map_t *tm; + trim_seg_t *ts; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (zfs_notrim) + return; + + tm = vd->vdev_trimmap; + if (tm == NULL) + return; + + /* + * We may have been called before trim_map_vdev_commit_done() + * had a chance to run, so do it now to prune the remaining + * inflight frees. + */ + trim_map_vdev_commit_done(vd->vdev_spa, vd); + + mutex_enter(&tm->tm_lock); + while ((ts = list_head(&tm->tm_head)) != NULL) { + avl_remove(&tm->tm_queued_frees, ts); + list_remove(&tm->tm_head, ts); + kmem_free(ts, sizeof (*ts)); + } + mutex_exit(&tm->tm_lock); + + avl_destroy(&tm->tm_queued_frees); + avl_destroy(&tm->tm_inflight_frees); + avl_destroy(&tm->tm_inflight_writes); + list_destroy(&tm->tm_pending_writes); + list_destroy(&tm->tm_head); + mutex_destroy(&tm->tm_lock); + kmem_free(tm, sizeof (*tm)); + vd->vdev_trimmap = NULL; +} + +static void +trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + avl_index_t where; + trim_seg_t tsearch, *ts_before, *ts_after, *ts; + boolean_t merge_before, merge_after; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + VERIFY(start < end); + + tsearch.ts_start = start; + tsearch.ts_end = end; + + ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); + if (ts != NULL) { + if (start < ts->ts_start) + trim_map_segment_add(tm, start, ts->ts_start, txg); + if (end > ts->ts_end) + trim_map_segment_add(tm, ts->ts_end, end, txg); + return; + } + + ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); + ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); + + merge_before = (ts_before != NULL && ts_before->ts_end == start && + ts_before->ts_txg == txg); + merge_after = (ts_after != NULL && ts_after->ts_start == end && + ts_after->ts_txg == txg); + + if (merge_before && merge_after) { + avl_remove(&tm->tm_queued_frees, ts_before); + list_remove(&tm->tm_head, ts_before); + ts_after->ts_start = ts_before->ts_start; + kmem_free(ts_before, sizeof (*ts_before)); + } else if (merge_before) { + ts_before->ts_end = end; + } else if (merge_after) { + ts_after->ts_start = start; + } else { + ts = kmem_alloc(sizeof (*ts), KM_SLEEP); + ts->ts_start = start; + ts->ts_end = end; + ts->ts_txg = txg; + avl_insert(&tm->tm_queued_frees, ts, where); + list_insert_tail(&tm->tm_head, ts); + } +} + +static void +trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, + uint64_t end) +{ + trim_seg_t *nts; + boolean_t left_over, right_over; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + left_over = (ts->ts_start < start); + right_over = (ts->ts_end > end); + + if (left_over && right_over) { + nts = kmem_alloc(sizeof (*nts), KM_SLEEP); + nts->ts_start = end; + nts->ts_end = ts->ts_end; + nts->ts_txg = ts->ts_txg; + ts->ts_end = start; + avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); + list_insert_after(&tm->tm_head, ts, nts); + } else if (left_over) { + ts->ts_end = start; + } else if (right_over) { + ts->ts_start = end; + } else { + avl_remove(&tm->tm_queued_frees, ts); + list_remove(&tm->tm_head, ts); + kmem_free(ts, sizeof (*ts)); + } +} + +static void +trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + zio_t zsearch, *zs; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + zsearch.io_offset = start; + zsearch.io_size = end - start; + + zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); + if (zs == NULL) { + trim_map_segment_add(tm, start, end, txg); + return; + } + if (start < zs->io_offset) + trim_map_free_locked(tm, start, zs->io_offset, txg); + if (zs->io_offset + zs->io_size < end) + trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); +} + +void +trim_map_free(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + + if (zfs_notrim || vd->vdev_notrim || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + trim_map_free_locked(tm, zio->io_offset, zio->io_offset + zio->io_size, + vd->vdev_spa->spa_syncing_txg); + mutex_exit(&tm->tm_lock); +} + +boolean_t +trim_map_write_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t tsearch, *ts; + boolean_t left_over, right_over; + uint64_t start, end; + + if (zfs_notrim || vd->vdev_notrim || tm == NULL) + return (B_TRUE); + + start = zio->io_offset; + end = start + zio->io_size; + tsearch.ts_start = start; + tsearch.ts_end = end; + + mutex_enter(&tm->tm_lock); + + /* + * Checking for colliding in-flight frees. + */ + ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); + if (ts != NULL) { + list_insert_tail(&tm->tm_pending_writes, zio); + mutex_exit(&tm->tm_lock); + return (B_FALSE); + } + + ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); + if (ts != NULL) { + /* + * Loop until all overlapping segments are removed. + */ + do { + trim_map_segment_remove(tm, ts, start, end); + ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); + } while (ts != NULL); + } + avl_add(&tm->tm_inflight_writes, zio); + + mutex_exit(&tm->tm_lock); + + return (B_TRUE); +} + +void +trim_map_write_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + + /* + * Don't check for vdev_notrim, since the write could have + * started before vdev_notrim was set. + */ + if (zfs_notrim || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + /* + * Don't fail if the write isn't in the tree, since the write + * could have started after vdev_notrim was set. + */ + if (zio->io_trim_node.avl_child[0] || + zio->io_trim_node.avl_child[1] || + AVL_XPARENT(&zio->io_trim_node) || + tm->tm_inflight_writes.avl_root == &zio->io_trim_node) + avl_remove(&tm->tm_inflight_writes, zio); + mutex_exit(&tm->tm_lock); +} + +/* + * Return the oldest segment (the one with the lowest txg) or false if + * the list is empty or the first element's txg is greater than txg given + * as function argument. + */ +static trim_seg_t * +trim_map_first(trim_map_t *tm, uint64_t txg) +{ + trim_seg_t *ts; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + ts = list_head(&tm->tm_head); + if (ts != NULL && ts->ts_txg <= txg) + return (ts); + return (NULL); +} + +static void +trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + uint64_t start, size, txglimit; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + txglimit = MIN(spa->spa_syncing_txg, spa_freeze_txg(spa)) - + trim_txg_limit; + + mutex_enter(&tm->tm_lock); + /* + * Loop until we send all frees up to the txglimit. + */ + while ((ts = trim_map_first(tm, txglimit)) != NULL) { + list_remove(&tm->tm_head, ts); + avl_remove(&tm->tm_queued_frees, ts); + avl_add(&tm->tm_inflight_frees, ts); + zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, + ts->ts_end - ts->ts_start)); + } + mutex_exit(&tm->tm_lock); +} + +static void +trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + list_t pending_writes; + zio_t *zio; + uint64_t start, size; + void *cookie; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + if (!avl_is_empty(&tm->tm_inflight_frees)) { + cookie = NULL; + while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, + &cookie)) != NULL) { + kmem_free(ts, sizeof (*ts)); + } + } + list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, + io_trim_link)); + list_move_tail(&pending_writes, &tm->tm_pending_writes); + mutex_exit(&tm->tm_lock); + + while ((zio = list_remove_head(&pending_writes)) != NULL) { + zio_vdev_io_reissue(zio); + zio_execute(zio); + } + list_destroy(&pending_writes); +} + +static void +trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + int c; + + if (vd == NULL || spa->spa_syncing_txg <= trim_txg_limit) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit(spa, zio, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit(spa, zio, vd->vdev_child[c]); + } +} + +static void +trim_map_commit_done(spa_t *spa, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit_done(spa, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit_done(spa, vd->vdev_child[c]); + } +} + +static void +trim_thread(void *arg) +{ + spa_t *spa = arg; + zio_t *zio; + + for (;;) { + mutex_enter(&spa->spa_trim_lock); + if (spa->spa_trim_thread == NULL) { + spa->spa_trim_thread = curthread; + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); + thread_exit(); + } + cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); + mutex_exit(&spa->spa_trim_lock); + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + trim_map_commit(spa, zio, spa->spa_root_vdev); + (void) zio_wait(zio); + trim_map_commit_done(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_STATE, FTAG); + } +} + +void +trim_thread_create(spa_t *spa) +{ + + if (zfs_notrim) + return; + + mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); + mutex_enter(&spa->spa_trim_lock); + spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, + TS_RUN, minclsyspri); + mutex_exit(&spa->spa_trim_lock); +} + +void +trim_thread_destroy(spa_t *spa) +{ + + if (zfs_notrim) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + /* Setting spa_trim_thread to NULL tells the thread to stop. */ + spa->spa_trim_thread = NULL; + cv_signal(&spa->spa_trim_cv); + /* The thread will set it back to != NULL on exit. */ + while (spa->spa_trim_thread == NULL) + cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); + spa->spa_trim_thread = NULL; + mutex_exit(&spa->spa_trim_lock); + + cv_destroy(&spa->spa_trim_cv); + mutex_destroy(&spa->spa_trim_lock); +} + +void +trim_thread_wakeup(spa_t *spa) +{ + + if (zfs_notrim) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); +} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (wersja 240868) @@ -393,7 +393,8 @@ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); - zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); + zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), + pio->io_flags)); } int @@ -1387,7 +1388,7 @@ } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, - dmu_tx_get_txg(tx), bp, 0)); + dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c (wersja 240868) @@ -259,7 +259,9 @@ size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + if (rm->rm_col[c].rc_data != NULL) + zio_buf_free(rm->rm_col[c].rc_data, + rm->rm_col[c].rc_size); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, @@ -504,14 +506,20 @@ ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); ASSERT3U(rm->rm_nskip, <=, nparity); - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + if (zio->io_type != ZIO_TYPE_FREE) { + for (c = 0; c < rm->rm_firstdatacol; c++) { + rm->rm_col[c].rc_data = + zio_buf_alloc(rm->rm_col[c].rc_size); + } - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = zio->io_data; - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_data = + (char *)rm->rm_col[c - 1].rc_data + + rm->rm_col[c - 1].rc_size; + } + } /* * If all data stored spans all columns, there's a danger that parity @@ -1535,6 +1543,18 @@ ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + if (zio->io_type == ZIO_TYPE_FREE) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + return (ZIO_PIPELINE_CONTINUE); + } + if (zio->io_type == ZIO_TYPE_WRITE) { vdev_raidz_generate_parity(rm); @@ -1917,6 +1937,8 @@ zio->io_error = vdev_raidz_worst_error(rm); return; + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; } ASSERT(zio->io_type == ZIO_TYPE_READ); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (wersja 240868) @@ -42,6 +42,7 @@ #include #include #include +#include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); @@ -1195,6 +1196,11 @@ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); + if (vd->vdev_ops->vdev_op_leaf) { + vd->vdev_notrim = B_FALSE; + trim_map_create(vd); + } + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, @@ -1439,6 +1445,9 @@ vdev_cache_purge(vd); + if (vd->vdev_ops->vdev_op_leaf) + trim_map_destroy(vd); + /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h (wersja 0) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h (wersja 240868) @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 Pawel Jakub Dawidek . + * All rights reserved. + */ + +#ifndef _SYS_TRIM_MAP_H +#define _SYS_TRIM_MAP_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern void trim_map_create(vdev_t *vd); +extern void trim_map_destroy(vdev_t *vd); +extern void trim_map_free(zio_t *zio); +extern boolean_t trim_map_write_start(zio_t *zio); +extern void trim_map_write_done(zio_t *zio); + +extern void trim_thread_create(spa_t *spa); +extern void trim_thread_destroy(spa_t *spa); +extern void trim_thread_wakeup(spa_t *spa); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TRIM_MAP_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h (wersja 240868) @@ -46,6 +46,7 @@ } vdev_dtl_type_t; extern boolean_t zfs_nocacheflush; +extern boolean_t zfs_notrim; extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h (wersja 240868) @@ -182,6 +182,7 @@ uint64_t vdev_unspare; /* unspare when resilvering done */ hrtime_t vdev_last_try; /* last reopen time */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + boolean_t vdev_notrim; /* true if trim failed */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ boolean_t vdev_splitting; /* split or repair in progress */ @@ -197,6 +198,7 @@ spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ + struct trim_map *vdev_trimmap; /* * For DTrace to work in userland (libzpool) context, these fields must Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h (wersja 240868) @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -133,7 +134,8 @@ #define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) #define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) #define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11]) -#define ZIO_PRIORITY_TABLE_SIZE 12 +#define ZIO_PRIORITY_TRIM (zio_priority_table[12]) +#define ZIO_PRIORITY_TABLE_SIZE 13 #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 @@ -360,6 +362,39 @@ list_node_t zl_child_node; } zio_link_t; +/* + * Used for TRIM kstat. + */ +typedef struct zio_trim_stats { + /* + * Number of bytes successfully TRIMmed. + */ + kstat_named_t zio_trim_bytes; + + /* + * Number of successful TRIM requests. + */ + kstat_named_t zio_trim_success; + + /* + * Number of TRIM requests that failed because TRIM is not + * supported. + */ + kstat_named_t zio_trim_unsupported; + + /* + * Number of TRIM requests that failed for other reasons. + */ + kstat_named_t zio_trim_failed; +} zio_trim_stats_t; + +extern zio_trim_stats_t zio_trim_stats; + +#define ZIO_TRIM_STAT_INCR(stat, val) \ + atomic_add_64(&zio_trim_stats.stat.value.ui64, (val)); +#define ZIO_TRIM_STAT_BUMP(stat) \ + ZIO_TRIM_STAT_INCR(stat, 1); + struct zio { /* Core information about this I/O */ zbookmark_t io_bookmark; @@ -433,6 +468,8 @@ /* FreeBSD only. */ struct ostask io_task; #endif + avl_node_t io_trim_node; + list_node_t io_trim_link; }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, @@ -463,8 +500,8 @@ zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *priv, int priority, - enum zio_flag flags); + uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, + int priority, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, @@ -477,12 +514,14 @@ boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, - const blkptr_t *bp, enum zio_flag flags); + const blkptr_t *bp, uint64_t size, enum zio_flag flags); extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t use_slog); extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); +extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, + uint64_t size); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h (wersja 240868) @@ -220,6 +220,9 @@ spa_proc_state_t spa_proc_state; /* see definition */ struct proc *spa_proc; /* "zpool-poolname" process */ uint64_t spa_did; /* if procp != p0, did of t1 */ + kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */ + kmutex_t spa_trim_lock; /* protects spa_trim_cv */ + kcondvar_t spa_trim_cv; /* used to notify TRIM thread */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h (wersja 240868) @@ -60,9 +60,9 @@ ZIO_STAGE_READY = 1 << 15, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--- */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RWF-I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RWF-- */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RWF-I */ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */ @@ -143,7 +143,9 @@ #define ZIO_FREE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_FREE_BP_INIT | \ - ZIO_STAGE_DVA_FREE) + ZIO_STAGE_DVA_FREE | \ + ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_ASSESS) #define ZIO_DDT_FREE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (wersja 240868) @@ -35,6 +35,7 @@ #include #include #include +#include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); @@ -48,6 +49,18 @@ "Exclude metadata buffers from dumps as well"); /* + * See zio.h for more information about these fields. + */ +zio_trim_stats_t zio_trim_stats = { + { "zio_trim_bytes", KSTAT_DATA_UINT64 }, + { "zio_trim_success", KSTAT_DATA_UINT64 }, + { "zio_trim_unsupported", KSTAT_DATA_UINT64 }, + { "zio_trim_failed", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *zio_trim_ksp; + +/* * ========================================================================== * I/O priority table * ========================================================================== @@ -65,6 +78,7 @@ 10, /* ZIO_PRIORITY_RESILVER */ 20, /* ZIO_PRIORITY_SCRUB */ 2, /* ZIO_PRIORITY_DDT_PREFETCH */ + 30, /* ZIO_PRIORITY_TRIM */ }; /* @@ -188,6 +202,16 @@ zfs_mg_alloc_failures = 8; zio_inject_init(); + + zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", + KSTAT_TYPE_NAMED, + sizeof(zio_trim_stats) / sizeof(kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zio_trim_ksp != NULL) { + zio_trim_ksp->ks_data = &zio_trim_stats; + kstat_install(zio_trim_ksp); + } } void @@ -215,6 +239,11 @@ kmem_cache_destroy(zio_cache); zio_inject_fini(); + + if (zio_trim_ksp != NULL) { + kstat_delete(zio_trim_ksp); + zio_trim_ksp = NULL; + } } /* @@ -523,7 +552,7 @@ { zio_t *zio; - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); @@ -704,7 +733,7 @@ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - enum zio_flag flags) + uint64_t size, enum zio_flag flags) { zio_t *zio; @@ -715,7 +744,7 @@ ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); - zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + zio = zio_create(pio, spa, txg, bp, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); @@ -752,15 +781,16 @@ } zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, enum zio_flag flags) +zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, int priority, + enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, + zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, + ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; @@ -769,7 +799,7 @@ for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, priority, flags)); + offset, size, done, private, priority, flags)); } return (zio); @@ -894,11 +924,22 @@ void zio_flush(zio_t *zio, vdev_t *vd) { - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, + zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } +zio_t * +zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) +{ + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size, + NULL, NULL, ZIO_PRIORITY_TRIM, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); +} + void zio_shrink(zio_t *zio, uint64_t size) { @@ -1502,6 +1543,7 @@ zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, + BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), ZIO_GANG_CHILD_FLAGS(pio))); } @@ -1634,7 +1676,7 @@ } } - if (gn == gio->io_gang_tree) + if (gn == gio->io_gang_tree && gio->io_data != NULL) ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) @@ -2322,6 +2364,11 @@ return (vdev_mirror_ops.vdev_op_io_start(zio)); } + if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { + trim_map_free(zio); + return (ZIO_PIPELINE_CONTINUE); + } + /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care @@ -2346,18 +2393,22 @@ if (P2PHASE(zio->io_size, align) != 0) { uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + char *abuf = NULL; + if (zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE) + abuf = zio_buf_alloc(asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { bcopy(zio->io_data, abuf, zio->io_size); bzero(abuf + zio->io_size, asize - zio->io_size); } - zio_push_transform(zio, abuf, asize, asize, zio_subblock); + zio_push_transform(zio, abuf, asize, abuf ? asize : 0, + zio_subblock); } ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); + VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- @@ -2397,6 +2448,11 @@ } } + if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE) { + if (!trim_map_write_start(zio)) + return (ZIO_PIPELINE_STOP); + } + return (vd->vdev_ops->vdev_op_io_start(zio)); } @@ -2410,10 +2466,17 @@ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + zio->io_type == ZIO_TYPE_WRITE) { + trim_map_write_done(zio); + } + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { + vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) @@ -2488,6 +2551,20 @@ if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); + if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) + switch (zio->io_error) { + case 0: + ZIO_TRIM_STAT_INCR(zio_trim_bytes, zio->io_size); + ZIO_TRIM_STAT_BUMP(zio_trim_success); + break; + case EOPNOTSUPP: + ZIO_TRIM_STAT_BUMP(zio_trim_unsupported); + break; + default: + ZIO_TRIM_STAT_BUMP(zio_trim_failed); + break; + } + /* * If the I/O failed, determine whether we should attempt to retry it. * Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (wersja 240868) @@ -49,14 +49,17 @@ DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); -/* - * Don't send BIO_FLUSH. - */ +SYSCTL_DECL(_vfs_zfs_vdev); +/* Don't send BIO_FLUSH. */ static int vdev_geom_bio_flush_disable = 0; TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable); -SYSCTL_DECL(_vfs_zfs_vdev); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW, &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); +/* Don't send BIO_DELETE. */ +static int vdev_geom_bio_delete_disable = 0; +TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW, + &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); static void vdev_geom_orphan(struct g_consumer *cp) @@ -499,8 +502,8 @@ *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; /* - * Clear the nowritecache bit, so that on a vdev_reopen() we will - * try again. + * Clear the nowritecache settings, so that on a vdev_reopen() + * we will try again. */ vd->vdev_nowritecache = B_FALSE; @@ -546,6 +549,15 @@ */ vd->vdev_nowritecache = B_TRUE; } + if (bp->bio_cmd == BIO_DELETE && bp->bio_error == ENOTSUP) { + /* + * If we get ENOTSUP, we know that no future + * attempts will ever succeed. In this case we + * set a persistent bit so that we don't bother + * with the ioctl in the future. + */ + vd->vdev_notrim = B_TRUE; + } if (zio->io_error == EIO && !vd->vdev_remove_wanted) { /* * If provider's error is set we assume it is being @@ -588,18 +600,22 @@ } switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - if (zfs_nocacheflush || vdev_geom_bio_flush_disable) break; - if (vd->vdev_nowritecache) { zio->io_error = ENOTSUP; break; } - goto sendreq; + case DKIOCTRIM: + if (vdev_geom_bio_delete_disable) + break; + if (vd->vdev_notrim) { + zio->io_error = ENOTSUP; + break; + } + goto sendreq; default: zio->io_error = ENOTSUP; } @@ -623,11 +639,21 @@ bp->bio_length = zio->io_size; break; case ZIO_TYPE_IOCTL: - bp->bio_cmd = BIO_FLUSH; - bp->bio_flags |= BIO_ORDERED; - bp->bio_data = NULL; - bp->bio_offset = cp->provider->mediasize; - bp->bio_length = 0; + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + bp->bio_cmd = BIO_FLUSH; + bp->bio_flags |= BIO_ORDERED; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; + break; + case DKIOCTRIM: + bp->bio_cmd = BIO_DELETE; + bp->bio_data = NULL; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; + } break; } bp->bio_done = vdev_geom_io_intr; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (wersja 240868) @@ -62,6 +62,7 @@ #include #include #include +#include #ifdef _KERNEL #include @@ -1001,6 +1002,11 @@ spa_create_zio_taskqs(spa); } + /* + * Start TRIM thread. + */ + trim_thread_create(spa); + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), @@ -1029,6 +1035,12 @@ ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); + /* + * Stop TRIM thread in case spa_unload() wasn't called directly + * before spa_deactivate(). + */ + trim_thread_destroy(spa); + txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); @@ -1145,6 +1157,11 @@ ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* + * Stop TRIM thread. + */ + trim_thread_destroy(spa); + + /* * Stop async tasks. */ spa_async_suspend(spa); @@ -5677,7 +5694,7 @@ zio_t *zio = arg; zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, - zio->io_flags)); + BP_GET_PSIZE(bp), zio->io_flags)); return (0); } Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c (wersja 240867) +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c (wersja 240868) @@ -145,6 +145,7 @@ #include #include #include +#include #include /* @@ -718,6 +719,16 @@ } /* + * TRIM the whole thing so that we start with a clean slate. + * It's just an optimization, so we don't care if it fails. + * Don't TRIM if removing so that we don't interfere with zpool + * disaster recovery. + */ + if (!zfs_notrim && (reason == VDEV_LABEL_CREATE || + reason == VDEV_LABEL_SPARE || reason == VDEV_LABEL_L2CACHE)) + zio_wait(zio_trim(NULL, spa, vd, 0, vd->vdev_psize)); + + /* * Initialize its label. */ vp = zio_buf_alloc(sizeof (vdev_phys_t)); @@ -1282,5 +1293,10 @@ * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. */ - return (vdev_label_sync_list(spa, 1, txg, flags)); + if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) + return (error); + + trim_thread_wakeup(spa); + + return (0); } Index: sys/cddl/compat/opensolaris/sys/dkio.h =================================================================== --- sys/cddl/compat/opensolaris/sys/dkio.h (wersja 240867) +++ sys/cddl/compat/opensolaris/sys/dkio.h (wersja 240868) @@ -75,6 +75,8 @@ */ #define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ +#define DKIOCTRIM (DKIOC|35) /* TRIM a block */ + struct dk_callback { void (*dkc_callback)(void *dkc_cookie, int error); void *dkc_cookie;