FreeBSD ZFS
The Zettabyte File System

trim_map.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
 * All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/trim_map.h>

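/*
 * Per-vdev trim map.  Freed regions are collected here, aged for
 * trim_txg_limit transaction groups, and then issued to the device as
 * TRIM requests by the per-pool trim thread.  Queued frees are kept
 * both in a txg-ordered list (tm_head) and in an offset-sorted AVL
 * tree (tm_queued_frees) so they can be aged and overlap-checked
 * efficiently.
 */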
typedef struct trim_map {
        list_t          tm_head;                /* List of segments sorted by txg. */
        avl_tree_t      tm_queued_frees;        /* AVL tree of segments waiting for TRIM. */
        avl_tree_t      tm_inflight_frees;      /* AVL tree of in-flight TRIMs. */
        avl_tree_t      tm_inflight_writes;     /* AVL tree of in-flight writes. */
        list_t          tm_pending_writes;      /* Writes blocked on in-flight frees. */
        kmutex_t        tm_lock;
} trim_map_t;

typedef struct trim_seg {
        avl_node_t      ts_node;        /* AVL node. */
        list_node_t     ts_next;        /* List element. */
        uint64_t        ts_start;       /* Starting offset of this segment. */
        uint64_t        ts_end;         /* Ending offset (non-inclusive). */
        uint64_t        ts_txg;         /* Segment creation txg. */
} trim_seg_t;

extern boolean_t zfs_notrim;

SYSCTL_DECL(_vfs_zfs);
/* Delay TRIMs by that many TXGs. */
static int trim_txg_limit = 64;
TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit);
SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0,
    "Delay TRIMs by that many TXGs.");

static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);

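/*
 * AVL comparator for trim segments, ordered by offset.  Overlapping
 * segments compare as equal, so avl_find() returns a segment that
 * intersects the range being looked up.
 */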
static int
trim_map_seg_compare(const void *x1, const void *x2)
{
        const trim_seg_t *s1 = x1;
        const trim_seg_t *s2 = x2;

        if (s1->ts_start < s2->ts_start) {
                if (s1->ts_end > s2->ts_start)
                        return (0);
                return (-1);
        }
        if (s1->ts_start > s2->ts_start) {
                if (s1->ts_start < s2->ts_end)
                        return (0);
                return (1);
        }
        return (0);
}

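/*
 * AVL comparator for in-flight write ZIOs, ordered by offset.  As with
 * trim segments, overlapping I/O ranges compare as equal.
 */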
static int
trim_map_zio_compare(const void *x1, const void *x2)
{
        const zio_t *z1 = x1;
        const zio_t *z2 = x2;

        if (z1->io_offset < z2->io_offset) {
                if (z1->io_offset + z1->io_size > z2->io_offset)
                        return (0);
                return (-1);
        }
        if (z1->io_offset > z2->io_offset) {
                if (z1->io_offset < z2->io_offset + z2->io_size)
                        return (0);
                return (1);
        }
        return (0);
}

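/*
 * Allocate and initialize the trim map for a leaf vdev.  A no-op when
 * TRIM support is disabled globally via zfs_notrim.
 */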
void
trim_map_create(vdev_t *vd)
{
        trim_map_t *tm;

        ASSERT(vd->vdev_ops->vdev_op_leaf);

        if (zfs_notrim)
                return;

        tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
        mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&tm->tm_head, sizeof (trim_seg_t),
            offsetof(trim_seg_t, ts_next));
        list_create(&tm->tm_pending_writes, sizeof (zio_t),
            offsetof(zio_t, io_trim_link));
        avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
            sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
        avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
            sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
        avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
            sizeof (zio_t), offsetof(zio_t, io_trim_node));
        vd->vdev_trimmap = tm;
}

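/*
 * Tear down a leaf vdev's trim map: prune the in-flight frees, restart
 * any blocked writes, then discard all queued segments and release the
 * map's resources.
 */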
void
trim_map_destroy(vdev_t *vd)
{
        trim_map_t *tm;
        trim_seg_t *ts;

        ASSERT(vd->vdev_ops->vdev_op_leaf);

        if (zfs_notrim)
                return;

        tm = vd->vdev_trimmap;
        if (tm == NULL)
                return;

        /*
         * We may have been called before trim_map_vdev_commit_done()
         * had a chance to run, so do it now to prune the remaining
         * inflight frees.
         */
        trim_map_vdev_commit_done(vd->vdev_spa, vd);

        mutex_enter(&tm->tm_lock);
        while ((ts = list_head(&tm->tm_head)) != NULL) {
                avl_remove(&tm->tm_queued_frees, ts);
                list_remove(&tm->tm_head, ts);
                kmem_free(ts, sizeof (*ts));
        }
        mutex_exit(&tm->tm_lock);

        avl_destroy(&tm->tm_queued_frees);
        avl_destroy(&tm->tm_inflight_frees);
        avl_destroy(&tm->tm_inflight_writes);
        list_destroy(&tm->tm_pending_writes);
        list_destroy(&tm->tm_head);
        mutex_destroy(&tm->tm_lock);
        kmem_free(tm, sizeof (*tm));
        vd->vdev_trimmap = NULL;
}

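/*
 * Add the range [start, end) to the queued-frees tree.  If it overlaps
 * an existing segment, only the non-overlapping pieces are added
 * (recursively); adjacent segments created in the same txg are merged
 * into one.
 */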
static void
trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
{
        avl_index_t where;
        trim_seg_t tsearch, *ts_before, *ts_after, *ts;
        boolean_t merge_before, merge_after;

        ASSERT(MUTEX_HELD(&tm->tm_lock));
        VERIFY(start < end);

        tsearch.ts_start = start;
        tsearch.ts_end = end;

        ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
        if (ts != NULL) {
                if (start < ts->ts_start)
                        trim_map_segment_add(tm, start, ts->ts_start, txg);
                if (end > ts->ts_end)
                        trim_map_segment_add(tm, ts->ts_end, end, txg);
                return;
        }

        ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
        ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);

        merge_before = (ts_before != NULL && ts_before->ts_end == start &&
            ts_before->ts_txg == txg);
        merge_after = (ts_after != NULL && ts_after->ts_start == end &&
            ts_after->ts_txg == txg);

        if (merge_before && merge_after) {
                avl_remove(&tm->tm_queued_frees, ts_before);
                list_remove(&tm->tm_head, ts_before);
                ts_after->ts_start = ts_before->ts_start;
                kmem_free(ts_before, sizeof (*ts_before));
        } else if (merge_before) {
                ts_before->ts_end = end;
        } else if (merge_after) {
                ts_after->ts_start = start;
        } else {
                ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
                ts->ts_start = start;
                ts->ts_end = end;
                ts->ts_txg = txg;
                avl_insert(&tm->tm_queued_frees, ts, where);
                list_insert_tail(&tm->tm_head, ts);
        }
}

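/*
 * Remove the range [start, end) from segment 'ts'.  Depending on how
 * the range overlaps the segment, the segment is trimmed on one side,
 * split in two, or removed entirely.
 */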
static void
trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
    uint64_t end)
{
        trim_seg_t *nts;
        boolean_t left_over, right_over;

        ASSERT(MUTEX_HELD(&tm->tm_lock));

        left_over = (ts->ts_start < start);
        right_over = (ts->ts_end > end);

        if (left_over && right_over) {
                nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
                nts->ts_start = end;
                nts->ts_end = ts->ts_end;
                nts->ts_txg = ts->ts_txg;
                ts->ts_end = start;
                avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
                list_insert_after(&tm->tm_head, ts, nts);
        } else if (left_over) {
                ts->ts_end = start;
        } else if (right_over) {
                ts->ts_start = end;
        } else {
                avl_remove(&tm->tm_queued_frees, ts);
                list_remove(&tm->tm_head, ts);
                kmem_free(ts, sizeof (*ts));
        }
}

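/*
 * Record the freed range [start, end), skipping any portion that
 * overlaps a write currently in flight.
 */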
static void
trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
{
        zio_t zsearch, *zs;

        ASSERT(MUTEX_HELD(&tm->tm_lock));

        zsearch.io_offset = start;
        zsearch.io_size = end - start;

        zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
        if (zs == NULL) {
                trim_map_segment_add(tm, start, end, txg);
                return;
        }
        if (start < zs->io_offset)
                trim_map_free_locked(tm, start, zs->io_offset, txg);
        if (zs->io_offset + zs->io_size < end)
                trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
}

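/*
 * Record a free ZIO's range in the vdev's trim map, tagged with the
 * currently syncing txg.
 */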
void
trim_map_free(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        trim_map_t *tm = vd->vdev_trimmap;

        if (zfs_notrim || vd->vdev_notrim || tm == NULL)
                return;

        mutex_enter(&tm->tm_lock);
        trim_map_free_locked(tm, zio->io_offset, zio->io_offset + zio->io_size,
            vd->vdev_spa->spa_syncing_txg);
        mutex_exit(&tm->tm_lock);
}

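/*
 * Called before a write is issued to the vdev.  Returns B_FALSE and
 * queues the ZIO on tm_pending_writes if it collides with an in-flight
 * TRIM; otherwise the write's range is carved out of any overlapping
 * queued frees, the write is added to tm_inflight_writes, and B_TRUE
 * is returned so the write may proceed.
 */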
boolean_t
trim_map_write_start(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        trim_map_t *tm = vd->vdev_trimmap;
        trim_seg_t tsearch, *ts;
        boolean_t left_over, right_over;
        uint64_t start, end;

        if (zfs_notrim || vd->vdev_notrim || tm == NULL)
                return (B_TRUE);

        start = zio->io_offset;
        end = start + zio->io_size;
        tsearch.ts_start = start;
        tsearch.ts_end = end;

        mutex_enter(&tm->tm_lock);

        /*
         * Check for colliding in-flight frees.
         */
        ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
        if (ts != NULL) {
                list_insert_tail(&tm->tm_pending_writes, zio);
                mutex_exit(&tm->tm_lock);
                return (B_FALSE);
        }

        ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
        if (ts != NULL) {
                /*
                 * Loop until all overlapping segments are removed.
                 */
                do {
                        trim_map_segment_remove(tm, ts, start, end);
                        ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
                } while (ts != NULL);
        }
        avl_add(&tm->tm_inflight_writes, zio);

        mutex_exit(&tm->tm_lock);

        return (B_TRUE);
}

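/*
 * Called when a write completes; removes it from the in-flight writes
 * tree if it was ever added there.
 */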
void
trim_map_write_done(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        trim_map_t *tm = vd->vdev_trimmap;

        /*
         * Don't check for vdev_notrim, since the write could have
         * started before vdev_notrim was set.
         */
        if (zfs_notrim || tm == NULL)
                return;

        mutex_enter(&tm->tm_lock);
        /*
         * Don't fail if the write isn't in the tree, since the write
         * could have started after vdev_notrim was set.
         */
        if (zio->io_trim_node.avl_child[0] ||
            zio->io_trim_node.avl_child[1] ||
            AVL_XPARENT(&zio->io_trim_node) ||
            tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
                avl_remove(&tm->tm_inflight_writes, zio);
        mutex_exit(&tm->tm_lock);
}

/*
 * Return the oldest segment (the one with the lowest txg), or NULL if
 * the list is empty or the first segment's txg is greater than the txg
 * passed as an argument.
 */
static trim_seg_t *
trim_map_first(trim_map_t *tm, uint64_t txg)
{
        trim_seg_t *ts;

        ASSERT(MUTEX_HELD(&tm->tm_lock));

        ts = list_head(&tm->tm_head);
        if (ts != NULL && ts->ts_txg <= txg)
                return (ts);
        return (NULL);
}

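/*
 * Issue TRIM ZIOs for every queued segment that is at least
 * trim_txg_limit txgs old, moving each segment to the in-flight tree
 * until the TRIMs complete.
 */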
static void
trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
        trim_map_t *tm = vd->vdev_trimmap;
        trim_seg_t *ts;
        uint64_t start, size, txglimit;

        ASSERT(vd->vdev_ops->vdev_op_leaf);

        if (tm == NULL)
                return;

        txglimit = MIN(spa->spa_syncing_txg, spa_freeze_txg(spa)) -
            trim_txg_limit;

        mutex_enter(&tm->tm_lock);
        /*
         * Loop until we send all frees up to the txglimit.
         */
        while ((ts = trim_map_first(tm, txglimit)) != NULL) {
                list_remove(&tm->tm_head, ts);
                avl_remove(&tm->tm_queued_frees, ts);
                avl_add(&tm->tm_inflight_frees, ts);
                zio_nowait(zio_trim(zio, spa, vd, ts->ts_start,
                    ts->ts_end - ts->ts_start));
        }
        mutex_exit(&tm->tm_lock);
}

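/*
 * Called after the TRIM ZIOs issued by trim_map_vdev_commit() have
 * completed: free the in-flight segments and restart any writes that
 * were blocked on them.
 */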
static void
trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
{
        trim_map_t *tm = vd->vdev_trimmap;
        trim_seg_t *ts;
        list_t pending_writes;
        zio_t *zio;
        uint64_t start, size;
        void *cookie;

        ASSERT(vd->vdev_ops->vdev_op_leaf);

        if (tm == NULL)
                return;

        mutex_enter(&tm->tm_lock);
        if (!avl_is_empty(&tm->tm_inflight_frees)) {
                cookie = NULL;
                while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
                    &cookie)) != NULL) {
                        kmem_free(ts, sizeof (*ts));
                }
        }
        list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
            io_trim_link));
        list_move_tail(&pending_writes, &tm->tm_pending_writes);
        mutex_exit(&tm->tm_lock);

        while ((zio = list_remove_head(&pending_writes)) != NULL) {
                zio_vdev_io_reissue(zio);
                zio_execute(zio);
        }
        list_destroy(&pending_writes);
}

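/*
 * Recursively walk the vdev tree and issue queued TRIMs on every leaf
 * vdev, once the pool has synced past trim_txg_limit txgs.
 */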
static void
trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
        int c;

        if (vd == NULL || spa->spa_syncing_txg <= trim_txg_limit)
                return;

        if (vd->vdev_ops->vdev_op_leaf) {
                trim_map_vdev_commit(spa, zio, vd);
        } else {
                for (c = 0; c < vd->vdev_children; c++)
                        trim_map_commit(spa, zio, vd->vdev_child[c]);
        }
}

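/*
 * Recursively walk the vdev tree and finish the TRIM pass on every
 * leaf vdev.
 */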
static void
trim_map_commit_done(spa_t *spa, vdev_t *vd)
{
        int c;

        if (vd == NULL)
                return;

        if (vd->vdev_ops->vdev_op_leaf) {
                trim_map_vdev_commit_done(spa, vd);
        } else {
                for (c = 0; c < vd->vdev_children; c++)
                        trim_map_commit_done(spa, vd->vdev_child[c]);
        }
}

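/*
 * Per-pool thread that runs a TRIM pass over the whole vdev tree each
 * time trim_thread_wakeup() signals it, and exits when
 * trim_thread_destroy() clears spa_trim_thread.
 */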
static void
trim_thread(void *arg)
{
        spa_t *spa = arg;
        zio_t *zio;

        for (;;) {
                mutex_enter(&spa->spa_trim_lock);
                if (spa->spa_trim_thread == NULL) {
                        spa->spa_trim_thread = curthread;
                        cv_signal(&spa->spa_trim_cv);
                        mutex_exit(&spa->spa_trim_lock);
                        thread_exit();
                }
                cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
                mutex_exit(&spa->spa_trim_lock);

                zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

                spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
                trim_map_commit(spa, zio, spa->spa_root_vdev);
                (void) zio_wait(zio);
                trim_map_commit_done(spa, spa->spa_root_vdev);
                spa_config_exit(spa, SCL_STATE, FTAG);
        }
}

void
trim_thread_create(spa_t *spa)
{

        if (zfs_notrim)
                return;

        mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
        mutex_enter(&spa->spa_trim_lock);
        spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
            TS_RUN, minclsyspri);
        mutex_exit(&spa->spa_trim_lock);
}

void
trim_thread_destroy(spa_t *spa)
{

        if (zfs_notrim)
                return;
        if (spa->spa_trim_thread == NULL)
                return;

        mutex_enter(&spa->spa_trim_lock);
        /* Setting spa_trim_thread to NULL tells the thread to stop. */
        spa->spa_trim_thread = NULL;
        cv_signal(&spa->spa_trim_cv);
        /* The exiting thread sets it back to a non-NULL value. */
        while (spa->spa_trim_thread == NULL)
                cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
        spa->spa_trim_thread = NULL;
        mutex_exit(&spa->spa_trim_lock);

        cv_destroy(&spa->spa_trim_cv);
        mutex_destroy(&spa->spa_trim_lock);
}

void
trim_thread_wakeup(spa_t *spa)
{

        if (zfs_notrim)
                return;
        if (spa->spa_trim_thread == NULL)
                return;

        mutex_enter(&spa->spa_trim_lock);
        cv_signal(&spa->spa_trim_cv);
        mutex_exit(&spa->spa_trim_lock);
}