FreeBSD ZFS
The Zettabyte File System
00001 /* 00002 * CDDL HEADER START 00003 * 00004 * The contents of this file are subject to the terms of the 00005 * Common Development and Distribution License (the "License"). 00006 * You may not use this file except in compliance with the License. 00007 * 00008 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 00009 * or http://www.opensolaris.org/os/licensing. 00010 * See the License for the specific language governing permissions 00011 * and limitations under the License. 00012 * 00013 * When distributing Covered Code, include this CDDL HEADER in each 00014 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 00015 * If applicable, add the following below this CDDL HEADER, with the 00016 * fields enclosed by brackets "[]" replaced with your own identifying 00017 * information: Portions Copyright [yyyy] [name of copyright owner] 00018 * 00019 * CDDL HEADER END 00020 */ 00021 /* 00022 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. 00023 * All rights reserved. 00024 */ 00025 00026 #include <sys/zfs_context.h> 00027 #include <sys/spa_impl.h> 00028 #include <sys/vdev_impl.h> 00029 #include <sys/trim_map.h> 00030 00031 typedef struct trim_map { 00032 list_t tm_head; /* List of segments sorted by txg. */ 00033 avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ 00034 avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ 00035 avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ 00036 list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ 00037 kmutex_t tm_lock; 00038 } trim_map_t; 00039 00040 typedef struct trim_seg { 00041 avl_node_t ts_node; /* AVL node. */ 00042 list_node_t ts_next; /* List element. */ 00043 uint64_t ts_start; /* Starting offset of this segment. */ 00044 uint64_t ts_end; /* Ending offset (non-inclusive). */ 00045 uint64_t ts_txg; /* Segment creation txg. 
*/ 00046 } trim_seg_t; 00047 00048 extern boolean_t zfs_notrim; 00049 00050 SYSCTL_DECL(_vfs_zfs); 00051 /* Delay TRIMs by that many TXGs. */ 00052 static int trim_txg_limit = 64; 00053 TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit); 00054 SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0, 00055 "Delay TRIMs by that many TXGs."); 00056 00057 static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); 00058 00059 static int 00060 trim_map_seg_compare(const void *x1, const void *x2) 00061 { 00062 const trim_seg_t *s1 = x1; 00063 const trim_seg_t *s2 = x2; 00064 00065 if (s1->ts_start < s2->ts_start) { 00066 if (s1->ts_end > s2->ts_start) 00067 return (0); 00068 return (-1); 00069 } 00070 if (s1->ts_start > s2->ts_start) { 00071 if (s1->ts_start < s2->ts_end) 00072 return (0); 00073 return (1); 00074 } 00075 return (0); 00076 } 00077 00078 static int 00079 trim_map_zio_compare(const void *x1, const void *x2) 00080 { 00081 const zio_t *z1 = x1; 00082 const zio_t *z2 = x2; 00083 00084 if (z1->io_offset < z2->io_offset) { 00085 if (z1->io_offset + z1->io_size > z2->io_offset) 00086 return (0); 00087 return (-1); 00088 } 00089 if (z1->io_offset > z2->io_offset) { 00090 if (z1->io_offset < z2->io_offset + z2->io_size) 00091 return (0); 00092 return (1); 00093 } 00094 return (0); 00095 } 00096 00097 void 00098 trim_map_create(vdev_t *vd) 00099 { 00100 trim_map_t *tm; 00101 00102 ASSERT(vd->vdev_ops->vdev_op_leaf); 00103 00104 if (zfs_notrim) 00105 return; 00106 00107 tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); 00108 mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); 00109 list_create(&tm->tm_head, sizeof (trim_seg_t), 00110 offsetof(trim_seg_t, ts_next)); 00111 list_create(&tm->tm_pending_writes, sizeof (zio_t), 00112 offsetof(zio_t, io_trim_link)); 00113 avl_create(&tm->tm_queued_frees, trim_map_seg_compare, 00114 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); 00115 avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, 00116 
sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); 00117 avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, 00118 sizeof (zio_t), offsetof(zio_t, io_trim_node)); 00119 vd->vdev_trimmap = tm; 00120 } 00121 00122 void 00123 trim_map_destroy(vdev_t *vd) 00124 { 00125 trim_map_t *tm; 00126 trim_seg_t *ts; 00127 00128 ASSERT(vd->vdev_ops->vdev_op_leaf); 00129 00130 if (zfs_notrim) 00131 return; 00132 00133 tm = vd->vdev_trimmap; 00134 if (tm == NULL) 00135 return; 00136 00137 /* 00138 * We may have been called before trim_map_vdev_commit_done() 00139 * had a chance to run, so do it now to prune the remaining 00140 * inflight frees. 00141 */ 00142 trim_map_vdev_commit_done(vd->vdev_spa, vd); 00143 00144 mutex_enter(&tm->tm_lock); 00145 while ((ts = list_head(&tm->tm_head)) != NULL) { 00146 avl_remove(&tm->tm_queued_frees, ts); 00147 list_remove(&tm->tm_head, ts); 00148 kmem_free(ts, sizeof (*ts)); 00149 } 00150 mutex_exit(&tm->tm_lock); 00151 00152 avl_destroy(&tm->tm_queued_frees); 00153 avl_destroy(&tm->tm_inflight_frees); 00154 avl_destroy(&tm->tm_inflight_writes); 00155 list_destroy(&tm->tm_pending_writes); 00156 list_destroy(&tm->tm_head); 00157 mutex_destroy(&tm->tm_lock); 00158 kmem_free(tm, sizeof (*tm)); 00159 vd->vdev_trimmap = NULL; 00160 } 00161 00162 static void 00163 trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) 00164 { 00165 avl_index_t where; 00166 trim_seg_t tsearch, *ts_before, *ts_after, *ts; 00167 boolean_t merge_before, merge_after; 00168 00169 ASSERT(MUTEX_HELD(&tm->tm_lock)); 00170 VERIFY(start < end); 00171 00172 tsearch.ts_start = start; 00173 tsearch.ts_end = end; 00174 00175 ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); 00176 if (ts != NULL) { 00177 if (start < ts->ts_start) 00178 trim_map_segment_add(tm, start, ts->ts_start, txg); 00179 if (end > ts->ts_end) 00180 trim_map_segment_add(tm, ts->ts_end, end, txg); 00181 return; 00182 } 00183 00184 ts_before = avl_nearest(&tm->tm_queued_frees, 
where, AVL_BEFORE); 00185 ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); 00186 00187 merge_before = (ts_before != NULL && ts_before->ts_end == start && 00188 ts_before->ts_txg == txg); 00189 merge_after = (ts_after != NULL && ts_after->ts_start == end && 00190 ts_after->ts_txg == txg); 00191 00192 if (merge_before && merge_after) { 00193 avl_remove(&tm->tm_queued_frees, ts_before); 00194 list_remove(&tm->tm_head, ts_before); 00195 ts_after->ts_start = ts_before->ts_start; 00196 kmem_free(ts_before, sizeof (*ts_before)); 00197 } else if (merge_before) { 00198 ts_before->ts_end = end; 00199 } else if (merge_after) { 00200 ts_after->ts_start = start; 00201 } else { 00202 ts = kmem_alloc(sizeof (*ts), KM_SLEEP); 00203 ts->ts_start = start; 00204 ts->ts_end = end; 00205 ts->ts_txg = txg; 00206 avl_insert(&tm->tm_queued_frees, ts, where); 00207 list_insert_tail(&tm->tm_head, ts); 00208 } 00209 } 00210 00211 static void 00212 trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, 00213 uint64_t end) 00214 { 00215 trim_seg_t *nts; 00216 boolean_t left_over, right_over; 00217 00218 ASSERT(MUTEX_HELD(&tm->tm_lock)); 00219 00220 left_over = (ts->ts_start < start); 00221 right_over = (ts->ts_end > end); 00222 00223 if (left_over && right_over) { 00224 nts = kmem_alloc(sizeof (*nts), KM_SLEEP); 00225 nts->ts_start = end; 00226 nts->ts_end = ts->ts_end; 00227 nts->ts_txg = ts->ts_txg; 00228 ts->ts_end = start; 00229 avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); 00230 list_insert_after(&tm->tm_head, ts, nts); 00231 } else if (left_over) { 00232 ts->ts_end = start; 00233 } else if (right_over) { 00234 ts->ts_start = end; 00235 } else { 00236 avl_remove(&tm->tm_queued_frees, ts); 00237 list_remove(&tm->tm_head, ts); 00238 kmem_free(ts, sizeof (*ts)); 00239 } 00240 } 00241 00242 static void 00243 trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) 00244 { 00245 zio_t zsearch, *zs; 00246 00247 
ASSERT(MUTEX_HELD(&tm->tm_lock)); 00248 00249 zsearch.io_offset = start; 00250 zsearch.io_size = end - start; 00251 00252 zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); 00253 if (zs == NULL) { 00254 trim_map_segment_add(tm, start, end, txg); 00255 return; 00256 } 00257 if (start < zs->io_offset) 00258 trim_map_free_locked(tm, start, zs->io_offset, txg); 00259 if (zs->io_offset + zs->io_size < end) 00260 trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); 00261 } 00262 00263 void 00264 trim_map_free(zio_t *zio) 00265 { 00266 vdev_t *vd = zio->io_vd; 00267 trim_map_t *tm = vd->vdev_trimmap; 00268 00269 if (zfs_notrim || vd->vdev_notrim || tm == NULL) 00270 return; 00271 00272 mutex_enter(&tm->tm_lock); 00273 trim_map_free_locked(tm, zio->io_offset, zio->io_offset + zio->io_size, 00274 vd->vdev_spa->spa_syncing_txg); 00275 mutex_exit(&tm->tm_lock); 00276 } 00277 00278 boolean_t 00279 trim_map_write_start(zio_t *zio) 00280 { 00281 vdev_t *vd = zio->io_vd; 00282 trim_map_t *tm = vd->vdev_trimmap; 00283 trim_seg_t tsearch, *ts; 00284 boolean_t left_over, right_over; 00285 uint64_t start, end; 00286 00287 if (zfs_notrim || vd->vdev_notrim || tm == NULL) 00288 return (B_TRUE); 00289 00290 start = zio->io_offset; 00291 end = start + zio->io_size; 00292 tsearch.ts_start = start; 00293 tsearch.ts_end = end; 00294 00295 mutex_enter(&tm->tm_lock); 00296 00297 /* 00298 * Checking for colliding in-flight frees. 00299 */ 00300 ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); 00301 if (ts != NULL) { 00302 list_insert_tail(&tm->tm_pending_writes, zio); 00303 mutex_exit(&tm->tm_lock); 00304 return (B_FALSE); 00305 } 00306 00307 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); 00308 if (ts != NULL) { 00309 /* 00310 * Loop until all overlapping segments are removed. 
00311 */ 00312 do { 00313 trim_map_segment_remove(tm, ts, start, end); 00314 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); 00315 } while (ts != NULL); 00316 } 00317 avl_add(&tm->tm_inflight_writes, zio); 00318 00319 mutex_exit(&tm->tm_lock); 00320 00321 return (B_TRUE); 00322 } 00323 00324 void 00325 trim_map_write_done(zio_t *zio) 00326 { 00327 vdev_t *vd = zio->io_vd; 00328 trim_map_t *tm = vd->vdev_trimmap; 00329 00330 /* 00331 * Don't check for vdev_notrim, since the write could have 00332 * started before vdev_notrim was set. 00333 */ 00334 if (zfs_notrim || tm == NULL) 00335 return; 00336 00337 mutex_enter(&tm->tm_lock); 00338 /* 00339 * Don't fail if the write isn't in the tree, since the write 00340 * could have started after vdev_notrim was set. 00341 */ 00342 if (zio->io_trim_node.avl_child[0] || 00343 zio->io_trim_node.avl_child[1] || 00344 AVL_XPARENT(&zio->io_trim_node) || 00345 tm->tm_inflight_writes.avl_root == &zio->io_trim_node) 00346 avl_remove(&tm->tm_inflight_writes, zio); 00347 mutex_exit(&tm->tm_lock); 00348 } 00349 00350 /* 00351 * Return the oldest segment (the one with the lowest txg) or false if 00352 * the list is empty or the first element's txg is greater than txg given 00353 * as function argument. 
00354 */ 00355 static trim_seg_t * 00356 trim_map_first(trim_map_t *tm, uint64_t txg) 00357 { 00358 trim_seg_t *ts; 00359 00360 ASSERT(MUTEX_HELD(&tm->tm_lock)); 00361 00362 ts = list_head(&tm->tm_head); 00363 if (ts != NULL && ts->ts_txg <= txg) 00364 return (ts); 00365 return (NULL); 00366 } 00367 00368 static void 00369 trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) 00370 { 00371 trim_map_t *tm = vd->vdev_trimmap; 00372 trim_seg_t *ts; 00373 uint64_t start, size, txglimit; 00374 00375 ASSERT(vd->vdev_ops->vdev_op_leaf); 00376 00377 if (tm == NULL) 00378 return; 00379 00380 txglimit = MIN(spa->spa_syncing_txg, spa_freeze_txg(spa)) - 00381 trim_txg_limit; 00382 00383 mutex_enter(&tm->tm_lock); 00384 /* 00385 * Loop until we send all frees up to the txglimit. 00386 */ 00387 while ((ts = trim_map_first(tm, txglimit)) != NULL) { 00388 list_remove(&tm->tm_head, ts); 00389 avl_remove(&tm->tm_queued_frees, ts); 00390 avl_add(&tm->tm_inflight_frees, ts); 00391 zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, 00392 ts->ts_end - ts->ts_start)); 00393 } 00394 mutex_exit(&tm->tm_lock); 00395 } 00396 00397 static void 00398 trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) 00399 { 00400 trim_map_t *tm = vd->vdev_trimmap; 00401 trim_seg_t *ts; 00402 list_t pending_writes; 00403 zio_t *zio; 00404 uint64_t start, size; 00405 void *cookie; 00406 00407 ASSERT(vd->vdev_ops->vdev_op_leaf); 00408 00409 if (tm == NULL) 00410 return; 00411 00412 mutex_enter(&tm->tm_lock); 00413 if (!avl_is_empty(&tm->tm_inflight_frees)) { 00414 cookie = NULL; 00415 while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, 00416 &cookie)) != NULL) { 00417 kmem_free(ts, sizeof (*ts)); 00418 } 00419 } 00420 list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, 00421 io_trim_link)); 00422 list_move_tail(&pending_writes, &tm->tm_pending_writes); 00423 mutex_exit(&tm->tm_lock); 00424 00425 while ((zio = list_remove_head(&pending_writes)) != NULL) { 00426 zio_vdev_io_reissue(zio); 00427 
zio_execute(zio); 00428 } 00429 list_destroy(&pending_writes); 00430 } 00431 00432 static void 00433 trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) 00434 { 00435 int c; 00436 00437 if (vd == NULL || spa->spa_syncing_txg <= trim_txg_limit) 00438 return; 00439 00440 if (vd->vdev_ops->vdev_op_leaf) { 00441 trim_map_vdev_commit(spa, zio, vd); 00442 } else { 00443 for (c = 0; c < vd->vdev_children; c++) 00444 trim_map_commit(spa, zio, vd->vdev_child[c]); 00445 } 00446 } 00447 00448 static void 00449 trim_map_commit_done(spa_t *spa, vdev_t *vd) 00450 { 00451 int c; 00452 00453 if (vd == NULL) 00454 return; 00455 00456 if (vd->vdev_ops->vdev_op_leaf) { 00457 trim_map_vdev_commit_done(spa, vd); 00458 } else { 00459 for (c = 0; c < vd->vdev_children; c++) 00460 trim_map_commit_done(spa, vd->vdev_child[c]); 00461 } 00462 } 00463 00464 static void 00465 trim_thread(void *arg) 00466 { 00467 spa_t *spa = arg; 00468 zio_t *zio; 00469 00470 for (;;) { 00471 mutex_enter(&spa->spa_trim_lock); 00472 if (spa->spa_trim_thread == NULL) { 00473 spa->spa_trim_thread = curthread; 00474 cv_signal(&spa->spa_trim_cv); 00475 mutex_exit(&spa->spa_trim_lock); 00476 thread_exit(); 00477 } 00478 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); 00479 mutex_exit(&spa->spa_trim_lock); 00480 00481 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 00482 00483 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 00484 trim_map_commit(spa, zio, spa->spa_root_vdev); 00485 (void) zio_wait(zio); 00486 trim_map_commit_done(spa, spa->spa_root_vdev); 00487 spa_config_exit(spa, SCL_STATE, FTAG); 00488 } 00489 } 00490 00491 void 00492 trim_thread_create(spa_t *spa) 00493 { 00494 00495 if (zfs_notrim) 00496 return; 00497 00498 mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); 00499 cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); 00500 mutex_enter(&spa->spa_trim_lock); 00501 spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, 00502 TS_RUN, minclsyspri); 00503 
mutex_exit(&spa->spa_trim_lock); 00504 } 00505 00506 void 00507 trim_thread_destroy(spa_t *spa) 00508 { 00509 00510 if (zfs_notrim) 00511 return; 00512 if (spa->spa_trim_thread == NULL) 00513 return; 00514 00515 mutex_enter(&spa->spa_trim_lock); 00516 /* Setting spa_trim_thread to NULL tells the thread to stop. */ 00517 spa->spa_trim_thread = NULL; 00518 cv_signal(&spa->spa_trim_cv); 00519 /* The thread will set it back to != NULL on exit. */ 00520 while (spa->spa_trim_thread == NULL) 00521 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); 00522 spa->spa_trim_thread = NULL; 00523 mutex_exit(&spa->spa_trim_lock); 00524 00525 cv_destroy(&spa->spa_trim_cv); 00526 mutex_destroy(&spa->spa_trim_lock); 00527 } 00528 00529 void 00530 trim_thread_wakeup(spa_t *spa) 00531 { 00532 00533 if (zfs_notrim) 00534 return; 00535 if (spa->spa_trim_thread == NULL) 00536 return; 00537 00538 mutex_enter(&spa->spa_trim_lock); 00539 cv_signal(&spa->spa_trim_cv); 00540 mutex_exit(&spa->spa_trim_lock); 00541 }