FreeBSD ZFS — The Zettabyte File System
vdev I/O scheduler (vdev_queue.c)
00001 /* 00002 * CDDL HEADER START 00003 * 00004 * The contents of this file are subject to the terms of the 00005 * Common Development and Distribution License (the "License"). 00006 * You may not use this file except in compliance with the License. 00007 * 00008 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 00009 * or http://www.opensolaris.org/os/licensing. 00010 * See the License for the specific language governing permissions 00011 * and limitations under the License. 00012 * 00013 * When distributing Covered Code, include this CDDL HEADER in each 00014 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 00015 * If applicable, add the following below this CDDL HEADER, with the 00016 * fields enclosed by brackets "[]" replaced with your own identifying 00017 * information: Portions Copyright [yyyy] [name of copyright owner] 00018 * 00019 * CDDL HEADER END 00020 */ 00021 /* 00022 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 00023 * Use is subject to license terms. 00024 */ 00025 00026 #include <sys/zfs_context.h> 00027 #include <sys/vdev_impl.h> 00028 #include <sys/zio.h> 00029 #include <sys/avl.h> 00030 00031 /* These tunables are for performance analysis. 
 */

/*
 * NOTE(review): the original per-tunable commentary was lost in
 * extraction; the comments below are restored from the sysctl
 * description strings further down.
 */

/* Maximum number of I/O requests pending on each device. */
int zfs_vdev_max_pending = 10;

/* Initial number of I/O requests pending to each device. */
int zfs_vdev_min_pending = 4;

/* Shift applied to lbolt when computing an I/O request's deadline. */
int zfs_vdev_time_shift = 6;

/* Exponential I/O issue ramp-up rate. */
int zfs_vdev_ramp_rate = 2;

/*
 * I/O requests are aggregated up to zfs_vdev_aggregation_limit bytes;
 * reads (writes) separated by a gap of at most zfs_vdev_read_gap_limit
 * (zfs_vdev_write_gap_limit) bytes may still be aggregated.
 */
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;

/* FreeBSD: expose the tunables above as loader tunables and sysctls. */
SYSCTL_DECL(_vfs_zfs_vdev);
TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RW,
    &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device");
TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RW,
    &zfs_vdev_min_pending, 0,
    "Initial number of I/O requests pending to each device");
TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RW,
    &zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline");
TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RW,
    &zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate");
TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RW,
    &zfs_vdev_aggregation_limit, 0,
    "I/O requests are aggregated up to this size");
TUNABLE_INT("vfs.zfs.vdev.read_gap_limit", &zfs_vdev_read_gap_limit);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RW,
    &zfs_vdev_read_gap_limit, 0,
    "Acceptable gap between two reads being aggregated");
TUNABLE_INT("vfs.zfs.vdev.write_gap_limit", &zfs_vdev_write_gap_limit);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RW,
    &zfs_vdev_write_gap_limit, 0,
    "Acceptable gap between two writes being aggregated");

/*
 * AVL comparator for the deadline tree: order by io_deadline first,
 * then io_offset, finally pointer identity so the order is total
 * (distinct zios never compare equal).
 */
int
vdev_queue_deadline_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_deadline < z2->io_deadline)
		return (-1);
	if (z1->io_deadline > z2->io_deadline)
		return (1);

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}

/*
 * AVL comparator for the offset-sorted trees: order by io_offset,
 * with pointer identity as the tie-breaker for a total order.
 */
int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}

/*
 * Initialize the per-vdev I/O queue: its lock and the four AVL trees
 * (deadline-ordered, plus offset-ordered read, write and pending trees).
 */
void
vdev_queue_init(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));

	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));

	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));

	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
}

/*
 * Tear down the per-vdev I/O queue created by vdev_queue_init().
 */
void
vdev_queue_fini(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	avl_destroy(&vq->vq_deadline_tree);
	avl_destroy(&vq->vq_read_tree);
	avl_destroy(&vq->vq_write_tree);
	avl_destroy(&vq->vq_pending_tree);

	mutex_destroy(&vq->vq_lock);
}

/*
 * Insert a zio into the deadline tree and into its per-type offset tree
 * (io_vdev_tree points at either vq_read_tree or vq_write_tree).
 */
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
	avl_add(&vq->vq_deadline_tree, zio);
	avl_add(zio->io_vdev_tree, zio);
}

/*
 * Remove a zio from the deadline tree and from its per-type offset tree.
 */
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
	avl_remove(&vq->vq_deadline_tree, zio);
	avl_remove(zio->io_vdev_tree, zio);
}

/*
 * Completion callback for an aggregated I/O: for reads, scatter the
 * aggregate buffer back out to each parent zio's own buffer, then free
 * the aggregation buffer.
 */
static void
vdev_queue_agg_io_done(zio_t *aio)
{
	zio_t *pio;

	while ((pio = zio_walk_parents(aio)) != NULL)
		if (aio->io_type == ZIO_TYPE_READ)
			bcopy((char *)aio->io_data + (pio->io_offset -
			    aio->io_offset), pio->io_data, pio->io_size);

	zio_buf_free(aio->io_data, aio->io_size);
}

/*
 * IO_SPAN: total byte range covered from the start of the first I/O
 * (fio) to the end of the last I/O (lio).
 * IO_GAP:  bytes between the end of fio and the start of lio.
 * NOTE(review): the original comment here was lost in extraction.
 */
#define	IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))

/*
 * Pick the next I/O to issue for this queue, aggregating adjacent queued
 * I/Os into a single larger I/O where possible.  Returns NULL when the
 * pending limit has been reached or nothing is queued.  Called with
 * vq_lock held; may drop and reacquire it (see the NODATA case below).
 */
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio, *nio, *mio;
	avl_tree_t *t;
	int flags;
	uint64_t maxspan = zfs_vdev_aggregation_limit;
	uint64_t maxgap;
	int stretch;

again:
	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	/* Start from the I/O with the earliest deadline. */
	fio = lio = avl_first(&vq->vq_deadline_tree);

	t = fio->io_vdev_tree;
	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
	/* Gaps are only tolerated between reads; writes must be adjacent. */
	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
		/*
		 * We can aggregate I/Os that are sufficiently adjacent and of
		 * the same flavor, as expressed by the AGG_INHERIT flags.
		 * The latter requirement is necessary so that certain
		 * attributes of the I/O, such as whether it's a normal I/O
		 * or a scrub/resilver, can be preserved in the aggregate.
		 * We can include optional I/Os, but don't allow them
		 * to begin a range as they add no benefit in that situation.
		 */

		/*
		 * We keep track of the last non-optional I/O.
		 */
		mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;

		/*
		 * Walk backwards through sufficiently contiguous I/Os
		 * recording the last non-optional I/O.
		 */
		while ((dio = AVL_PREV(t, fio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(dio, lio) <= maxspan &&
		    IO_GAP(dio, fio) <= maxgap) {
			fio = dio;
			if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
				mio = fio;
		}

		/*
		 * Skip any initial optional I/Os.
		 */
		while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
			fio = AVL_NEXT(t, fio);
			ASSERT(fio != NULL);
		}

		/*
		 * Walk forward through sufficiently contiguous I/Os.
		 */
		while ((dio = AVL_NEXT(t, lio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(fio, dio) <= maxspan &&
		    IO_GAP(lio, dio) <= maxgap) {
			lio = dio;
			if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
				mio = lio;
		}

		/*
		 * Now that we've established the range of the I/O aggregation
		 * we must decide what to do with trailing optional I/Os.
		 * For reads, there's nothing to do. While we are unable to
		 * aggregate further, it's possible that a trailing optional
		 * I/O would allow the underlying device to aggregate with
		 * subsequent I/Os. We must therefore determine if the next
		 * non-optional I/O is close enough to make aggregation
		 * worthwhile.
		 */
		stretch = B_FALSE;
		if (t != &vq->vq_read_tree && mio != NULL) {
			nio = lio;
			while ((dio = AVL_NEXT(t, nio)) != NULL &&
			    IO_GAP(nio, dio) == 0 &&
			    IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
				nio = dio;
				if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
					stretch = B_TRUE;
					break;
				}
			}
		}

		if (stretch) {
			/* This may be a no-op. */
			VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
			dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
		} else {
			/* Drop trailing optional I/Os from the range. */
			while (lio != mio && lio != fio) {
				ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
				lio = AVL_PREV(t, lio);
				ASSERT(lio != NULL);
			}
		}
	}

	if (fio != lio) {
		/*
		 * We found a range of at least two I/Os: build the aggregate
		 * zio, copy/zero write data into its buffer, hand each child
		 * over to it, and issue the aggregate instead.
		 */
		uint64_t size = IO_SPAN(fio, lio);
		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
		    vdev_queue_agg_io_done, NULL);

		nio = fio;
		do {
			dio = nio;
			nio = AVL_NEXT(t, dio);
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == t);

			if (dio->io_flags & ZIO_FLAG_NODATA) {
				/* Optional write with no data: zero-fill. */
				ASSERT(dio->io_type == ZIO_TYPE_WRITE);
				bzero((char *)aio->io_data + (dio->io_offset -
				    aio->io_offset), dio->io_size);
			} else if (dio->io_type == ZIO_TYPE_WRITE) {
				bcopy(dio->io_data, (char *)aio->io_data +
				    (dio->io_offset - aio->io_offset),
				    dio->io_size);
			}

			zio_add_child(dio, aio);
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			zio_execute(dio);
		} while (dio != lio);

		avl_add(&vq->vq_pending_tree, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == t);
	vdev_queue_io_remove(vq, fio);

	/*
	 * If the I/O is or was optional and therefore has no data, we need to
	 * simply discard it. We need to drop the vdev queue's lock to avoid a
	 * deadlock that we could encounter since this I/O will complete
	 * immediately.
	 */
	if (fio->io_flags & ZIO_FLAG_NODATA) {
		mutex_exit(&vq->vq_lock);
		zio_vdev_io_bypass(fio);
		zio_execute(fio);
		mutex_enter(&vq->vq_lock);
		goto again;
	}

	avl_add(&vq->vq_pending_tree, fio);

	return (fio);
}

/*
 * Queue an incoming read/write zio on its vdev's queue, stamp its
 * deadline, and return the next I/O to issue (possibly a different,
 * aggregated one), or NULL if nothing should be issued now.
 */
zio_t *
vdev_queue_io(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	zio_t *nio;

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
		return (zio);

	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

	if (zio->io_type == ZIO_TYPE_READ)
		zio->io_vdev_tree = &vq->vq_read_tree;
	else
		zio->io_vdev_tree = &vq->vq_write_tree;

	mutex_enter(&vq->vq_lock);

	/*
	 * Deadline is coarse-grained current time plus priority, so
	 * higher-priority (lower io_priority) I/Os sort earlier.
	 */
	zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
	    zio->io_priority;

	vdev_queue_io_add(vq, zio);

	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);

	mutex_exit(&vq->vq_lock);

	if (nio == NULL)
		return (NULL);

	if (nio->io_done == vdev_queue_agg_io_done) {
		/* Aggregates are issued here; callers never see them. */
		zio_nowait(nio);
		return (NULL);
	}

	return (nio);
}

/*
 * Completion hook: remove the finished zio from the pending tree and
 * issue up to zfs_vdev_ramp_rate new I/Os, dropping the queue lock
 * while each one is dispatched.
 */
void
vdev_queue_io_done(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;

	mutex_enter(&vq->vq_lock);

	avl_remove(&vq->vq_pending_tree, zio);

	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
		if (nio == NULL)
			break;
		mutex_exit(&vq->vq_lock);
		if (nio->io_done == vdev_queue_agg_io_done) {
			zio_nowait(nio);
		} else {
			zio_vdev_io_reissue(nio);
			zio_execute(nio);
		}
		mutex_enter(&vq->vq_lock);
	}

	mutex_exit(&vq->vq_lock);
}