FreeBSD ZFS
The Zettabyte File System

vdev_queue.c

Go to the documentation of this file.
00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 /*
00022  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
00023  * Use is subject to license terms.
00024  */
00025 
00026 #include <sys/zfs_context.h>
00027 #include <sys/vdev_impl.h>
00028 #include <sys/zio.h>
00029 #include <sys/avl.h>
00030 
/* These tunables are for performance analysis. */
/*
 * Maximum number of I/Os the queue will keep in flight on each device;
 * vdev_queue_io_done() issues new I/Os up to this limit.
 */
int zfs_vdev_max_pending = 10;

/*
 * Initial (minimum) number of I/Os pending to each device; used as the
 * pending limit when issuing from vdev_queue_io().
 */
int zfs_vdev_min_pending = 4;

/* Shift applied to lbolt when computing an I/O's deadline. */
int zfs_vdev_time_shift = 6;

/* Number of new I/Os issued for each I/O that completes (ramp-up rate). */
int zfs_vdev_ramp_rate = 2;

/*
 * Aggregation limits: maximum total span of an aggregate I/O, acceptable
 * gap between two reads being aggregated, and acceptable distance to a
 * later non-optional write when "stretching" past trailing optional I/Os.
 */
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
/*
 * FreeBSD loader tunables and run-time sysctls mirroring the variables
 * above, exported under vfs.zfs.vdev.*.
 */
SYSCTL_DECL(_vfs_zfs_vdev);
TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RW,
    &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device");
TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RW,
    &zfs_vdev_min_pending, 0,
    "Initial number of I/O requests pending to each device");
TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RW,
    &zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline");
TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RW,
    &zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate");
TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RW,
    &zfs_vdev_aggregation_limit, 0,
    "I/O requests are aggregated up to this size");
TUNABLE_INT("vfs.zfs.vdev.read_gap_limit", &zfs_vdev_read_gap_limit);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RW,
    &zfs_vdev_read_gap_limit, 0,
    "Acceptable gap between two reads being aggregated");
TUNABLE_INT("vfs.zfs.vdev.write_gap_limit", &zfs_vdev_write_gap_limit);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RW,
    &zfs_vdev_write_gap_limit, 0,
    "Acceptable gap between two writes being aggregated");
00090 
00094 int
00095 vdev_queue_deadline_compare(const void *x1, const void *x2)
00096 {
00097         const zio_t *z1 = x1;
00098         const zio_t *z2 = x2;
00099 
00100         if (z1->io_deadline < z2->io_deadline)
00101                 return (-1);
00102         if (z1->io_deadline > z2->io_deadline)
00103                 return (1);
00104 
00105         if (z1->io_offset < z2->io_offset)
00106                 return (-1);
00107         if (z1->io_offset > z2->io_offset)
00108                 return (1);
00109 
00110         if (z1 < z2)
00111                 return (-1);
00112         if (z1 > z2)
00113                 return (1);
00114 
00115         return (0);
00116 }
00117 
00118 int
00119 vdev_queue_offset_compare(const void *x1, const void *x2)
00120 {
00121         const zio_t *z1 = x1;
00122         const zio_t *z2 = x2;
00123 
00124         if (z1->io_offset < z2->io_offset)
00125                 return (-1);
00126         if (z1->io_offset > z2->io_offset)
00127                 return (1);
00128 
00129         if (z1 < z2)
00130                 return (-1);
00131         if (z1 > z2)
00132                 return (1);
00133 
00134         return (0);
00135 }
00136 
00137 void
00138 vdev_queue_init(vdev_t *vd)
00139 {
00140         vdev_queue_t *vq = &vd->vdev_queue;
00141 
00142         mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
00143 
00144         avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
00145             sizeof (zio_t), offsetof(struct zio, io_deadline_node));
00146 
00147         avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
00148             sizeof (zio_t), offsetof(struct zio, io_offset_node));
00149 
00150         avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
00151             sizeof (zio_t), offsetof(struct zio, io_offset_node));
00152 
00153         avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
00154             sizeof (zio_t), offsetof(struct zio, io_offset_node));
00155 }
00156 
00157 void
00158 vdev_queue_fini(vdev_t *vd)
00159 {
00160         vdev_queue_t *vq = &vd->vdev_queue;
00161 
00162         avl_destroy(&vq->vq_deadline_tree);
00163         avl_destroy(&vq->vq_read_tree);
00164         avl_destroy(&vq->vq_write_tree);
00165         avl_destroy(&vq->vq_pending_tree);
00166 
00167         mutex_destroy(&vq->vq_lock);
00168 }
00169 
00170 static void
00171 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
00172 {
00173         avl_add(&vq->vq_deadline_tree, zio);
00174         avl_add(zio->io_vdev_tree, zio);
00175 }
00176 
00177 static void
00178 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
00179 {
00180         avl_remove(&vq->vq_deadline_tree, zio);
00181         avl_remove(zio->io_vdev_tree, zio);
00182 }
00183 
00184 static void
00185 vdev_queue_agg_io_done(zio_t *aio)
00186 {
00187         zio_t *pio;
00188 
00189         while ((pio = zio_walk_parents(aio)) != NULL)
00190                 if (aio->io_type == ZIO_TYPE_READ)
00191                         bcopy((char *)aio->io_data + (pio->io_offset -
00192                             aio->io_offset), pio->io_data, pio->io_size);
00193 
00194         zio_buf_free(aio->io_data, aio->io_size);
00195 }
00196 
/*
 * IO_SPAN(fio, lio): total bytes covered from the start of fio to the end
 * of lio.  IO_GAP(fio, lio): bytes between the end of fio and the start of
 * lio (note the swapped arguments into IO_SPAN); negative if they overlap.
 */
#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
00205 
/*
 * Select the next I/O to issue from the queue, opportunistically
 * aggregating adjacent I/Os of the same flavor into one larger I/O.
 * Returns NULL when the number of in-flight I/Os has reached
 * pending_limit or nothing is queued.  Caller must hold vq->vq_lock.
 */
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	/*
	 * fio/lio bound the candidate aggregation range; mio tracks the
	 * last non-optional I/O seen; dio/nio are walk cursors.
	 */
	zio_t *fio, *lio, *aio, *dio, *nio, *mio;
	avl_tree_t *t;
	int flags;
	uint64_t maxspan = zfs_vdev_aggregation_limit;
	uint64_t maxgap;
	int stretch;

again:
	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	/* Start from the I/O with the earliest deadline. */
	fio = lio = avl_first(&vq->vq_deadline_tree);

	t = fio->io_vdev_tree;
	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
	/* Only reads may be aggregated across a gap. */
	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
		/*
		 * We can aggregate I/Os that are sufficiently adjacent and of
		 * the same flavor, as expressed by the AGG_INHERIT flags.
		 * The latter requirement is necessary so that certain
		 * attributes of the I/O, such as whether it's a normal I/O
		 * or a scrub/resilver, can be preserved in the aggregate.
		 * We can include optional I/Os, but don't allow them
		 * to begin a range as they add no benefit in that situation.
		 */

		/*
		 * We keep track of the last non-optional I/O.
		 */
		mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;

		/*
		 * Walk backwards through sufficiently contiguous I/Os
		 * recording the last non-option I/O.
		 */
		while ((dio = AVL_PREV(t, fio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(dio, lio) <= maxspan &&
		    IO_GAP(dio, fio) <= maxgap) {
			fio = dio;
			if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
				mio = fio;
		}

		/*
		 * Skip any initial optional I/Os.
		 */
		while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
			fio = AVL_NEXT(t, fio);
			ASSERT(fio != NULL);
		}

		/*
		 * Walk forward through sufficiently contiguous I/Os.
		 */
		while ((dio = AVL_NEXT(t, lio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(fio, dio) <= maxspan &&
		    IO_GAP(lio, dio) <= maxgap) {
			lio = dio;
			if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
				mio = lio;
		}

		/*
		 * Now that we've established the range of the I/O aggregation
		 * we must decide what to do with trailing optional I/Os.
		 * For reads, there's nothing to do. While we are unable to
		 * aggregate further, it's possible that a trailing optional
		 * I/O would allow the underlying device to aggregate with
		 * subsequent I/Os. We must therefore determine if the next
		 * non-optional I/O is close enough to make aggregation
		 * worthwhile.
		 */
		stretch = B_FALSE;
		if (t != &vq->vq_read_tree && mio != NULL) {
			nio = lio;
			while ((dio = AVL_NEXT(t, nio)) != NULL &&
			    IO_GAP(nio, dio) == 0 &&
			    IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
				nio = dio;
				if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
					stretch = B_TRUE;
					break;
				}
			}
		}

		if (stretch) {
			/*
			 * Force the first trailing optional I/O into the
			 * aggregate so the range keeps extending toward the
			 * nearby non-optional write.  This may be a no-op.
			 */
			VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
			dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
		} else {
			/* Trim any trailing optional I/Os off the range. */
			while (lio != mio && lio != fio) {
				ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
				lio = AVL_PREV(t, lio);
				ASSERT(lio != NULL);
			}
		}
	}

	if (fio != lio) {
		/*
		 * More than one I/O in the range: build the aggregate,
		 * gather write data into its buffer, and hand each
		 * constituent I/O back to the pipeline as a bypass.
		 */
		uint64_t size = IO_SPAN(fio, lio);
		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
		    vdev_queue_agg_io_done, NULL);

		nio = fio;
		do {
			dio = nio;
			nio = AVL_NEXT(t, dio);
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == t);

			if (dio->io_flags & ZIO_FLAG_NODATA) {
				/* Optional I/Os carry no data; zero-fill. */
				ASSERT(dio->io_type == ZIO_TYPE_WRITE);
				bzero((char *)aio->io_data + (dio->io_offset -
				    aio->io_offset), dio->io_size);
			} else if (dio->io_type == ZIO_TYPE_WRITE) {
				bcopy(dio->io_data, (char *)aio->io_data +
				    (dio->io_offset - aio->io_offset),
				    dio->io_size);
			}

			zio_add_child(dio, aio);
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			zio_execute(dio);
		} while (dio != lio);

		avl_add(&vq->vq_pending_tree, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == t);
	vdev_queue_io_remove(vq, fio);

	/*
	 * If the I/O is or was optional and therefore has no data, we need to
	 * simply discard it. We need to drop the vdev queue's lock to avoid a
	 * deadlock that we could encounter since this I/O will complete
	 * immediately.
	 */
	if (fio->io_flags & ZIO_FLAG_NODATA) {
		mutex_exit(&vq->vq_lock);
		zio_vdev_io_bypass(fio);
		zio_execute(fio);
		mutex_enter(&vq->vq_lock);
		goto again;
	}

	avl_add(&vq->vq_pending_tree, fio);

	return (fio);
}
00373 
00374 zio_t *
00375 vdev_queue_io(zio_t *zio)
00376 {
00377         vdev_queue_t *vq = &zio->io_vd->vdev_queue;
00378         zio_t *nio;
00379 
00380         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
00381 
00382         if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
00383                 return (zio);
00384 
00385         zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
00386 
00387         if (zio->io_type == ZIO_TYPE_READ)
00388                 zio->io_vdev_tree = &vq->vq_read_tree;
00389         else
00390                 zio->io_vdev_tree = &vq->vq_write_tree;
00391 
00392         mutex_enter(&vq->vq_lock);
00393 
00394         zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
00395             zio->io_priority;
00396 
00397         vdev_queue_io_add(vq, zio);
00398 
00399         nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
00400 
00401         mutex_exit(&vq->vq_lock);
00402 
00403         if (nio == NULL)
00404                 return (NULL);
00405 
00406         if (nio->io_done == vdev_queue_agg_io_done) {
00407                 zio_nowait(nio);
00408                 return (NULL);
00409         }
00410 
00411         return (nio);
00412 }
00413 
00414 void
00415 vdev_queue_io_done(zio_t *zio)
00416 {
00417         vdev_queue_t *vq = &zio->io_vd->vdev_queue;
00418 
00419         mutex_enter(&vq->vq_lock);
00420 
00421         avl_remove(&vq->vq_pending_tree, zio);
00422 
00423         for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
00424                 zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
00425                 if (nio == NULL)
00426                         break;
00427                 mutex_exit(&vq->vq_lock);
00428                 if (nio->io_done == vdev_queue_agg_io_done) {
00429                         zio_nowait(nio);
00430                 } else {
00431                         zio_vdev_io_reissue(nio);
00432                         zio_execute(nio);
00433                 }
00434                 mutex_enter(&vq->vq_lock);
00435         }
00436 
00437         mutex_exit(&vq->vq_lock);
00438 }
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines