FreeBSD ZFS
The Zettabyte File System

zio.c

Go to the documentation of this file.
00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 /*
00022  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
00023  * Copyright (c) 2012 by Delphix. All rights reserved.
00024  */
00025 
00026 #include <sys/zfs_context.h>
00027 #include <sys/fm/fs/zfs.h>
00028 #include <sys/spa.h>
00029 #include <sys/txg.h>
00030 #include <sys/spa_impl.h>
00031 #include <sys/vdev_impl.h>
00032 #include <sys/zio_impl.h>
00033 #include <sys/zio_compress.h>
00034 #include <sys/zio_checksum.h>
00035 #include <sys/dmu_objset.h>
00036 #include <sys/arc.h>
00037 #include <sys/ddt.h>
00038 #include <sys/trim_map.h>
00039 
/*
 * Sysctl plumbing: a "zio" node is created underneath the _vfs_zfs node
 * (declared here, defined elsewhere).
 */
00040 SYSCTL_DECL(_vfs_zfs);
00041 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
/*
 * vfs.zfs.zio.use_uma: when non-zero, the zio buffer allocators in this file
 * take buffers from the per-size kmem caches; when zero they fall back to
 * kmem_alloc()/kmem_free().
 */
00045 int zio_use_uma = 0;
00046 TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
00047 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
00048     "Use uma(9) for ZIO allocations");
/*
 * vfs.zfs.zio.exclude_metadata: when non-zero, zio_buf_alloc() adds
 * KM_NODEBUG to its kmem_alloc() flags (non-UMA path only).
 */
00049 static int zio_exclude_metadata = 0;
00050 TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
00051 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
00052     "Exclude metadata buffers from dumps as well");
00053 
/*
 * Named 64-bit counters published through the "zio_trim" kstat that
 * zio_init() creates.  The code updating them is not in this part of the
 * file.
 */
00057 zio_trim_stats_t zio_trim_stats = {
00058         { "zio_trim_bytes",             KSTAT_DATA_UINT64 },
00059         { "zio_trim_success",           KSTAT_DATA_UINT64 },
00060         { "zio_trim_unsupported",       KSTAT_DATA_UINT64 },
00061         { "zio_trim_failed",            KSTAT_DATA_UINT64 },
00062 };
00063 
/* Handle for the installed kstat; NULL if creation failed or after zio_fini(). */
00064 static kstat_t *zio_trim_ksp;
00065 
/*
 * One value per I/O priority class; each slot is labelled with the
 * ZIO_PRIORITY_* constant it corresponds to.
 * NOTE(review): how consumers interpret the numbers (presumably smaller
 * means more urgent) is not visible in this part of the file -- confirm.
 */
00069 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
00070         0,      /* ZIO_PRIORITY_NOW             */
00071         0,      /* ZIO_PRIORITY_SYNC_READ       */
00072         0,      /* ZIO_PRIORITY_SYNC_WRITE      */
00073         0,      /* ZIO_PRIORITY_LOG_WRITE       */
00074         1,      /* ZIO_PRIORITY_CACHE_FILL      */
00075         1,      /* ZIO_PRIORITY_AGG             */
00076         4,      /* ZIO_PRIORITY_FREE            */
00077         4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
00078         6,      /* ZIO_PRIORITY_ASYNC_READ      */
00079         10,     /* ZIO_PRIORITY_RESILVER        */
00080         20,     /* ZIO_PRIORITY_SCRUB           */
00081         2,      /* ZIO_PRIORITY_DDT_PREFETCH    */
00082         30,     /* ZIO_PRIORITY_TRIM            */
00083 };
00084 
/*
 * Printable names for the I/O types.
 * NOTE(review): the order presumably mirrors the zio_type_t enumeration
 * (null, read, write, free, claim, ioctl) -- confirm against the header.
 */
00088 char *zio_type_name[ZIO_TYPES] = {
00089         "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
00090         "zio_ioctl"
00091 };
00092 
00093 /*
00094  * ==========================================================================
00095  * I/O kmem caches
00096  * ==========================================================================
00097  */
/* Cache of zio_t structures (see zio_create()/zio_destroy()). */
00098 kmem_cache_t *zio_cache;
/* Cache of zio_link_t parent/child links (see zio_add_child()). */
00099 kmem_cache_t *zio_link_cache;
/*
 * Buffer caches indexed by (size - 1) >> SPA_MINBLOCKSHIFT.  zio_init()
 * fills both arrays so that every index refers to a cache whose buffers are
 * at least as large as the requested size.
 */
00100 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
00101 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
00102 
/* Defined elsewhere; not referenced in the visible part of this file. */
00103 #ifdef _KERNEL
00104 extern vmem_t *zio_alloc_arena;
00105 #endif
/* Defined elsewhere; zio_init() gives it a default and enforces a floor of 8. */
00106 extern int zfs_mg_alloc_failures;
00107 
/*
 * True when the pipeline the zio was created with includes the
 * ZIO_STAGE_DVA_ALLOCATE stage.
 */
00112 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
00113 
/* NOTE(review): consumed outside the visible part of this file. */
00114 boolean_t       zio_requeue_io_start_cut_in_line = B_TRUE;
00115 
/*
 * Buffer caches for sizes above this limit are created with KMC_NODEBUG
 * (see zio_init()).  With ZFS_DEBUG unset the limit is 0, so every cache
 * gets the flag.
 */
00116 #ifdef ZFS_DEBUG
00117 int zio_buf_debug_limit = 16384;
00118 #else
00119 int zio_buf_debug_limit = 0;
00120 #endif
00121 
00122 void
00123 zio_init(void)
00124 {
00125         size_t c;
00126         zio_cache = kmem_cache_create("zio_cache",
00127             sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
00128         zio_link_cache = kmem_cache_create("zio_link_cache",
00129             sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
00130 
00131         /*
00132          * For small buffers, we want a cache for each multiple of
00133          * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
00134          * for each quarter-power of 2.  For large buffers, we want
00135          * a cache for each multiple of PAGESIZE.
00136          */
00137         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
00138                 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
00139                 size_t p2 = size;
00140                 size_t align = 0;
00141                 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
00142 
00143                 while (p2 & (p2 - 1))
00144                         p2 &= p2 - 1;
00145 
00146 #ifdef illumos
00147 #ifndef _KERNEL
00148                 /*
00149                  * If we are using watchpoints, put each buffer on its own page,
00150                  * to eliminate the performance overhead of trapping to the
00151                  * kernel when modifying a non-watched buffer that shares the
00152                  * page with a watched buffer.
00153                  */
00154                 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
00155                         continue;
00156 #endif
00157 #endif /* illumos */
00158                 if (size <= 4 * SPA_MINBLOCKSIZE) {
00159                         align = SPA_MINBLOCKSIZE;
00160                 } else if (IS_P2ALIGNED(size, PAGESIZE)) {
00161                         align = PAGESIZE;
00162                 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
00163                         align = p2 >> 2;
00164                 }
00165 
00166                 if (align != 0) {
00167                         char name[36];
00168                         (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
00169                         zio_buf_cache[c] = kmem_cache_create(name, size,
00170                             align, NULL, NULL, NULL, NULL, NULL, cflags);
00171 
00172                         /*
00173                          * Since zio_data bufs do not appear in crash dumps, we
00174                          * pass KMC_NOTOUCH so that no allocator metadata is
00175                          * stored with the buffers.
00176                          */
00177                         (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
00178                         zio_data_buf_cache[c] = kmem_cache_create(name, size,
00179                             align, NULL, NULL, NULL, NULL, NULL,
00180                             cflags | KMC_NOTOUCH | KMC_NODEBUG);
00181                 }
00182         }
00183 
00184         while (--c != 0) {
00185                 ASSERT(zio_buf_cache[c] != NULL);
00186                 if (zio_buf_cache[c - 1] == NULL)
00187                         zio_buf_cache[c - 1] = zio_buf_cache[c];
00188 
00189                 ASSERT(zio_data_buf_cache[c] != NULL);
00190                 if (zio_data_buf_cache[c - 1] == NULL)
00191                         zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
00192         }
00193 
00194         /*
00195          * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
00196          * to fail 3 times per txg or 8 failures, whichever is greater.
00197          */
00198         if (zfs_mg_alloc_failures == 0)
00199                 zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
00200         else if (zfs_mg_alloc_failures < 8)
00201                 zfs_mg_alloc_failures = 8;
00202 
00203         zio_inject_init();
00204 
00205         zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
00206             KSTAT_TYPE_NAMED,
00207             sizeof(zio_trim_stats) / sizeof(kstat_named_t),
00208             KSTAT_FLAG_VIRTUAL);
00209 
00210         if (zio_trim_ksp != NULL) {
00211                 zio_trim_ksp->ks_data = &zio_trim_stats;
00212                 kstat_install(zio_trim_ksp);
00213         }
00214 }
00215 
00216 void
00217 zio_fini(void)
00218 {
00219         size_t c;
00220         kmem_cache_t *last_cache = NULL;
00221         kmem_cache_t *last_data_cache = NULL;
00222 
00223         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
00224                 if (zio_buf_cache[c] != last_cache) {
00225                         last_cache = zio_buf_cache[c];
00226                         kmem_cache_destroy(zio_buf_cache[c]);
00227                 }
00228                 zio_buf_cache[c] = NULL;
00229 
00230                 if (zio_data_buf_cache[c] != last_data_cache) {
00231                         last_data_cache = zio_data_buf_cache[c];
00232                         kmem_cache_destroy(zio_data_buf_cache[c]);
00233                 }
00234                 zio_data_buf_cache[c] = NULL;
00235         }
00236 
00237         kmem_cache_destroy(zio_link_cache);
00238         kmem_cache_destroy(zio_cache);
00239 
00240         zio_inject_fini();
00241 
00242         if (zio_trim_ksp != NULL) {
00243                 kstat_delete(zio_trim_ksp);
00244                 zio_trim_ksp = NULL;
00245         }
00246 }
00247 
00248 /*
00249  * ==========================================================================
00250  * Allocate and free I/O buffers
00251  * ==========================================================================
00252  */
00253 
00260 void *
00261 zio_buf_alloc(size_t size)
00262 {
00263         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
00264         int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
00265 
00266         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
00267 
00268         if (zio_use_uma)
00269                 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
00270         else
00271                 return (kmem_alloc(size, KM_SLEEP|flags));
00272 }
00273 
00280 void *
00281 zio_data_buf_alloc(size_t size)
00282 {
00283         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
00284 
00285         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
00286 
00287         if (zio_use_uma)
00288                 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
00289         else
00290                 return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
00291 }
00292 
00293 void
00294 zio_buf_free(void *buf, size_t size)
00295 {
00296         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
00297 
00298         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
00299 
00300         if (zio_use_uma)
00301                 kmem_cache_free(zio_buf_cache[c], buf);
00302         else
00303                 kmem_free(buf, size);
00304 }
00305 
00306 void
00307 zio_data_buf_free(void *buf, size_t size)
00308 {
00309         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
00310 
00311         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
00312 
00313         if (zio_use_uma)
00314                 kmem_cache_free(zio_data_buf_cache[c], buf);
00315         else
00316                 kmem_free(buf, size);
00317 }
00318 
00319 /*
00320  * ==========================================================================
00321  * Push and pop I/O transform buffers
00322  * ==========================================================================
00323  */
00324 static void
00325 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
00326         zio_transform_func_t *transform)
00327 {
00328         zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
00329 
00330         zt->zt_orig_data = zio->io_data;
00331         zt->zt_orig_size = zio->io_size;
00332         zt->zt_bufsize = bufsize;
00333         zt->zt_transform = transform;
00334 
00335         zt->zt_next = zio->io_transform_stack;
00336         zio->io_transform_stack = zt;
00337 
00338         zio->io_data = data;
00339         zio->io_size = size;
00340 }
00341 
00342 static void
00343 zio_pop_transforms(zio_t *zio)
00344 {
00345         zio_transform_t *zt;
00346 
00347         while ((zt = zio->io_transform_stack) != NULL) {
00348                 if (zt->zt_transform != NULL)
00349                         zt->zt_transform(zio,
00350                             zt->zt_orig_data, zt->zt_orig_size);
00351 
00352                 if (zt->zt_bufsize != 0)
00353                         zio_buf_free(zio->io_data, zt->zt_bufsize);
00354 
00355                 zio->io_data = zt->zt_orig_data;
00356                 zio->io_size = zt->zt_orig_size;
00357                 zio->io_transform_stack = zt->zt_next;
00358 
00359                 kmem_free(zt, sizeof (zio_transform_t));
00360         }
00361 }
00362 
00363 /*
00364  * ==========================================================================
00365  * I/O transform callbacks for subblocks and decompression
00366  * ==========================================================================
00367  */
00368 static void
00369 zio_subblock(zio_t *zio, void *data, uint64_t size)
00370 {
00371         ASSERT(zio->io_size > size);
00372 
00373         if (zio->io_type == ZIO_TYPE_READ)
00374                 bcopy(zio->io_data, data, size);
00375 }
00376 
00377 static void
00378 zio_decompress(zio_t *zio, void *data, uint64_t size)
00379 {
00380         if (zio->io_error == 0 &&
00381             zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
00382             zio->io_data, data, zio->io_size, size) != 0)
00383                 zio->io_error = EIO;
00384 }
00385 
00386 /*
00387  * ==========================================================================
00388  * I/O parent/child relationships and pipeline interlocks
00389  * ==========================================================================
00390  */
00398 zio_t *
00399 zio_walk_parents(zio_t *cio)
00400 {
00401         zio_link_t *zl = cio->io_walk_link;
00402         list_t *pl = &cio->io_parent_list;
00403 
00404         zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
00405         cio->io_walk_link = zl;
00406 
00407         if (zl == NULL)
00408                 return (NULL);
00409 
00410         ASSERT(zl->zl_child == cio);
00411         return (zl->zl_parent);
00412 }
00413 
00421 zio_t *
00422 zio_walk_children(zio_t *pio)
00423 {
00424         zio_link_t *zl = pio->io_walk_link;
00425         list_t *cl = &pio->io_child_list;
00426 
00427         zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
00428         pio->io_walk_link = zl;
00429 
00430         if (zl == NULL)
00431                 return (NULL);
00432 
00433         ASSERT(zl->zl_parent == pio);
00434         return (zl->zl_child);
00435 }
00436 
00437 zio_t *
00438 zio_unique_parent(zio_t *cio)
00439 {
00440         zio_t *pio = zio_walk_parents(cio);
00441 
00442         VERIFY(zio_walk_parents(cio) == NULL);
00443         return (pio);
00444 }
00445 
00446 void
00447 zio_add_child(zio_t *pio, zio_t *cio)
00448 {
00449         zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
00450 
00451         /*
00452          * Logical I/Os can have logical, gang, or vdev children.
00453          * Gang I/Os can have gang or vdev children.
00454          * Vdev I/Os can only have vdev children.
00455          * The following ASSERT captures all of these constraints.
00456          */
00457         ASSERT(cio->io_child_type <= pio->io_child_type);
00458 
00459         zl->zl_parent = pio;
00460         zl->zl_child = cio;
00461 
00462         mutex_enter(&cio->io_lock);
00463         mutex_enter(&pio->io_lock);
00464 
00465         ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
00466 
00467         for (int w = 0; w < ZIO_WAIT_TYPES; w++)
00468                 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
00469 
00470         list_insert_head(&pio->io_child_list, zl);
00471         list_insert_head(&cio->io_parent_list, zl);
00472 
00473         pio->io_child_count++;
00474         cio->io_parent_count++;
00475 
00476         mutex_exit(&pio->io_lock);
00477         mutex_exit(&cio->io_lock);
00478 }
00479 
00480 static void
00481 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
00482 {
00483         ASSERT(zl->zl_parent == pio);
00484         ASSERT(zl->zl_child == cio);
00485 
00486         mutex_enter(&cio->io_lock);
00487         mutex_enter(&pio->io_lock);
00488 
00489         list_remove(&pio->io_child_list, zl);
00490         list_remove(&cio->io_parent_list, zl);
00491 
00492         pio->io_child_count--;
00493         cio->io_parent_count--;
00494 
00495         mutex_exit(&pio->io_lock);
00496         mutex_exit(&cio->io_lock);
00497 
00498         kmem_cache_free(zio_link_cache, zl);
00499 }
00500 
00501 static boolean_t
00502 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
00503 {
00504         uint64_t *countp = &zio->io_children[child][wait];
00505         boolean_t waiting = B_FALSE;
00506 
00507         mutex_enter(&zio->io_lock);
00508         ASSERT(zio->io_stall == NULL);
00509         if (*countp != 0) {
00510                 zio->io_stage >>= 1;
00511                 zio->io_stall = countp;
00512                 waiting = B_TRUE;
00513         }
00514         mutex_exit(&zio->io_lock);
00515 
00516         return (waiting);
00517 }
00518 
00519 static void
00520 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
00521 {
00522         uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
00523         int *errorp = &pio->io_child_error[zio->io_child_type];
00524 
00525         mutex_enter(&pio->io_lock);
00526         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
00527                 *errorp = zio_worst_error(*errorp, zio->io_error);
00528         pio->io_reexecute |= zio->io_reexecute;
00529         ASSERT3U(*countp, >, 0);
00530         if (--*countp == 0 && pio->io_stall == countp) {
00531                 pio->io_stall = NULL;
00532                 mutex_exit(&pio->io_lock);
00533                 zio_execute(pio);
00534         } else {
00535                 mutex_exit(&pio->io_lock);
00536         }
00537 }
00538 
00539 static void
00540 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
00541 {
00542         if (zio->io_child_error[c] != 0 && zio->io_error == 0)
00543                 zio->io_error = zio->io_child_error[c];
00544 }
00545 
00549 static zio_t *
00550 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
00551     void *data, uint64_t size, zio_done_func_t *done, void *private,
00552     zio_type_t type, int priority, enum zio_flag flags,
00553     vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
00554     enum zio_stage stage, enum zio_stage pipeline)
00555 {
00556         zio_t *zio;
00557 
00558         ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
00559         ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
00560         ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
00561 
00562         ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
00563         ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
00564         ASSERT(vd || stage == ZIO_STAGE_OPEN);
00565 
00566         zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
00567         bzero(zio, sizeof (zio_t));
00568 
00569         mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
00570         cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
00571 
00572         list_create(&zio->io_parent_list, sizeof (zio_link_t),
00573             offsetof(zio_link_t, zl_parent_node));
00574         list_create(&zio->io_child_list, sizeof (zio_link_t),
00575             offsetof(zio_link_t, zl_child_node));
00576 
00577         if (vd != NULL)
00578                 zio->io_child_type = ZIO_CHILD_VDEV;
00579         else if (flags & ZIO_FLAG_GANG_CHILD)
00580                 zio->io_child_type = ZIO_CHILD_GANG;
00581         else if (flags & ZIO_FLAG_DDT_CHILD)
00582                 zio->io_child_type = ZIO_CHILD_DDT;
00583         else
00584                 zio->io_child_type = ZIO_CHILD_LOGICAL;
00585 
00586         if (bp != NULL) {
00587                 zio->io_bp = (blkptr_t *)bp;
00588                 zio->io_bp_copy = *bp;
00589                 zio->io_bp_orig = *bp;
00590                 if (type != ZIO_TYPE_WRITE ||
00591                     zio->io_child_type == ZIO_CHILD_DDT)
00592                         zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
00593                 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
00594                         zio->io_logical = zio;
00595                 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
00596                         pipeline |= ZIO_GANG_STAGES;
00597         }
00598 
00599         zio->io_spa = spa;
00600         zio->io_txg = txg;
00601         zio->io_done = done;
00602         zio->io_private = private;
00603         zio->io_type = type;
00604         zio->io_priority = priority;
00605         zio->io_vd = vd;
00606         zio->io_offset = offset;
00607         zio->io_orig_data = zio->io_data = data;
00608         zio->io_orig_size = zio->io_size = size;
00609         zio->io_orig_flags = zio->io_flags = flags;
00610         zio->io_orig_stage = zio->io_stage = stage;
00611         zio->io_orig_pipeline = zio->io_pipeline = pipeline;
00612 
00613         zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
00614         zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
00615 
00616         if (zb != NULL)
00617                 zio->io_bookmark = *zb;
00618 
00619         if (pio != NULL) {
00620                 if (zio->io_logical == NULL)
00621                         zio->io_logical = pio->io_logical;
00622                 if (zio->io_child_type == ZIO_CHILD_GANG)
00623                         zio->io_gang_leader = pio->io_gang_leader;
00624                 zio_add_child(pio, zio);
00625         }
00626 
00627         return (zio);
00628 }
00629 
00630 static void
00631 zio_destroy(zio_t *zio)
00632 {
00633         list_destroy(&zio->io_parent_list);
00634         list_destroy(&zio->io_child_list);
00635         mutex_destroy(&zio->io_lock);
00636         cv_destroy(&zio->io_cv);
00637         kmem_cache_free(zio_cache, zio);
00638 }
00639 
00640 zio_t *
00641 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
00642     void *private, enum zio_flag flags)
00643 {
00644         zio_t *zio;
00645 
00646         zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
00647             ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
00648             ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
00649 
00650         return (zio);
00651 }
00652 
00653 zio_t *
00654 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
00655 {
00656         return (zio_null(NULL, spa, NULL, done, private, flags));
00657 }
00658 
00659 zio_t *
00660 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
00661     void *data, uint64_t size, zio_done_func_t *done, void *private,
00662     int priority, enum zio_flag flags, const zbookmark_t *zb)
00663 {
00664         zio_t *zio;
00665 
00666         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
00667             data, size, done, private,
00668             ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
00669             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
00670             ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
00671 
00672         return (zio);
00673 }
00674 
00675 zio_t *
00676 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
00677     void *data, uint64_t size, const zio_prop_t *zp,
00678     zio_done_func_t *ready, zio_done_func_t *done, void *private,
00679     int priority, enum zio_flag flags, const zbookmark_t *zb)
00680 {
00681         zio_t *zio;
00682 
00683         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
00684             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
00685             zp->zp_compress >= ZIO_COMPRESS_OFF &&
00686             zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
00687             DMU_OT_IS_VALID(zp->zp_type) &&
00688             zp->zp_level < 32 &&
00689             zp->zp_copies > 0 &&
00690             zp->zp_copies <= spa_max_replication(spa) &&
00691             zp->zp_dedup <= 1 &&
00692             zp->zp_dedup_verify <= 1);
00693 
00694         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
00695             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
00696             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
00697             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
00698 
00699         zio->io_ready = ready;
00700         zio->io_prop = *zp;
00701 
00702         return (zio);
00703 }
00704 
00705 zio_t *
00706 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
00707     uint64_t size, zio_done_func_t *done, void *private, int priority,
00708     enum zio_flag flags, zbookmark_t *zb)
00709 {
00710         zio_t *zio;
00711 
00712         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
00713             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
00714             ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
00715 
00716         return (zio);
00717 }
00718 
00719 void
00720 zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
00721 {
00722         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
00723         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
00724         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
00725         ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
00726 
00727         zio->io_prop.zp_copies = copies;
00728         zio->io_bp_override = bp;
00729 }
00730 
00731 void
00732 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
00733 {
00734         bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
00735 }
00736 
00737 zio_t *
00738 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
00739     uint64_t size, enum zio_flag flags)
00740 {
00741         zio_t *zio;
00742 
00743         dprintf_bp(bp, "freeing in txg %llu, pass %u",
00744             (longlong_t)txg, spa->spa_sync_pass);
00745 
00746         ASSERT(!BP_IS_HOLE(bp));
00747         ASSERT(spa_syncing_txg(spa) == txg);
00748         ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
00749 
00750         zio = zio_create(pio, spa, txg, bp, NULL, size,
00751             NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
00752             NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
00753 
00754         return (zio);
00755 }
00756 
00757 zio_t *
00758 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
00759     zio_done_func_t *done, void *private, enum zio_flag flags)
00760 {
00761         zio_t *zio;
00762 
00763         /*
00764          * A claim is an allocation of a specific block.  Claims are needed
00765          * to support immediate writes in the intent log.  The issue is that
00766          * immediate writes contain committed data, but in a txg that was
00767          * *not* committed.  Upon opening the pool after an unclean shutdown,
00768          * the intent log claims all blocks that contain immediate write data
00769          * so that the SPA knows they're in use.
00770          *
00771          * All claims *must* be resolved in the first txg -- before the SPA
00772          * starts allocating blocks -- so that nothing is allocated twice.
00773          * If txg == 0 we just verify that the block is claimable.
00774          */
00775         ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
00776         ASSERT(txg == spa_first_txg(spa) || txg == 0);
00777         ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
00778 
00779         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
00780             done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
00781             NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
00782 
00783         return (zio);
00784 }
00785 
00786 zio_t *
00787 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
00788     uint64_t size, zio_done_func_t *done, void *private, int priority,
00789     enum zio_flag flags)
00790 {
00791         zio_t *zio;
00792         int c;
00793 
00794         if (vd->vdev_children == 0) {
00795                 zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
00796                     ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
00797                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
00798 
00799                 zio->io_cmd = cmd;
00800         } else {
00801                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
00802 
00803                 for (c = 0; c < vd->vdev_children; c++)
00804                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
00805                             offset, size, done, private, priority, flags));
00806         }
00807 
00808         return (zio);
00809 }
00810 
00811 zio_t *
00812 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
00813     void *data, int checksum, zio_done_func_t *done, void *private,
00814     int priority, enum zio_flag flags, boolean_t labels)
00815 {
00816         zio_t *zio;
00817 
00818         ASSERT(vd->vdev_children == 0);
00819         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
00820             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
00821         ASSERT3U(offset + size, <=, vd->vdev_psize);
00822 
00823         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
00824             ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
00825             ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
00826 
00827         zio->io_prop.zp_checksum = checksum;
00828 
00829         return (zio);
00830 }
00831 
00832 zio_t *
00833 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
00834     void *data, int checksum, zio_done_func_t *done, void *private,
00835     int priority, enum zio_flag flags, boolean_t labels)
00836 {
00837         zio_t *zio;
00838 
00839         ASSERT(vd->vdev_children == 0);
00840         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
00841             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
00842         ASSERT3U(offset + size, <=, vd->vdev_psize);
00843 
00844         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
00845             ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
00846             ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
00847 
00848         zio->io_prop.zp_checksum = checksum;
00849 
00850         if (zio_checksum_table[checksum].ci_eck) {
00851                 /*
00852                  * zec checksums are necessarily destructive -- they modify
00853                  * the end of the write buffer to hold the verifier/checksum.
00854                  * Therefore, we must make a local copy in case the data is
00855                  * being written to multiple places in parallel.
00856                  */
00857                 void *wbuf = zio_buf_alloc(size);
00858                 bcopy(data, wbuf, size);
00859                 zio_push_transform(zio, wbuf, size, size, NULL);
00860         }
00861 
00862         return (zio);
00863 }
00864 
00868 zio_t *
00869 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
00870         void *data, uint64_t size, int type, int priority, enum zio_flag flags,
00871         zio_done_func_t *done, void *private)
00872 {
00873         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
00874         zio_t *zio;
00875 
00876         ASSERT(vd->vdev_parent ==
00877             (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
00878 
00879         if (type == ZIO_TYPE_READ && bp != NULL) {
00880                 /*
00881                  * If we have the bp, then the child should perform the
00882                  * checksum and the parent need not.  This pushes error
00883                  * detection as close to the leaves as possible and
00884                  * eliminates redundant checksums in the interior nodes.
00885                  */
00886                 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
00887                 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
00888         }
00889 
00890         if (vd->vdev_children == 0)
00891                 offset += VDEV_LABEL_START_SIZE;
00892 
00893         flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
00894 
00895         /*
00896          * If we've decided to do a repair, the write is not speculative --
00897          * even if the original read was.
00898          */
00899         if (flags & ZIO_FLAG_IO_REPAIR)
00900                 flags &= ~ZIO_FLAG_SPECULATIVE;
00901 
00902         zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
00903             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
00904             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
00905 
00906         return (zio);
00907 }
00908 
00909 zio_t *
00910 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
00911         int type, int priority, enum zio_flag flags,
00912         zio_done_func_t *done, void *private)
00913 {
00914         zio_t *zio;
00915 
00916         ASSERT(vd->vdev_ops->vdev_op_leaf);
00917 
00918         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
00919             data, size, done, private, type, priority,
00920             flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
00921             vd, offset, NULL,
00922             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
00923 
00924         return (zio);
00925 }
00926 
00927 void
00928 zio_flush(zio_t *zio, vdev_t *vd)
00929 {
00930         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
00931             NULL, NULL, ZIO_PRIORITY_NOW,
00932             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
00933 }
00934 
00935 zio_t *
00936 zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
00937 {
00938 
00939         ASSERT(vd->vdev_ops->vdev_op_leaf);
00940 
00941         return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
00942             NULL, NULL, ZIO_PRIORITY_TRIM,
00943             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
00944 }
00945 
00946 void
00947 zio_shrink(zio_t *zio, uint64_t size)
00948 {
00949         ASSERT(zio->io_executor == NULL);
00950         ASSERT(zio->io_orig_size == zio->io_size);
00951         ASSERT(size <= zio->io_size);
00952 
00953         /*
00954          * We don't shrink for raidz because of problems with the
00955          * reconstruction when reading back less than the block size.
00956          * Note, BP_IS_RAIDZ() assumes no compression.
00957          */
00958         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
00959         if (!BP_IS_RAIDZ(zio->io_bp))
00960                 zio->io_orig_size = zio->io_size = size;
00961 }
00962 
00966 static int
00967 zio_read_bp_init(zio_t *zio)
00968 {
00969         blkptr_t *bp = zio->io_bp;
00970 
00971         if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
00972             zio->io_child_type == ZIO_CHILD_LOGICAL &&
00973             !(zio->io_flags & ZIO_FLAG_RAW)) {
00974                 uint64_t psize = BP_GET_PSIZE(bp);
00975                 void *cbuf = zio_buf_alloc(psize);
00976 
00977                 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
00978         }
00979 
00980         if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
00981                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
00982 
00983         if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
00984                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
00985 
00986         if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
00987                 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
00988 
00989         return (ZIO_PIPELINE_CONTINUE);
00990 }
00991 
/*
 * Write-path stage that prepares zio->io_bp before the write proceeds.
 *
 * Steps, in order:
 *  - wait for gang and logical children to reach the ready state;
 *  - return immediately for zios that are not allocating;
 *  - apply a caller-supplied io_bp_override, if there is one;
 *  - compress the data, unless compression is off or has been disabled
 *    for the current sync pass;
 *  - choose between ZIO_REWRITE_PIPELINE (same block, same psize, late
 *    sync pass) and a fresh ZIO_WRITE_PIPELINE;
 *  - fill in the bp's size, compression, checksum, type, level, dedup and
 *    byte-order fields.
 *
 * Returns ZIO_PIPELINE_STOP while children are still outstanding (the
 * stage is repeated later), otherwise ZIO_PIPELINE_CONTINUE.
 */
00992 static int
00993 zio_write_bp_init(zio_t *zio)
00994 {
00995         spa_t *spa = zio->io_spa;
00996         zio_prop_t *zp = &zio->io_prop;
00997         enum zio_compress compress = zp->zp_compress;
00998         blkptr_t *bp = zio->io_bp;
00999         uint64_t lsize = zio->io_size;
01000         uint64_t psize = lsize;
01001         int pass = 1;
01002 
01003         /*
01004          * If our children haven't all reached the ready stage,
01005          * wait for them and then repeat this pipeline stage.
01006          */
01007         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
01008             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
01009                 return (ZIO_PIPELINE_STOP);
01010 
        /* Nothing to prepare unless this zio is allocating. */
01011         if (!IO_IS_ALLOCATING(zio))
01012                 return (ZIO_PIPELINE_CONTINUE);
01013 
01014         ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
01015 
        /*
         * A ready-made bp was supplied: copy it over ours and reduce the
         * pipeline to the interlock stages.
         */
01016         if (zio->io_bp_override) {
01017                 ASSERT(bp->blk_birth != zio->io_txg);
01018                 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
01019 
01020                 *bp = *zio->io_bp_override;
01021                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
01022 
                /* Done unless dedup was requested for a non-hole block. */
01023                 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
01024                         return (ZIO_PIPELINE_CONTINUE);
01025 
01026                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
01027                     zp->zp_dedup_verify);
01028 
                /*
                 * The override's checksum is the one dedup wants: mark the
                 * bp as dedup and add the DDT write stage.
                 */
01029                 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
01030                         BP_SET_DEDUP(bp, 1);
01031                         zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
01032                         return (ZIO_PIPELINE_CONTINUE);
01033                 }
                /*
                 * Checksum mismatch: discard the override and fall through
                 * to the normal write path with a zeroed bp.
                 */
01034                 zio->io_bp_override = NULL;
01035                 BP_ZERO(bp);
01036         }
01037 
01038         if (bp->blk_birth == zio->io_txg) {
01039                 /*
01040                  * We're rewriting an existing block, which means we're
01041                  * working on behalf of spa_sync().  For spa_sync() to
01042                  * converge, it must eventually be the case that we don't
01043                  * have to allocate new blocks.  But compression changes
01044                  * the blocksize, which forces a reallocate, and makes
01045                  * convergence take longer.  Therefore, after the first
01046                  * few passes, stop compressing to ensure convergence.
01047                  */
01048                 pass = spa_sync_pass(spa);
01049 
01050                 ASSERT(zio->io_txg == spa_syncing_txg(spa));
01051                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
01052                 ASSERT(!BP_GET_DEDUP(bp));
01053 
01054                 if (pass > SYNC_PASS_DONT_COMPRESS)
01055                         compress = ZIO_COMPRESS_OFF;
01056 
01057                 /* Make sure someone doesn't change their mind on overwrites */
01058                 ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
01059                     spa_max_replication(spa)) == BP_GET_NDVAS(bp));
01060         }
01061 
01062         if (compress != ZIO_COMPRESS_OFF) {
01063                 void *cbuf = zio_buf_alloc(lsize);
01064                 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
                /*
                 * A result of 0 or lsize: record the block as uncompressed
                 * and release the scratch buffer.  Otherwise push the
                 * compressed buffer as a transform.
                 */
01065                 if (psize == 0 || psize == lsize) {
01066                         compress = ZIO_COMPRESS_OFF;
01067                         zio_buf_free(cbuf, lsize);
01068                 } else {
01069                         ASSERT(psize < lsize);
01070                         zio_push_transform(zio, cbuf, psize, lsize, NULL);
01071                 }
01072         }
01073 
01074         /*
01075          * The final pass of spa_sync() must be all rewrites, but the first
01076          * few passes offer a trade-off: allocating blocks defers convergence,
01077          * but newly allocated blocks are sequential, so they can be written
01078          * to disk faster.  Therefore, we allow the first few passes of
01079          * spa_sync() to allocate new blocks, but force rewrites after that.
01080          * There should only be a handful of blocks after pass 1 in any case.
01081          */
01082         if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
01083             pass > SYNC_PASS_REWRITE) {
01084                 ASSERT(psize != 0);
                /* Keep whichever gang stages the pipeline already had. */
01085                 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
01086                 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
01087                 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
01088         } else {
01089                 BP_ZERO(bp);
01090                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
01091         }
01092 
        /*
         * A zero psize reduces the pipeline to the interlock stages and
         * leaves the bp fields unset; otherwise record the block's
         * properties in the bp.
         */
01093         if (psize == 0) {
01094                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
01095         } else {
01096                 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
01097                 BP_SET_LSIZE(bp, lsize);
01098                 BP_SET_PSIZE(bp, psize);
01099                 BP_SET_COMPRESS(bp, compress);
01100                 BP_SET_CHECKSUM(bp, zp->zp_checksum);
01101                 BP_SET_TYPE(bp, zp->zp_type);
01102                 BP_SET_LEVEL(bp, zp->zp_level);
01103                 BP_SET_DEDUP(bp, zp->zp_dedup);
01104                 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
                /* Dedup writes use the DDT write pipeline instead. */
01105                 if (zp->zp_dedup) {
01106                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
01107                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
01108                         zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
01109                 }
01110         }
01111 
01112         return (ZIO_PIPELINE_CONTINUE);
01113 }
01114 
01115 static int
01116 zio_free_bp_init(zio_t *zio)
01117 {
01118         blkptr_t *bp = zio->io_bp;
01119 
01120         if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
01121                 if (BP_GET_DEDUP(bp))
01122                         zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
01123         }
01124 
01125         return (ZIO_PIPELINE_CONTINUE);
01126 }
01127 
01128 /*
01129  * ==========================================================================
01130  * Execute the I/O pipeline
01131  * ==========================================================================
01132  */
01133 
01134 static void
01135 zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
01136 {
01137         spa_t *spa = zio->io_spa;
01138         zio_type_t t = zio->io_type;
01139         int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);
01140 
01141         ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
01142 
01143         /*
01144          * If we're a config writer or a probe, the normal issue and
01145          * interrupt threads may all be blocked waiting for the config lock.
01146          * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
01147          */
01148         if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
01149                 t = ZIO_TYPE_NULL;
01150 
01151         /*
01152          * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
01153          */
01154         if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
01155                 t = ZIO_TYPE_NULL;
01156 
01157         /*
01158          * If this is a high priority I/O, then use the high priority taskq.
01159          */
01160         if (zio->io_priority == ZIO_PRIORITY_NOW &&
01161             spa->spa_zio_taskq[t][q + 1] != NULL)
01162                 q++;
01163 
01164         ASSERT3U(q, <, ZIO_TASKQ_TYPES);
01165 #ifdef _KERNEL
01166         (void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
01167             (task_func_t *)zio_execute, zio, flags, &zio->io_task);
01168 #else
01169         (void) taskq_dispatch(spa->spa_zio_taskq[t][q],
01170             (task_func_t *)zio_execute, zio, flags);
01171 #endif
01172 }
01173 
01174 static boolean_t
01175 zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
01176 {
01177         kthread_t *executor = zio->io_executor;
01178         spa_t *spa = zio->io_spa;
01179 
01180         for (zio_type_t t = 0; t < ZIO_TYPES; t++)
01181                 if (taskq_member(spa->spa_zio_taskq[t][q], executor))
01182                         return (B_TRUE);
01183 
01184         return (B_FALSE);
01185 }
01186 
01187 static int
01188 zio_issue_async(zio_t *zio)
01189 {
01190         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
01191 
01192         return (ZIO_PIPELINE_STOP);
01193 }
01194 
01195 void
01196 zio_interrupt(zio_t *zio)
01197 {
01198         zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
01199 }
01200 
/*
 * Forward declaration of the table of pipeline stage functions.
 * zio_execute() indexes it with highbit(stage) - 1.
 */
01201 static zio_pipe_stage_t *zio_pipeline[];
01202 
01218 void
01219 zio_execute(zio_t *zio)
01220 {
01221         zio->io_executor = curthread;
01222 
01223         while (zio->io_stage < ZIO_STAGE_DONE) {
01224                 enum zio_stage pipeline = zio->io_pipeline;
01225                 enum zio_stage stage = zio->io_stage;
01226                 int rv;
01227 
01228                 ASSERT(!MUTEX_HELD(&zio->io_lock));
01229                 ASSERT(ISP2(stage));
01230                 ASSERT(zio->io_stall == NULL);
01231 
01232                 do {
01233                         stage <<= 1;
01234                 } while ((stage & pipeline) == 0);
01235 
01236                 ASSERT(stage <= ZIO_STAGE_DONE);
01237 
01238                 /*
01239                  * If we are in interrupt context and this pipeline stage
01240                  * will grab a config lock that is held across I/O,
01241                  * or may wait for an I/O that needs an interrupt thread
01242                  * to complete, issue async to avoid deadlock.
01243                  *
01244                  * For VDEV_IO_START, we cut in line so that the io will
01245                  * be sent to disk promptly.
01246                  */
01247                 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
01248                     zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
01249                         boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
01250                             zio_requeue_io_start_cut_in_line : B_FALSE;
01251                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
01252                         return;
01253                 }
01254 
01255                 zio->io_stage = stage;
01256                 rv = zio_pipeline[highbit(stage) - 1](zio);
01257 
01258                 if (rv == ZIO_PIPELINE_STOP)
01259                         return;
01260 
01261                 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
01262         }
01263 }
01264 
01268 int
01269 zio_wait(zio_t *zio)
01270 {
01271         int error;
01272 
01273         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
01274         ASSERT(zio->io_executor == NULL);
01275 
01276         zio->io_waiter = curthread;
01277 
01278         zio_execute(zio);
01279 
01280         mutex_enter(&zio->io_lock);
01281         while (zio->io_executor != NULL)
01282                 cv_wait(&zio->io_cv, &zio->io_lock);
01283         mutex_exit(&zio->io_lock);
01284 
01285         error = zio->io_error;
01286         zio_destroy(zio);
01287 
01288         return (error);
01289 }
01290 
01291 void
01292 zio_nowait(zio_t *zio)
01293 {
01294         ASSERT(zio->io_executor == NULL);
01295 
01296         if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
01297             zio_unique_parent(zio) == NULL) {
01298                 /*
01299                  * This is a logical async I/O with no parent to wait for it.
01300                  * We add it to the spa_async_root_zio "Godfather" I/O which
01301                  * will ensure they complete prior to unloading the pool.
01302                  */
01303                 spa_t *spa = zio->io_spa;
01304 
01305                 zio_add_child(spa->spa_async_zio_root, zio);
01306         }
01307 
01308         zio_execute(zio);
01309 }
01310 
01314 static void
01315 zio_reexecute(zio_t *pio)
01316 {
01317         zio_t *cio, *cio_next;
01318 
01319         ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
01320         ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
01321         ASSERT(pio->io_gang_leader == NULL);
01322         ASSERT(pio->io_gang_tree == NULL);
01323 
01324         pio->io_flags = pio->io_orig_flags;
01325         pio->io_stage = pio->io_orig_stage;
01326         pio->io_pipeline = pio->io_orig_pipeline;
01327         pio->io_reexecute = 0;
01328         pio->io_error = 0;
01329         for (int w = 0; w < ZIO_WAIT_TYPES; w++)
01330                 pio->io_state[w] = 0;
01331         for (int c = 0; c < ZIO_CHILD_TYPES; c++)
01332                 pio->io_child_error[c] = 0;
01333 
01334         if (IO_IS_ALLOCATING(pio))
01335                 BP_ZERO(pio->io_bp);
01336 
01337         /*
01338          * As we reexecute pio's children, new children could be created.
01339          * New children go to the head of pio's io_child_list, however,
01340          * so we will (correctly) not reexecute them.  The key is that
01341          * the remainder of pio's io_child_list, from 'cio_next' onward,
01342          * cannot be affected by any side effects of reexecuting 'cio'.
01343          */
01344         for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
01345                 cio_next = zio_walk_children(pio);
01346                 mutex_enter(&pio->io_lock);
01347                 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
01348                         pio->io_children[cio->io_child_type][w]++;
01349                 mutex_exit(&pio->io_lock);
01350                 zio_reexecute(cio);
01351         }
01352 
01353         /*
01354          * Now that all children have been reexecuted, execute the parent.
01355          * We don't reexecute "The Godfather" I/O here as it's the
01356          * responsibility of the caller to wait on him.
01357          */
01358         if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
01359                 zio_execute(pio);
01360 }
01361 
01362 void
01363 zio_suspend(spa_t *spa, zio_t *zio)
01364 {
01365         if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
01366                 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
01367                     "failure and the failure mode property for this pool "
01368                     "is set to panic.", spa_name(spa));
01369 
01370         zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
01371 
01372         mutex_enter(&spa->spa_suspend_lock);
01373 
01374         if (spa->spa_suspend_zio_root == NULL)
01375                 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
01376                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
01377                     ZIO_FLAG_GODFATHER);
01378 
01379         spa->spa_suspended = B_TRUE;
01380 
01381         if (zio != NULL) {
01382                 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
01383                 ASSERT(zio != spa->spa_suspend_zio_root);
01384                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
01385                 ASSERT(zio_unique_parent(zio) == NULL);
01386                 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
01387                 zio_add_child(spa->spa_suspend_zio_root, zio);
01388         }
01389 
01390         mutex_exit(&spa->spa_suspend_lock);
01391 }
01392 
01393 int
01394 zio_resume(spa_t *spa)
01395 {
01396         zio_t *pio;
01397 
01398         /*
01399          * Reexecute all previously suspended i/o.
01400          */
01401         mutex_enter(&spa->spa_suspend_lock);
01402         spa->spa_suspended = B_FALSE;
01403         cv_broadcast(&spa->spa_suspend_cv);
01404         pio = spa->spa_suspend_zio_root;
01405         spa->spa_suspend_zio_root = NULL;
01406         mutex_exit(&spa->spa_suspend_lock);
01407 
01408         if (pio == NULL)
01409                 return (0);
01410 
01411         zio_reexecute(pio);
01412         return (zio_wait(pio));
01413 }
01414 
01415 void
01416 zio_resume_wait(spa_t *spa)
01417 {
01418         mutex_enter(&spa->spa_suspend_lock);
01419         while (spa_suspended(spa))
01420                 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
01421         mutex_exit(&spa->spa_suspend_lock);
01422 }
01423 
01489 static zio_t *
01490 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
01491 {
01492         if (gn != NULL)
01493                 return (pio);
01494 
01495         return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
01496             NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
01497             &pio->io_bookmark));
01498 }
01499 
01500 zio_t *
01501 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
01502 {
01503         zio_t *zio;
01504 
01505         if (gn != NULL) {
01506                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
01507                     gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
01508                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
01509                 /*
01510                  * As we rewrite each gang header, the pipeline will compute
01511                  * a new gang block header checksum for it; but no one will
01512                  * compute a new data checksum, so we do that here.  The one
01513                  * exception is the gang leader: the pipeline already computed
01514                  * its data checksum because that stage precedes gang assembly.
01515                  * (Presently, nothing actually uses interior data checksums;
01516                  * this is just good hygiene.)
01517                  */
01518                 if (gn != pio->io_gang_leader->io_gang_tree) {
01519                         zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
01520                             data, BP_GET_PSIZE(bp));
01521                 }
01522                 /*
01523                  * If we are here to damage data for testing purposes,
01524                  * leave the GBH alone so that we can detect the damage.
01525                  */
01526                 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
01527                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
01528         } else {
01529                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
01530                     data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
01531                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
01532         }
01533 
01534         return (zio);
01535 }
01536 
01537 /* ARGSUSED */
01538 zio_t *
01539 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
01540 {
01541         return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
01542             BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
01543             ZIO_GANG_CHILD_FLAGS(pio)));
01544 }
01545 
01546 /* ARGSUSED */
01547 zio_t *
01548 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
01549 {
01550         return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
01551             NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
01552 }
01553 
/*
 * Gang issue functions, one slot per zio type.  zio_gang_tree_issue()
 * indexes this table with the gang leader's io_type.
 *
 * NOTE(review): the slot order presumably follows the zio type enum
 * (null, read, write, free, claim, ioctl), with NULL for types that have
 * no gang issue function -- confirm against the enum definition.
 */
01554 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
01555         NULL,
01556         zio_read_gang,
01557         zio_rewrite_gang,
01558         zio_free_gang,
01559         zio_claim_gang,
01560         NULL
01561 };
01562 
/*
 * Forward declaration: zio_gang_tree_assemble() passes this function to
 * zio_read() ahead of its definition.
 */
01563 static void zio_gang_tree_assemble_done(zio_t *zio);
01564 
01565 static zio_gang_node_t *
01566 zio_gang_node_alloc(zio_gang_node_t **gnpp)
01567 {
01568         zio_gang_node_t *gn;
01569 
01570         ASSERT(*gnpp == NULL);
01571 
01572         gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
01573         gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
01574         *gnpp = gn;
01575 
01576         return (gn);
01577 }
01578 
01579 static void
01580 zio_gang_node_free(zio_gang_node_t **gnpp)
01581 {
01582         zio_gang_node_t *gn = *gnpp;
01583 
01584         for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
01585                 ASSERT(gn->gn_child[g] == NULL);
01586 
01587         zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
01588         kmem_free(gn, sizeof (*gn));
01589         *gnpp = NULL;
01590 }
01591 
01592 static void
01593 zio_gang_tree_free(zio_gang_node_t **gnpp)
01594 {
01595         zio_gang_node_t *gn = *gnpp;
01596 
01597         if (gn == NULL)
01598                 return;
01599 
01600         for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
01601                 zio_gang_tree_free(&gn->gn_child[g]);
01602 
01603         zio_gang_node_free(gnpp);
01604 }
01605 
01606 static void
01607 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
01608 {
01609         zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
01610 
01611         ASSERT(gio->io_gang_leader == gio);
01612         ASSERT(BP_IS_GANG(bp));
01613 
01614         zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
01615             SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
01616             gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
01617 }
01618 
01619 static void
01620 zio_gang_tree_assemble_done(zio_t *zio)
01621 {
01622         zio_t *gio = zio->io_gang_leader;
01623         zio_gang_node_t *gn = zio->io_private;
01624         blkptr_t *bp = zio->io_bp;
01625 
01626         ASSERT(gio == zio_unique_parent(zio));
01627         ASSERT(zio->io_child_count == 0);
01628 
01629         if (zio->io_error)
01630                 return;
01631 
01632         if (BP_SHOULD_BYTESWAP(bp))
01633                 byteswap_uint64_array(zio->io_data, zio->io_size);
01634 
01635         ASSERT(zio->io_data == gn->gn_gbh);
01636         ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
01637         ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
01638 
01639         for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
01640                 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
01641                 if (!BP_IS_GANG(gbp))
01642                         continue;
01643                 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
01644         }
01645 }
01646 
01647 static void
01648 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
01649 {
01650         zio_t *gio = pio->io_gang_leader;
01651         zio_t *zio;
01652 
01653         ASSERT(BP_IS_GANG(bp) == !!gn);
01654         ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
01655         ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
01656 
01657         /*
01658          * If you're a gang header, your data is in gn->gn_gbh.
01659          * If you're a gang member, your data is in 'data' and gn == NULL.
01660          */
01661         zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
01662 
01663         if (gn != NULL) {
01664                 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
01665 
01666                 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
01667                         blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
01668                         if (BP_IS_HOLE(gbp))
01669                                 continue;
01670                         zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
01671                         data = (char *)data + BP_GET_PSIZE(gbp);
01672                 }
01673         }
01674 
01675         if (gn == gio->io_gang_tree && gio->io_data != NULL)
01676                 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
01677 
01678         if (zio != pio)
01679                 zio_nowait(zio);
01680 }
01681 
01682 static int
01683 zio_gang_assemble(zio_t *zio)
01684 {
01685         blkptr_t *bp = zio->io_bp;
01686 
01687         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
01688         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
01689 
01690         zio->io_gang_leader = zio;
01691 
01692         zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
01693 
01694         return (ZIO_PIPELINE_CONTINUE);
01695 }
01696 
01697 static int
01698 zio_gang_issue(zio_t *zio)
01699 {
01700         blkptr_t *bp = zio->io_bp;
01701 
01702         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
01703                 return (ZIO_PIPELINE_STOP);
01704 
01705         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
01706         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
01707 
01708         if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
01709                 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
01710         else
01711                 zio_gang_tree_free(&zio->io_gang_tree);
01712 
01713         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
01714 
01715         return (ZIO_PIPELINE_CONTINUE);
01716 }
01717 
01718 static void
01719 zio_write_gang_member_ready(zio_t *zio)
01720 {
01721         zio_t *pio = zio_unique_parent(zio);
01722         zio_t *gio = zio->io_gang_leader;
01723         dva_t *cdva = zio->io_bp->blk_dva;
01724         dva_t *pdva = pio->io_bp->blk_dva;
01725         uint64_t asize;
01726 
01727         if (BP_IS_HOLE(zio->io_bp))
01728                 return;
01729 
01730         ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
01731 
01732         ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
01733         ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
01734         ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
01735         ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
01736         ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
01737 
01738         mutex_enter(&pio->io_lock);
01739         for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
01740                 ASSERT(DVA_GET_GANG(&pdva[d]));
01741                 asize = DVA_GET_ASIZE(&pdva[d]);
01742                 asize += DVA_GET_ASIZE(&cdva[d]);
01743                 DVA_SET_ASIZE(&pdva[d], asize);
01744         }
01745         mutex_exit(&pio->io_lock);
01746 }
01747 
/*
 * Write pio's data as a gang block.
 *
 * A gang header of SPA_GANGBLOCKSIZE is allocated from the pool's normal
 * class into pio's bp, and a rewrite zio is created for it.  pio's data
 * is then split into child writes whose block pointers are the header's
 * zg_blkptr[] slots; each child is issued without waiting.  Finally
 * pio's pipeline is reduced to ZIO_INTERLOCK_PIPELINE and the header zio
 * is issued.
 *
 * If the header allocation fails, the error is stored in pio->io_error
 * and nothing is issued.  Always returns ZIO_PIPELINE_CONTINUE.
 */
01748 static int
01749 zio_write_gang_block(zio_t *pio)
01750 {
01751         spa_t *spa = pio->io_spa;
01752         blkptr_t *bp = pio->io_bp;
01753         zio_t *gio = pio->io_gang_leader;
01754         zio_t *zio;
01755         zio_gang_node_t *gn, **gnpp;
01756         zio_gbh_phys_t *gbh;
01757         uint64_t txg = pio->io_txg;
01758         uint64_t resid = pio->io_size;
01759         uint64_t lsize;
01760         int copies = gio->io_prop.zp_copies;
        /* One more header copy than data copies, capped by the pool. */
01761         int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
01762         zio_prop_t zp;
01763         int error;
01764 
        /* The gang leader's bp is passed as a hint unless pio is the leader. */
01765         error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
01766             bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
01767             METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
01768         if (error) {
01769                 pio->io_error = error;
01770                 return (ZIO_PIPELINE_CONTINUE);
01771         }
01772 
        /*
         * The leader's node is the root of its gang tree.  Any other pio
         * is itself a gang member write, whose io_private holds the
         * address of its slot in the parent node's gn_child[].
         */
01773         if (pio == gio) {
01774                 gnpp = &gio->io_gang_tree;
01775         } else {
01776                 gnpp = pio->io_private;
01777                 ASSERT(pio->io_ready == zio_write_gang_member_ready);
01778         }
01779 
01780         gn = zio_gang_node_alloc(gnpp);
01781         gbh = gn->gn_gbh;
01782         bzero(gbh, SPA_GANGBLOCKSIZE);
01783 
01784         /*
01785          * Create the gang header.
01786          */
01787         zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
01788             pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
01789 
01790         /*
01791          * Create and nowait the gang children.
01792          */
        /*
         * Each child takes an even share of the remaining bytes over the
         * remaining header slots, rounded up to SPA_MINBLOCKSIZE.
         */
01793         for (int g = 0; resid != 0; resid -= lsize, g++) {
01794                 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
01795                     SPA_MINBLOCKSIZE);
01796                 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
01797 
                /*
                 * Children inherit the leader's checksum and copies; they
                 * are written uncompressed and without dedup.
                 */
01798                 zp.zp_checksum = gio->io_prop.zp_checksum;
01799                 zp.zp_compress = ZIO_COMPRESS_OFF;
01800                 zp.zp_type = DMU_OT_NONE;
01801                 zp.zp_level = 0;
01802                 zp.zp_copies = gio->io_prop.zp_copies;
01803                 zp.zp_dedup = 0;
01804                 zp.zp_dedup_verify = 0;
01805 
                /* The child's data starts where the previous one ended. */
01806                 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
01807                     (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
01808                     zio_write_gang_member_ready, NULL, &gn->gn_child[g],
01809                     pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
01810                     &pio->io_bookmark));
01811         }
01812 
01813         /*
01814          * Set pio's pipeline to just wait for zio to finish.
01815          */
01816         pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
01817 
01818         zio_nowait(zio);
01819 
01820         return (ZIO_PIPELINE_CONTINUE);
01821 }
01822 
01823 /*
01824  * ==========================================================================
01825  * Dedup
01826  * ==========================================================================
01827  */
01828 static void
01829 zio_ddt_child_read_done(zio_t *zio)
01830 {
01831         blkptr_t *bp = zio->io_bp;
01832         ddt_entry_t *dde = zio->io_private;
01833         ddt_phys_t *ddp;
01834         zio_t *pio = zio_unique_parent(zio);
01835 
01836         mutex_enter(&pio->io_lock);
01837         ddp = ddt_phys_select(dde, bp);
01838         if (zio->io_error == 0)
01839                 ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
01840         if (zio->io_error == 0 && dde->dde_repair_data == NULL)
01841                 dde->dde_repair_data = zio->io_data;
01842         else
01843                 zio_buf_free(zio->io_data, zio->io_size);
01844         mutex_exit(&pio->io_lock);
01845 }
01846 
01847 static int
01848 zio_ddt_read_start(zio_t *zio)
01849 {
01850         blkptr_t *bp = zio->io_bp;
01851 
01852         ASSERT(BP_GET_DEDUP(bp));
01853         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
01854         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
01855 
01856         if (zio->io_child_error[ZIO_CHILD_DDT]) {
01857                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
01858                 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
01859                 ddt_phys_t *ddp = dde->dde_phys;
01860                 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
01861                 blkptr_t blk;
01862 
01863                 ASSERT(zio->io_vsd == NULL);
01864                 zio->io_vsd = dde;
01865 
01866                 if (ddp_self == NULL)
01867                         return (ZIO_PIPELINE_CONTINUE);
01868 
01869                 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
01870                         if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
01871                                 continue;
01872                         ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
01873                             &blk);
01874                         zio_nowait(zio_read(zio, zio->io_spa, &blk,
01875                             zio_buf_alloc(zio->io_size), zio->io_size,
01876                             zio_ddt_child_read_done, dde, zio->io_priority,
01877                             ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
01878                             &zio->io_bookmark));
01879                 }
01880                 return (ZIO_PIPELINE_CONTINUE);
01881         }
01882 
01883         zio_nowait(zio_read(zio, zio->io_spa, bp,
01884             zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
01885             ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
01886 
01887         return (ZIO_PIPELINE_CONTINUE);
01888 }
01889 
01890 static int
01891 zio_ddt_read_done(zio_t *zio)
01892 {
01893         blkptr_t *bp = zio->io_bp;
01894 
01895         if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
01896                 return (ZIO_PIPELINE_STOP);
01897 
01898         ASSERT(BP_GET_DEDUP(bp));
01899         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
01900         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
01901 
01902         if (zio->io_child_error[ZIO_CHILD_DDT]) {
01903                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
01904                 ddt_entry_t *dde = zio->io_vsd;
01905                 if (ddt == NULL) {
01906                         ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
01907                         return (ZIO_PIPELINE_CONTINUE);
01908                 }
01909                 if (dde == NULL) {
01910                         zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
01911                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
01912                         return (ZIO_PIPELINE_STOP);
01913                 }
01914                 if (dde->dde_repair_data != NULL) {
01915                         bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
01916                         zio->io_child_error[ZIO_CHILD_DDT] = 0;
01917                 }
01918                 ddt_repair_done(ddt, dde);
01919                 zio->io_vsd = NULL;
01920         }
01921 
01922         ASSERT(zio->io_vsd == NULL);
01923 
01924         return (ZIO_PIPELINE_CONTINUE);
01925 }
01926 
01927 static boolean_t
01928 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
01929 {
01930         spa_t *spa = zio->io_spa;
01931 
01932         /*
01933          * Note: we compare the original data, not the transformed data,
01934          * because when zio->io_bp is an override bp, we will not have
01935          * pushed the I/O transforms.  That's an important optimization
01936          * because otherwise we'd compress/encrypt all dmu_sync() data twice.
01937          */
01938         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
01939                 zio_t *lio = dde->dde_lead_zio[p];
01940 
01941                 if (lio != NULL) {
01942                         return (lio->io_orig_size != zio->io_orig_size ||
01943                             bcmp(zio->io_orig_data, lio->io_orig_data,
01944                             zio->io_orig_size) != 0);
01945                 }
01946         }
01947 
01948         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
01949                 ddt_phys_t *ddp = &dde->dde_phys[p];
01950 
01951                 if (ddp->ddp_phys_birth != 0) {
01952                         arc_buf_t *abuf = NULL;
01953                         uint32_t aflags = ARC_WAIT;
01954                         blkptr_t blk = *zio->io_bp;
01955                         int error;
01956 
01957                         ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
01958 
01959                         ddt_exit(ddt);
01960 
01961                         error = arc_read_nolock(NULL, spa, &blk,
01962                             arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
01963                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
01964                             &aflags, &zio->io_bookmark);
01965 
01966                         if (error == 0) {
01967                                 if (arc_buf_size(abuf) != zio->io_orig_size ||
01968                                     bcmp(abuf->b_data, zio->io_orig_data,
01969                                     zio->io_orig_size) != 0)
01970                                         error = EEXIST;
01971                                 VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
01972                         }
01973 
01974                         ddt_enter(ddt);
01975                         return (error != 0);
01976                 }
01977         }
01978 
01979         return (B_FALSE);
01980 }
01981 
01982 static void
01983 zio_ddt_child_write_ready(zio_t *zio)
01984 {
01985         int p = zio->io_prop.zp_copies;
01986         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
01987         ddt_entry_t *dde = zio->io_private;
01988         ddt_phys_t *ddp = &dde->dde_phys[p];
01989         zio_t *pio;
01990 
01991         if (zio->io_error)
01992                 return;
01993 
01994         ddt_enter(ddt);
01995 
01996         ASSERT(dde->dde_lead_zio[p] == zio);
01997 
01998         ddt_phys_fill(ddp, zio->io_bp);
01999 
02000         while ((pio = zio_walk_parents(zio)) != NULL)
02001                 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
02002 
02003         ddt_exit(ddt);
02004 }
02005 
02006 static void
02007 zio_ddt_child_write_done(zio_t *zio)
02008 {
02009         int p = zio->io_prop.zp_copies;
02010         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
02011         ddt_entry_t *dde = zio->io_private;
02012         ddt_phys_t *ddp = &dde->dde_phys[p];
02013 
02014         ddt_enter(ddt);
02015 
02016         ASSERT(ddp->ddp_refcnt == 0);
02017         ASSERT(dde->dde_lead_zio[p] == zio);
02018         dde->dde_lead_zio[p] = NULL;
02019 
02020         if (zio->io_error == 0) {
02021                 while (zio_walk_parents(zio) != NULL)
02022                         ddt_phys_addref(ddp);
02023         } else {
02024                 ddt_phys_clear(ddp);
02025         }
02026 
02027         ddt_exit(ddt);
02028 }
02029 
02030 static void
02031 zio_ddt_ditto_write_done(zio_t *zio)
02032 {
02033         int p = DDT_PHYS_DITTO;
02034         zio_prop_t *zp = &zio->io_prop;
02035         blkptr_t *bp = zio->io_bp;
02036         ddt_t *ddt = ddt_select(zio->io_spa, bp);
02037         ddt_entry_t *dde = zio->io_private;
02038         ddt_phys_t *ddp = &dde->dde_phys[p];
02039         ddt_key_t *ddk = &dde->dde_key;
02040 
02041         ddt_enter(ddt);
02042 
02043         ASSERT(ddp->ddp_refcnt == 0);
02044         ASSERT(dde->dde_lead_zio[p] == zio);
02045         dde->dde_lead_zio[p] = NULL;
02046 
02047         if (zio->io_error == 0) {
02048                 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
02049                 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
02050                 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
02051                 if (ddp->ddp_phys_birth != 0)
02052                         ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
02053                 ddt_phys_fill(ddp, bp);
02054         }
02055 
02056         ddt_exit(ddt);
02057 }
02058 
/*
 * Pipeline stage for a dedup write.
 *
 * Looks up the dedup entry for zio's bp under the ddt lock and then takes
 * one of these paths:
 *  - verify is on and the data collides with the entry: fall back to the
 *    ordinary write pipeline (after upgrading the checksum if the current
 *    one is not dedup-capable, otherwise with dedup turned off);
 *  - more ditto copies are needed and none is in flight: create a ditto
 *    child write (or, with an override bp, drop the override and restart);
 *  - the block already exists or a lead write is in flight: share it;
 *  - an override bp is present: record it in the entry;
 *  - otherwise: create the lead child write for this copy count.
 *
 * Always returns ZIO_PIPELINE_CONTINUE.
 */
02059 static int
02060 zio_ddt_write(zio_t *zio)
02061 {
02062         spa_t *spa = zio->io_spa;
02063         blkptr_t *bp = zio->io_bp;
02064         uint64_t txg = zio->io_txg;
02065         zio_prop_t *zp = &zio->io_prop;
      /* The requested copy count doubles as the index into dde_phys[]. */
02066         int p = zp->zp_copies;
02067         int ditto_copies;
02068         zio_t *cio = NULL;
02069         zio_t *dio = NULL;
02070         ddt_t *ddt = ddt_select(spa, bp);
02071         ddt_entry_t *dde;
02072         ddt_phys_t *ddp;
02073 
02074         ASSERT(BP_GET_DEDUP(bp));
02075         ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
02076         ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
02077 
      /* Take the ddt lock; every return path below releases it. */
02078         ddt_enter(ddt);
02079         dde = ddt_lookup(ddt, bp, B_TRUE);
02080         ddp = &dde->dde_phys[p];
02081 
02082         if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
02083                 /*
02084                  * If we're using a weak checksum, upgrade to a strong checksum
02085                  * and try again.  If we're already using a strong checksum,
02086                  * we can't resolve it, so just convert to an ordinary write.
02087                  * (And automatically e-mail a paper to Nature?)
02088                  */
02089                 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
02090                         zp->zp_checksum = spa_dedup_checksum(spa);
02091                         zio_pop_transforms(zio);
02092                         zio->io_stage = ZIO_STAGE_OPEN;
02093                         BP_ZERO(bp);
02094                 } else {
02095                         zp->zp_dedup = 0;
02096                 }
02097                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
02098                 ddt_exit(ddt);
02099                 return (ZIO_PIPELINE_CONTINUE);
02100         }
02101 
02102         ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
02103         ASSERT(ditto_copies < SPA_DVAS_PER_BP);
02104 
      /*
       * Create a ditto write only if more ditto copies are wanted than
       * exist and no ditto write for this entry is already in flight.
       */
02105         if (ditto_copies > ddt_ditto_copies_present(dde) &&
02106             dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
02107                 zio_prop_t czp = *zp;
02108 
02109                 czp.zp_copies = ditto_copies;
02110 
02111                 /*
02112                  * If we arrived here with an override bp, we won't have run
02113                  * the transform stack, so we won't have the data we need to
02114                  * generate a child i/o.  So, toss the override bp and restart.
02115                  * This is safe, because using the override bp is just an
02116                  * optimization; and it's rare, so the cost doesn't matter.
02117                  */
02118                 if (zio->io_bp_override) {
02119                         zio_pop_transforms(zio);
02120                         zio->io_stage = ZIO_STAGE_OPEN;
02121                         zio->io_pipeline = ZIO_WRITE_PIPELINE;
02122                         zio->io_bp_override = NULL;
02123                         BP_ZERO(bp);
02124                         ddt_exit(ddt);
02125                         return (ZIO_PIPELINE_CONTINUE);
02126                 }
02127 
02128                 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
02129                     zio->io_orig_size, &czp, NULL,
02130                     zio_ddt_ditto_write_done, dde, zio->io_priority,
02131                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
02132 
02133                 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
02134                 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
02135         }
02136 
      /*
       * Three cases for copy class p:
       *  1. the block already exists on disk and/or a lead write for it is
       *     in flight: fill our bp from the phys if it is born, then either
       *     become a parent of the lead write or take a reference;
       *  2. we carry an override bp: record it in the phys and take a
       *     reference;
       *  3. otherwise: create the lead child write ourselves.
       */
02137         if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
02138                 if (ddp->ddp_phys_birth != 0)
02139                         ddt_bp_fill(ddp, bp, txg);
02140                 if (dde->dde_lead_zio[p] != NULL)
02141                         zio_add_child(zio, dde->dde_lead_zio[p]);
02142                 else
02143                         ddt_phys_addref(ddp);
02144         } else if (zio->io_bp_override) {
02145                 ASSERT(bp->blk_birth == txg);
02146                 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
02147                 ddt_phys_fill(ddp, bp);
02148                 ddt_phys_addref(ddp);
02149         } else {
02150                 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
02151                     zio->io_orig_size, zp, zio_ddt_child_write_ready,
02152                     zio_ddt_child_write_done, dde, zio->io_priority,
02153                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
02154 
02155                 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
02156                 dde->dde_lead_zio[p] = cio;
02157         }
02158 
02159         ddt_exit(ddt);
02160 
      /* Children are created under the lock but issued only after it is dropped. */
02161         if (cio)
02162                 zio_nowait(cio);
02163         if (dio)
02164                 zio_nowait(dio);
02165 
02166         return (ZIO_PIPELINE_CONTINUE);
02167 }
02168 
02169 ddt_entry_t *freedde; /* for debugging */
02170 
02171 static int
02172 zio_ddt_free(zio_t *zio)
02173 {
02174         spa_t *spa = zio->io_spa;
02175         blkptr_t *bp = zio->io_bp;
02176         ddt_t *ddt = ddt_select(spa, bp);
02177         ddt_entry_t *dde;
02178         ddt_phys_t *ddp;
02179 
02180         ASSERT(BP_GET_DEDUP(bp));
02181         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
02182 
02183         ddt_enter(ddt);
02184         freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
02185         ddp = ddt_phys_select(dde, bp);
02186         ddt_phys_decref(ddp);
02187         ddt_exit(ddt);
02188 
02189         return (ZIO_PIPELINE_CONTINUE);
02190 }
02191 
02192 /*
02193  * ==========================================================================
02194  * Allocate and free blocks
02195  * ==========================================================================
02196  */
02197 static int
02198 zio_dva_allocate(zio_t *zio)
02199 {
02200         spa_t *spa = zio->io_spa;
02201         metaslab_class_t *mc = spa_normal_class(spa);
02202         blkptr_t *bp = zio->io_bp;
02203         int error;
02204         int flags = 0;
02205 
02206         if (zio->io_gang_leader == NULL) {
02207                 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
02208                 zio->io_gang_leader = zio;
02209         }
02210 
02211         ASSERT(BP_IS_HOLE(bp));
02212         ASSERT0(BP_GET_NDVAS(bp));
02213         ASSERT3U(zio->io_prop.zp_copies, >, 0);
02214         ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
02215         ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
02216 
02217         /*
02218          * The dump device does not support gang blocks so allocation on
02219          * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
02220          * the "fast" gang feature.
02221          */
02222         flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
02223         flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
02224             METASLAB_GANG_CHILD : 0;
02225         error = metaslab_alloc(spa, mc, zio->io_size, bp,
02226             zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
02227 
02228         if (error) {
02229                 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
02230                     "size %llu, error %d", spa_name(spa), zio, zio->io_size,
02231                     error);
02232                 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
02233                         return (zio_write_gang_block(zio));
02234                 zio->io_error = error;
02235         }
02236 
02237         return (ZIO_PIPELINE_CONTINUE);
02238 }
02239 
02240 static int
02241 zio_dva_free(zio_t *zio)
02242 {
02243         metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
02244 
02245         return (ZIO_PIPELINE_CONTINUE);
02246 }
02247 
02248 static int
02249 zio_dva_claim(zio_t *zio)
02250 {
02251         int error;
02252 
02253         error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
02254         if (error)
02255                 zio->io_error = error;
02256 
02257         return (ZIO_PIPELINE_CONTINUE);
02258 }
02259 
02265 static void
02266 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
02267 {
02268         ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
02269         ASSERT(zio->io_bp_override == NULL);
02270 
02271         if (!BP_IS_HOLE(bp))
02272                 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
02273 
02274         if (gn != NULL) {
02275                 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
02276                         zio_dva_unallocate(zio, gn->gn_child[g],
02277                             &gn->gn_gbh->zg_blkptr[g]);
02278                 }
02279         }
02280 }
02281 
02287 int
02288 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
02289     uint64_t size, boolean_t use_slog)
02290 {
02291         int error = 1;
02292 
02293         ASSERT(txg > spa_syncing_txg(spa));
02294 
02295         /*
02296          * ZIL blocks are always contiguous (i.e. not gang blocks) so we
02297          * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
02298          * when allocating them.
02299          */
02300         if (use_slog) {
02301                 error = metaslab_alloc(spa, spa_log_class(spa), size,
02302                     new_bp, 1, txg, old_bp,
02303                     METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
02304         }
02305 
02306         if (error) {
02307                 error = metaslab_alloc(spa, spa_normal_class(spa), size,
02308                     new_bp, 1, txg, old_bp,
02309                     METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
02310         }
02311 
02312         if (error == 0) {
02313                 BP_SET_LSIZE(new_bp, size);
02314                 BP_SET_PSIZE(new_bp, size);
02315                 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
02316                 BP_SET_CHECKSUM(new_bp,
02317                     spa_version(spa) >= SPA_VERSION_SLIM_ZIL
02318                     ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
02319                 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
02320                 BP_SET_LEVEL(new_bp, 0);
02321                 BP_SET_DEDUP(new_bp, 0);
02322                 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
02323         }
02324 
02325         return (error);
02326 }
02327 
02331 void
02332 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
02333 {
02334         ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
02335         ASSERT(!BP_IS_GANG(bp));
02336 
02337         zio_free(spa, txg, bp);
02338 }
02339 
02340 /*
02341  * ==========================================================================
02342  * Read and write to physical devices
02343  * ==========================================================================
02344  */
/*
 * Pipeline stage that hands an I/O to the vdev layer.
 *
 * - With no vdev (vd == NULL) the I/O is routed through vdev_mirror_ops,
 *   after taking the SCL_ZIO config lock as reader unless the zio carries
 *   ZIO_FLAG_CONFIG_WRITER.
 * - Frees on leaf vdevs are only recorded with trim_map_free().
 * - I/Os whose size is not a multiple of the top-level vdev's alignment
 *   (1 << vdev_ashift) are padded through a pushed transform.
 * - Repair writes for a txg not in the vdev's DTL are bypassed.
 * - Leaf reads/writes go through the vdev cache and the vdev queue before
 *   being passed to the vdev's own vdev_op_io_start().
 *
 * Returns ZIO_PIPELINE_CONTINUE or ZIO_PIPELINE_STOP directly on the
 * short-circuit paths, otherwise the return value of vdev_op_io_start().
 */
02345 static int
02346 zio_vdev_io_start(zio_t *zio)
02347 {
02348         vdev_t *vd = zio->io_vd;
02349         uint64_t align;
02350         spa_t *spa = zio->io_spa;
02351 
02352         ASSERT(zio->io_error == 0);
02353         ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
02354 
02355         if (vd == NULL) {
02356                 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
02357                         spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
02358 
02359                 /*
02360                  * The mirror_ops handle multiple DVAs in a single BP.
02361                  */
02362                 return (vdev_mirror_ops.vdev_op_io_start(zio));
02363         }
02364 
      /* A free on a leaf vdev is only noted in the trim map; no device I/O here. */
02365         if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
02366                 trim_map_free(zio);
02367                 return (ZIO_PIPELINE_CONTINUE);
02368         }
02369 
02370         /*
02371          * We keep track of time-sensitive I/Os so that the scan thread
02372          * can quickly react to certain workloads.  In particular, we care
02373          * about non-scrubbing, top-level reads and writes with the following
02374          * characteristics:
02375          *      - synchronous writes of user data to non-slog devices
02376          *      - any reads of user data
02377          * When these conditions are met, adjust the timestamp of spa_last_io
02378          * which allows the scan thread to adjust its workload accordingly.
02379          */
02380         if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
02381             vd == vd->vdev_top && !vd->vdev_islog &&
02382             zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
02383             zio->io_txg != spa_syncing_txg(spa)) {
02384                 uint64_t old = spa->spa_last_io;
02385                 uint64_t new = ddi_get_lbolt64();
              /* The compare-and-swap result is deliberately ignored. */
02386                 if (old != new)
02387                         (void) atomic_cas_64(&spa->spa_last_io, old, new);
02388         }
02389 
02390         align = 1ULL << vd->vdev_top->vdev_ashift;
02391 
      /*
       * Unaligned size: round the I/O up to the alignment.  A bounce buffer
       * is allocated only for reads and writes; for writes it is filled
       * with the caller's data followed by zeroes.  Other I/O types push a
       * transform with a NULL buffer and a buffer size of 0.
       */
02392         if (P2PHASE(zio->io_size, align) != 0) {
02393                 uint64_t asize = P2ROUNDUP(zio->io_size, align);
02394                 char *abuf = NULL;
02395                 if (zio->io_type == ZIO_TYPE_READ ||
02396                     zio->io_type == ZIO_TYPE_WRITE)
02397                         abuf = zio_buf_alloc(asize);
02398                 ASSERT(vd == vd->vdev_top);
02399                 if (zio->io_type == ZIO_TYPE_WRITE) {
02400                         bcopy(zio->io_data, abuf, zio->io_size);
02401                         bzero(abuf + zio->io_size, asize - zio->io_size);
02402                 }
02403                 zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
02404                     zio_subblock);
02405         }
02406 
02407         ASSERT(P2PHASE(zio->io_offset, align) == 0);
02408         ASSERT(P2PHASE(zio->io_size, align) == 0);
02409         VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
02410 
02411         /*
02412          * If this is a repair I/O, and there's no self-healing involved --
02413          * that is, we're just resilvering what we expect to resilver --
02414          * then don't do the I/O unless zio's txg is actually in vd's DTL.
02415          * This prevents spurious resilvering with nested replication.
02416          * For example, given a mirror of mirrors, (A+B)+(C+D), if only
02417          * A is out of date, we'll read from C+D, then use the data to
02418          * resilver A+B -- but we don't actually want to resilver B, just A.
02419          * The top-level mirror has no way to know this, so instead we just
02420          * discard unnecessary repairs as we work our way down the vdev tree.
02421          * The same logic applies to any form of nested replication:
02422          * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
02423          */
02424         if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
02425             !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
02426             zio->io_txg != 0 && /* not a delegated i/o */
02427             !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
02428                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
02429                 zio_vdev_io_bypass(zio);
02430                 return (ZIO_PIPELINE_CONTINUE);
02431         }
02432 
02433         if (vd->vdev_ops->vdev_op_leaf &&
02434             (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
02435 
              /* A read for which vdev_cache_read() returns 0 needs no device I/O. */
02436                 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
02437                         return (ZIO_PIPELINE_CONTINUE);
02438 
              /*
               * vdev_queue_io() may return a different zio than it was
               * given, or NULL when there is nothing to issue right now.
               */
02439                 if ((zio = vdev_queue_io(zio)) == NULL)
02440                         return (ZIO_PIPELINE_STOP);
02441 
02442                 if (!vdev_accessible(vd, zio)) {
02443                         zio->io_error = ENXIO;
02444                         zio_interrupt(zio);
02445                         return (ZIO_PIPELINE_STOP);
02446                 }
02447         }
02448 
      /* Leaf writes stop here when trim_map_write_start() returns false. */
02449         if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE) {
02450                 if (!trim_map_write_start(zio))
02451                         return (ZIO_PIPELINE_STOP);
02452         }
02453 
02454         return (vd->vdev_ops->vdev_op_io_start(zio));
02455 }
02456 
02457 static int
02458 zio_vdev_io_done(zio_t *zio)
02459 {
02460         vdev_t *vd = zio->io_vd;
02461         vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
02462         boolean_t unexpected_error = B_FALSE;
02463 
02464         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
02465                 return (ZIO_PIPELINE_STOP);
02466 
02467         ASSERT(zio->io_type == ZIO_TYPE_READ ||
02468             zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
02469 
02470         if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
02471             zio->io_type == ZIO_TYPE_WRITE) {
02472                 trim_map_write_done(zio);
02473         }
02474 
02475         if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
02476             (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
02477 
02478                 vdev_queue_io_done(zio);
02479 
02480                 if (zio->io_type == ZIO_TYPE_WRITE)
02481                         vdev_cache_write(zio);
02482 
02483                 if (zio_injection_enabled && zio->io_error == 0)
02484                         zio->io_error = zio_handle_device_injection(vd,
02485                             zio, EIO);
02486 
02487                 if (zio_injection_enabled && zio->io_error == 0)
02488                         zio->io_error = zio_handle_label_injection(zio, EIO);
02489 
02490                 if (zio->io_error) {
02491                         if (!vdev_accessible(vd, zio)) {
02492                                 zio->io_error = ENXIO;
02493                         } else {
02494                                 unexpected_error = B_TRUE;
02495                         }
02496                 }
02497         }
02498 
02499         ops->vdev_op_io_done(zio);
02500 
02501         if (unexpected_error)
02502                 VERIFY(vdev_probe(vd, zio) == NULL);
02503 
02504         return (ZIO_PIPELINE_CONTINUE);
02505 }
02506 
02511 static void
02512 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
02513     const void *good_buf)
02514 {
02515         /* no processing needed */
02516         zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
02517 }
02518 
02519 /*ARGSUSED*/
02520 void
02521 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
02522 {
02523         void *buf = zio_buf_alloc(zio->io_size);
02524 
02525         bcopy(zio->io_data, buf, zio->io_size);
02526 
02527         zcr->zcr_cbinfo = zio->io_size;
02528         zcr->zcr_cbdata = buf;
02529         zcr->zcr_finish = zio_vsd_default_cksum_finish;
02530         zcr->zcr_free = zio_buf_free;
02531 }
02532 
/*
 * Pipeline stage that evaluates the outcome of the vdev I/O.
 *
 * After all ZIO_CHILD_VDEV children are done it: releases the SCL_ZIO
 * config lock for zios without a vdev (unless ZIO_FLAG_CONFIG_WRITER is
 * set), frees any vdev-specific data in io_vsd, applies fault injection,
 * updates the TRIM statistics for DKIOCTRIM ioctls, re-dispatches a failed
 * zio without a vdev for one retry, maps leaf errors on inaccessible
 * devices to ENXIO, marks interior vdevs that fail writes with ENXIO as
 * vdev_cant_write, and on any remaining error switches the zio to
 * ZIO_INTERLOCK_PIPELINE.
 *
 * Returns ZIO_PIPELINE_STOP while waiting for children or after
 * re-dispatching for a retry, ZIO_PIPELINE_CONTINUE otherwise.
 */
02533 static int
02534 zio_vdev_io_assess(zio_t *zio)
02535 {
02536         vdev_t *vd = zio->io_vd;
02537 
02538         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
02539                 return (ZIO_PIPELINE_STOP);
02540 
      /* Same condition under which zio_vdev_io_start() calls spa_config_enter(). */
02541         if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
02542                 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
02543 
02544         if (zio->io_vsd != NULL) {
02545                 zio->io_vsd_ops->vsd_free(zio);
02546                 zio->io_vsd = NULL;
02547         }
02548 
02549         if (zio_injection_enabled && zio->io_error == 0)
02550                 zio->io_error = zio_handle_fault_injection(zio, EIO);
02551 
      /* Account the result of a TRIM request: success, unsupported, or failed. */
02552         if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
02553                 switch (zio->io_error) {
02554                 case 0:
02555                         ZIO_TRIM_STAT_INCR(zio_trim_bytes, zio->io_size);
02556                         ZIO_TRIM_STAT_BUMP(zio_trim_success);
02557                         break;
02558                 case EOPNOTSUPP:
02559                         ZIO_TRIM_STAT_BUMP(zio_trim_unsupported);
02560                         break;
02561                 default:
02562                         ZIO_TRIM_STAT_BUMP(zio_trim_failed);
02563                         break;
02564                 }
02565 
02566         /*
02567          * If the I/O failed, determine whether we should attempt to retry it.
02568          *
02569          * On retry, we cut in line in the issue queue, since we don't want
02570          * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
02571          */
02572         if (zio->io_error && vd == NULL &&
02573             !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
02574                 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
02575                 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
02576                 zio->io_error = 0;
              /*
               * Setting ZIO_FLAG_IO_RETRY makes the check above fail next
               * time, so the retry happens at most once.
               */
02577                 zio->io_flags |= ZIO_FLAG_IO_RETRY |
02578                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
              /*
               * NOTE(review): presumably stages are single bits, so this
               * parks the zio one stage before VDEV_IO_START and the
               * pipeline runs VDEV_IO_START next -- confirm in zio_impl.h.
               */
02579                 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
02580                 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
02581                     zio_requeue_io_start_cut_in_line);
02582                 return (ZIO_PIPELINE_STOP);
02583         }
02584 
02585         /*
02586          * If we got an error on a leaf device, convert it to ENXIO
02587          * if the device is not accessible at all.
02588          */
02589         if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
02590             !vdev_accessible(vd, zio))
02591                 zio->io_error = ENXIO;
02592 
02593         /*
02594          * If we can't write to an interior vdev (mirror or RAID-Z),
02595          * set vdev_cant_write so that we stop trying to allocate from it.
02596          */
02597         if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
02598             vd != NULL && !vd->vdev_ops->vdev_op_leaf)
02599                 vd->vdev_cant_write = B_TRUE;
02600 
      /* On failure, replace the remaining pipeline with ZIO_INTERLOCK_PIPELINE. */
02601         if (zio->io_error)
02602                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
02603 
02604         return (ZIO_PIPELINE_CONTINUE);
02605 }
02606 
02607 void
02608 zio_vdev_io_reissue(zio_t *zio)
02609 {
02610         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
02611         ASSERT(zio->io_error == 0);
02612 
02613         zio->io_stage >>= 1;
02614 }
02615 
02616 void
02617 zio_vdev_io_redone(zio_t *zio)
02618 {
02619         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
02620 
02621         zio->io_stage >>= 1;
02622 }
02623 
02624 void
02625 zio_vdev_io_bypass(zio_t *zio)
02626 {
02627         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
02628         ASSERT(zio->io_error == 0);
02629 
02630         zio->io_flags |= ZIO_FLAG_IO_BYPASS;
02631         zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
02632 }
02633 
02634 /*
02635  * ==========================================================================
02636  * Generate and verify checksums
02637  * ==========================================================================
02638  */
02639 static int
02640 zio_checksum_generate(zio_t *zio)
02641 {
02642         blkptr_t *bp = zio->io_bp;
02643         enum zio_checksum checksum;
02644 
02645         if (bp == NULL) {
02646                 /*
02647                  * This is zio_write_phys().
02648                  * We're either generating a label checksum, or none at all.
02649                  */
02650                 checksum = zio->io_prop.zp_checksum;
02651 
02652                 if (checksum == ZIO_CHECKSUM_OFF)
02653                         return (ZIO_PIPELINE_CONTINUE);
02654 
02655                 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
02656         } else {
02657                 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
02658                         ASSERT(!IO_IS_ALLOCATING(zio));
02659                         checksum = ZIO_CHECKSUM_GANG_HEADER;
02660                 } else {
02661                         checksum = BP_GET_CHECKSUM(bp);
02662                 }
02663         }
02664 
02665         zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
02666 
02667         return (ZIO_PIPELINE_CONTINUE);
02668 }
02669 
02670 static int
02671 zio_checksum_verify(zio_t *zio)
02672 {
02673         zio_bad_cksum_t info;
02674         blkptr_t *bp = zio->io_bp;
02675         int error;
02676 
02677         ASSERT(zio->io_vd != NULL);
02678 
02679         if (bp == NULL) {
02680                 /*
02681                  * This is zio_read_phys().
02682                  * We're either verifying a label checksum, or nothing at all.
02683                  */
02684                 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
02685                         return (ZIO_PIPELINE_CONTINUE);
02686 
02687                 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
02688         }
02689 
02690         if ((error = zio_checksum_error(zio, &info)) != 0) {
02691                 zio->io_error = error;
02692                 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
02693                         zfs_ereport_start_checksum(zio->io_spa,
02694                             zio->io_vd, zio, zio->io_offset,
02695                             zio->io_size, NULL, &info);
02696                 }
02697         }
02698 
02699         return (ZIO_PIPELINE_CONTINUE);
02700 }
02701 
02705 void
02706 zio_checksum_verified(zio_t *zio)
02707 {
02708         zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
02709 }
02710 
/*
 * Map an error code to its severity rank: the index of the error in the
 * ranking table below, or the table length for any error not listed
 * (so unlisted errors rank above every listed one).
 */
static size_t
zio_error_severity(int error)
{
	/* Listed from least to most severe. */
	static const int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
	const size_t nranks =
	    sizeof (zio_error_rank) / sizeof (zio_error_rank[0]);
	size_t rank;

	for (rank = 0; rank < nranks; rank++) {
		if (error == zio_error_rank[rank])
			break;
	}

	return (rank);
}

/*
 * Return the more severe of two error codes.
 *
 * Severity order, lowest first: 0, ENXIO, ECKSUM, EIO, then any other
 * error.  When both errors have the same rank (including two different
 * unlisted errors), e2 is returned.
 */
int
zio_worst_error(int e1, int e2)
{
	return (zio_error_severity(e1) > zio_error_severity(e2) ? e1 : e2);
}
02736 
02737 /*
02738  * ==========================================================================
02739  * I/O completion
02740  * ==========================================================================
02741  */
02742 static int
02743 zio_ready(zio_t *zio)
02744 {
02745         blkptr_t *bp = zio->io_bp;
02746         zio_t *pio, *pio_next;
02747 
02748         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
02749             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
02750                 return (ZIO_PIPELINE_STOP);
02751 
02752         if (zio->io_ready) {
02753                 ASSERT(IO_IS_ALLOCATING(zio));
02754                 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
02755                 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
02756 
02757                 zio->io_ready(zio);
02758         }
02759 
02760         if (bp != NULL && bp != &zio->io_bp_copy)
02761                 zio->io_bp_copy = *bp;
02762 
02763         if (zio->io_error)
02764                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
02765 
02766         mutex_enter(&zio->io_lock);
02767         zio->io_state[ZIO_WAIT_READY] = 1;
02768         pio = zio_walk_parents(zio);
02769         mutex_exit(&zio->io_lock);
02770 
02771         /*
02772          * As we notify zio's parents, new parents could be added.
02773          * New parents go to the head of zio's io_parent_list, however,
02774          * so we will (correctly) not notify them.  The remainder of zio's
02775          * io_parent_list, from 'pio_next' onward, cannot change because
02776          * all parents must wait for us to be done before they can be done.
02777          */
02778         for (; pio != NULL; pio = pio_next) {
02779                 pio_next = zio_walk_parents(zio);
02780                 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
02781         }
02782 
02783         if (zio->io_flags & ZIO_FLAG_NODATA) {
02784                 if (BP_IS_GANG(bp)) {
02785                         zio->io_flags &= ~ZIO_FLAG_NODATA;
02786                 } else {
02787                         ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
02788                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
02789                 }
02790         }
02791 
02792         if (zio_injection_enabled &&
02793             zio->io_spa->spa_syncing_txg == zio->io_txg)
02794                 zio_handle_ignored_writes(zio);
02795 
02796         return (ZIO_PIPELINE_CONTINUE);
02797 }
02798 
/*
 * Final pipeline stage: complete (or re-execute) a zio.
 *
 * In order, this function:
 *   1. waits until all vdev, gang, DDT and logical children are done,
 *      returning ZIO_PIPELINE_STOP while any are outstanding;
 *   2. inherits vdev/gang/DDT child errors, finishes pending checksum
 *      reports, pops transforms and updates vdev statistics;
 *   3. posts ereports for failures and, for a failed logical zio,
 *      decides whether it must be re-executed now or suspended;
 *   4. inherits logical child errors and releases gang/DVA state;
 *   5. if re-execution was requested, marks the zio done, notifies its
 *      parent (or suspends / dispatches zio_reexecute for a root zio)
 *      and stops;
 *   6. otherwise calls the io_done callback, unlinks and notifies every
 *      parent, and either wakes the waiter or destroys the zio.
 *
 * Every path through this function returns ZIO_PIPELINE_STOP.
 */
02799 static int
02800 zio_done(zio_t *zio)
02801 {
02802         spa_t *spa = zio->io_spa;
02803         zio_t *lio = zio->io_logical;
02804         blkptr_t *bp = zio->io_bp;
02805         vdev_t *vd = zio->io_vd;
              /* I/O size as of entry; used below for report padding and stats. */
02806         uint64_t psize = zio->io_size;
02807         zio_t *pio, *pio_next;
02808 
02809         /*
02810          * If our children haven't all completed,
02811          * wait for them and then repeat this pipeline stage.
02812          */
02813         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
02814             zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
02815             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
02816             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
02817                 return (ZIO_PIPELINE_STOP);
02818 
              /* No child of any type may still be counted in any wait state. */
02819         for (int c = 0; c < ZIO_CHILD_TYPES; c++)
02820                 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
02821                         ASSERT(zio->io_children[c][w] == 0);
02822 
              /* Debug-only sanity checks on the block pointer. */
02823         if (bp != NULL) {
02824                 ASSERT(bp->blk_pad[0] == 0);
02825                 ASSERT(bp->blk_pad[1] == 0);
02826                 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
02827                     (bp == zio_unique_parent(zio)->io_bp));
02828                 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
02829                     zio->io_bp_override == NULL &&
02830                     !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
02831                         ASSERT(!BP_SHOULD_BYTESWAP(bp));
02832                         ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
02833                         ASSERT(BP_COUNT_GANG(bp) == 0 ||
02834                             (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
02835                 }
02836         }
02837 
02838         /*
02839          * If there were child vdev/gang/ddt errors, they apply to us now.
02840          */
02841         zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
02842         zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
02843         zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
02844 
02845         /*
02846          * If the I/O on the transformed data was successful, generate any
02847          * checksum reports now while we still have the transformed data.
02848          */
02849         if (zio->io_error == 0) {
02850                 while (zio->io_cksum_report != NULL) {
02851                         zio_cksum_report_t *zcr = zio->io_cksum_report;
02852                         uint64_t align = zcr->zcr_align;
02853                         uint64_t asize = P2ROUNDUP(psize, align);
02854                         char *abuf = zio->io_data;
02855 
                              /*
                               * The report wants the data rounded up to
                               * zcr_align: hand it a zero-padded temporary
                               * copy instead of the original buffer.
                               */
02856                         if (asize != psize) {
02857                                 abuf = zio_buf_alloc(asize);
02858                                 bcopy(zio->io_data, abuf, psize);
02859                                 bzero(abuf + psize, asize - psize);
02860                         }
02861 
                              /* Unlink the report before finishing and freeing it. */
02862                         zio->io_cksum_report = zcr->zcr_next;
02863                         zcr->zcr_next = NULL;
02864                         zcr->zcr_finish(zcr, abuf);
02865                         zfs_ereport_free_checksum(zcr);
02866 
02867                         if (asize != psize)
02868                                 zio_buf_free(abuf, asize);
02869                 }
02870         }
02871 
02872         zio_pop_transforms(zio);        /* note: may set zio->io_error */
02873 
02874         vdev_stat_update(zio, psize);
02875 
02876         if (zio->io_error) {
02877                 /*
02878                  * If this I/O is attached to a particular vdev,
02879                  * generate an error message describing the I/O failure
02880                  * at the block level.  We ignore these errors if the
02881                  * device is currently unavailable.
02882                  */
02883                 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
02884                         zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
02885 
02886                 if ((zio->io_error == EIO || !(zio->io_flags &
02887                     (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
02888                     zio == lio) {
02889                         /*
02890                          * For logical I/O requests, tell the SPA to log the
02891                          * error and generate a logical data ereport.
02892                          */
02893                         spa_log_error(spa, zio);
02894                         zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
02895                             0, 0);
02896                 }
02897         }
02898 
02899         if (zio->io_error && zio == lio) {
02900                 /*
02901                  * Determine whether zio should be reexecuted.  This will
02902                  * propagate all the way to the root via zio_notify_parent().
02903                  */
02904                 ASSERT(vd == NULL && bp != NULL);
02905                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
02906 
                      /*
                       * A must-succeed allocating zio: retry immediately unless
                       * the failure was ENOSPC, in which case suspend.
                       */
02907                 if (IO_IS_ALLOCATING(zio) &&
02908                     !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
02909                         if (zio->io_error != ENOSPC)
02910                                 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
02911                         else
02912                                 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
02913                 }
02914 
                      /*
                       * Non-scan read/free that hit ENXIO on a loaded pool whose
                       * failmode is not "continue": suspend.
                       */
02915                 if ((zio->io_type == ZIO_TYPE_READ ||
02916                     zio->io_type == ZIO_TYPE_FREE) &&
02917                     !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
02918                     zio->io_error == ENXIO &&
02919                     spa_load_state(spa) == SPA_LOAD_NONE &&
02920                     spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
02921                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
02922 
                      /* Any other must-succeed zio that failed also suspends. */
02923                 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
02924                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
02925 
02926                 /*
02927                  * Here is a possibly good place to attempt to do
02928                  * either combinatorial reconstruction or error correction
02929                  * based on checksums.  It also might be a good place
02930                  * to send out preliminary ereports before we suspend
02931                  * processing.
02932                  */
02933         }
02934 
02935         /*
02936          * If there were logical child errors, they apply to us now.
02937          * We defer this until now to avoid conflating logical child
02938          * errors with errors that happened to the zio itself when
02939          * updating vdev stats and reporting FMA events above.
02940          */
02941         zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
02942 
              /*
               * A failed or to-be-reexecuted allocating gang leader gives back
               * the DVAs it allocated, unless this was a rewrite.
               */
02943         if ((zio->io_error || zio->io_reexecute) &&
02944             IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
02945             !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
02946                 zio_dva_unallocate(zio, zio->io_gang_tree, bp);
02947 
02948         zio_gang_tree_free(&zio->io_gang_tree);
02949 
02950         /*
02951          * Godfather I/Os should never suspend.
02952          */
              /*
               * Note that this clears every reexecute bit, not only SUSPEND,
               * so such a godfather takes the normal completion path below.
               */
02953         if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
02954             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
02955                 zio->io_reexecute = 0;
02956 
02957         if (zio->io_reexecute) {
02958                 /*
02959                  * This is a logical I/O that wants to reexecute.
02960                  *
02961                  * Reexecute is top-down.  When an i/o fails, if it's not
02962                  * the root, it simply notifies its parent and sticks around.
02963                  * The parent, seeing that it still has children in zio_done(),
02964                  * does the same.  This percolates all the way up to the root.
02965                  * The root i/o will reexecute or suspend the entire tree.
02966                  *
02967                  * This approach ensures that zio_reexecute() honors
02968                  * all the original i/o dependency relationships, e.g.
02969                  * parents not executing until children are ready.
02970                  */
02971                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
02972 
02973                 zio->io_gang_leader = NULL;
02974 
02975                 mutex_enter(&zio->io_lock);
02976                 zio->io_state[ZIO_WAIT_DONE] = 1;
02977                 mutex_exit(&zio->io_lock);
02978 
02979                 /*
02980                  * "The Godfather" I/O monitors its children but is
02981                  * not a true parent to them. It will track them through
02982                  * the pipeline but severs its ties whenever they get into
02983                  * trouble (e.g. suspended). This allows "The Godfather"
02984                  * I/O to return status without blocking.
02985                  */
02986                 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
                              /*
                               * Grab the current link and the next parent before
                               * the link may be removed below.
                               */
02987                         zio_link_t *zl = zio->io_walk_link;
02988                         pio_next = zio_walk_parents(zio);
02989 
02990                         if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
02991                             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
02992                                 zio_remove_child(pio, zio, zl);
02993                                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
02994                         }
02995                 }
02996 
02997                 if ((pio = zio_unique_parent(zio)) != NULL) {
02998                         /*
02999                          * We're not a root i/o, so there's nothing to do
03000                          * but notify our parent.  Don't propagate errors
03001                          * upward since we haven't permanently failed yet.
03002                          */
03003                         ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
03004                         zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
03005                         zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
03006                 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
03007                         /*
03008                          * We'd fail again if we reexecuted now, so suspend
03009                          * until conditions improve (e.g. device comes online).
03010                          */
03011                         zio_suspend(spa, zio);
03012                 } else {
03013                         /*
03014                          * Reexecution is potentially a huge amount of work.
03015                          * Hand it off to the otherwise-unused claim taskq.
03016                          */
03017 #ifdef _KERNEL
03018                         (void) taskq_dispatch_safe(
03019                             spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
03020                             (task_func_t *)zio_reexecute, zio, TQ_SLEEP,
03021                             &zio->io_task);
03022 #else
03023                         (void) taskq_dispatch(
03024                             spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
03025                             (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
03026 #endif
03027                 }
03028                 return (ZIO_PIPELINE_STOP);
03029         }
03030 
              /* From here on the zio is completing for good. */
03031         ASSERT(zio->io_child_count == 0);
03032         ASSERT(zio->io_reexecute == 0);
03033         ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
03034 
03035         /*
03036          * Report any checksum errors, since the I/O is complete.
03037          */
03038         while (zio->io_cksum_report != NULL) {
03039                 zio_cksum_report_t *zcr = zio->io_cksum_report;
03040                 zio->io_cksum_report = zcr->zcr_next;
03041                 zcr->zcr_next = NULL;
03042                 zcr->zcr_finish(zcr, NULL);
03043                 zfs_ereport_free_checksum(zcr);
03044         }
03045 
03046         /*
03047          * It is the responsibility of the done callback to ensure that this
03048          * particular zio is no longer discoverable for adoption, and as
03049          * such, cannot acquire any new parents.
03050          */
03051         if (zio->io_done)
03052                 zio->io_done(zio);
03053 
03054         mutex_enter(&zio->io_lock);
03055         zio->io_state[ZIO_WAIT_DONE] = 1;
03056         mutex_exit(&zio->io_lock);
03057 
              /*
               * Unlink from and notify every parent.  The next parent is
               * fetched before the current link is removed.
               */
03058         for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
03059                 zio_link_t *zl = zio->io_walk_link;
03060                 pio_next = zio_walk_parents(zio);
03061                 zio_remove_child(pio, zio, zl);
03062                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
03063         }
03064 
              /*
               * With a waiter registered, clear io_executor and wake it
               * rather than destroying the zio here.
               * NOTE(review): presumably the waiter then destroys the zio --
               * confirm in the wait path.
               */
03065         if (zio->io_waiter != NULL) {
03066                 mutex_enter(&zio->io_lock);
03067                 zio->io_executor = NULL;
03068                 cv_broadcast(&zio->io_cv);
03069                 mutex_exit(&zio->io_lock);
03070         } else {
03071                 zio_destroy(zio);
03072         }
03073 
03074         return (ZIO_PIPELINE_STOP);
03075 }
03076 
03077 /*
03078  * ==========================================================================
03079  * I/O pipeline definition
03080  * ==========================================================================
03081  */
/*
 * Table of pipeline stage handlers, listed in pipeline order.  Slot 0 is
 * NULL and the last slot is zio_done.
 *
 * NOTE(review): presumably slot N corresponds to the ZIO_STAGE_* flag
 * with bit N set, so the order here must match the stage enum exactly --
 * confirm against the enum definition and the pipeline executor before
 * adding, removing or reordering entries.
 */
03082 static zio_pipe_stage_t *zio_pipeline[] = {
03083         NULL,
03084         zio_read_bp_init,
03085         zio_free_bp_init,
03086         zio_issue_async,
03087         zio_write_bp_init,
03088         zio_checksum_generate,
03089         zio_ddt_read_start,
03090         zio_ddt_read_done,
03091         zio_ddt_write,
03092         zio_ddt_free,
03093         zio_gang_assemble,
03094         zio_gang_issue,
03095         zio_dva_allocate,
03096         zio_dva_free,
03097         zio_dva_claim,
03098         zio_ready,
03099         zio_vdev_io_start,
03100         zio_vdev_io_done,
03101         zio_vdev_io_assess,
03102         zio_checksum_verify,
03103         zio_done
03104 };
03105 
03109 boolean_t
03110 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
03111     const zbookmark_t *zb2)
03112 {
03113         uint64_t zb1nextL0, zb2thisobj;
03114 
03115         ASSERT(zb1->zb_objset == zb2->zb_objset);
03116         ASSERT(zb2->zb_level == 0);
03117 
03118         /*
03119          * A bookmark in the deadlist is considered to be after
03120          * everything else.
03121          */
03122         if (zb2->zb_object == DMU_DEADLIST_OBJECT)
03123                 return (B_TRUE);
03124 
03125         /* The objset_phys_t isn't before anything. */
03126         if (dnp == NULL)
03127                 return (B_FALSE);
03128 
03129         zb1nextL0 = (zb1->zb_blkid + 1) <<
03130             ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
03131 
03132         zb2thisobj = zb2->zb_object ? zb2->zb_object :
03133             zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
03134 
03135         if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
03136                 uint64_t nextobj = zb1nextL0 *
03137                     (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
03138                 return (nextobj <= zb2thisobj);
03139         }
03140 
03141         if (zb1->zb_object < zb2thisobj)
03142                 return (B_TRUE);
03143         if (zb1->zb_object > zb2thisobj)
03144                 return (B_FALSE);
03145         if (zb2->zb_object == DMU_META_DNODE_OBJECT)
03146                 return (B_FALSE);
03147         return (zb1nextL0 <= zb2->zb_blkid);
03148 }
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines