FreeBSD ZFS
The Zettabyte File System

dsl_scan.c — DSL pool scan (scrub and resilver) implementation.

The documented source of this file follows.
00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 /*
00022  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
00023  * Copyright (c) 2012 by Delphix. All rights reserved.
00024  */
00025 
00026 #include <sys/dsl_scan.h>
00027 #include <sys/dsl_pool.h>
00028 #include <sys/dsl_dataset.h>
00029 #include <sys/dsl_prop.h>
00030 #include <sys/dsl_dir.h>
00031 #include <sys/dsl_synctask.h>
00032 #include <sys/dnode.h>
00033 #include <sys/dmu_tx.h>
00034 #include <sys/dmu_objset.h>
00035 #include <sys/arc.h>
00036 #include <sys/zap.h>
00037 #include <sys/zio.h>
00038 #include <sys/zfs_context.h>
00039 #include <sys/fs/zfs.h>
00040 #include <sys/zfs_znode.h>
00041 #include <sys/spa_impl.h>
00042 #include <sys/vdev_impl.h>
00043 #include <sys/zil_impl.h>
00044 #include <sys/zio_checksum.h>
00045 #include <sys/ddt.h>
00046 #include <sys/sa.h>
00047 #include <sys/sa_impl.h>
00048 #include <sys/zfeature.h>
00049 #ifdef _KERNEL
00050 #include <sys/zfs_vfsops.h>
00051 #endif
00052 
/*
 * Scan callback: invoked for each block pointer visited during a scan.
 * Returns 0 on success (see scan_funcs[] below for the dispatch table).
 */
typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);

static scan_cb_t dsl_scan_defrag_cb;
static scan_cb_t dsl_scan_scrub_cb;
static scan_cb_t dsl_scan_remove_cb;
static dsl_syncfunc_t dsl_scan_cancel_sync;
/* Persist the in-core scan state (scn_phys) to the MOS in this txg. */
static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
00060 
/* Scan tunables; descriptions match the sysctls declared below. */
unsigned int zfs_top_maxinflight = 32;	/* maximum I/Os per top-level vdev */
unsigned int zfs_resilver_delay = 2;	/* number of ticks to delay resilver */
unsigned int zfs_scrub_delay = 4;	/* number of ticks to delay scrub */
unsigned int zfs_scan_idle = 50;	/* idle scan window in clock ticks */
unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
unsigned int zfs_resilver_min_time_ms = 3000; /* min ms to resilver per txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
00077 
/*
 * Export the tunables above as loader tunables and read/write sysctls
 * under vfs.zfs (FreeBSD-specific plumbing).
 */
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.top_maxinflight", &zfs_top_maxinflight);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RW,
    &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev");
TUNABLE_INT("vfs.zfs.resilver_delay", &zfs_resilver_delay);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RW,
    &zfs_resilver_delay, 0, "Number of ticks to delay resilver");
TUNABLE_INT("vfs.zfs.scrub_delay", &zfs_scrub_delay);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RW,
    &zfs_scrub_delay, 0, "Number of ticks to delay scrub");
TUNABLE_INT("vfs.zfs.scan_idle", &zfs_scan_idle);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RW,
    &zfs_scan_idle, 0, "Idle scan window in clock ticks");
TUNABLE_INT("vfs.zfs.scan_min_time_ms", &zfs_scan_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RW,
    &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
TUNABLE_INT("vfs.zfs.free_min_time_ms", &zfs_free_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RW,
    &zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
TUNABLE_INT("vfs.zfs.resilver_min_time_ms", &zfs_resilver_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RW,
    &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
TUNABLE_INT("vfs.zfs.no_scrub_io", &zfs_no_scrub_io);
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RW,
    &zfs_no_scrub_io, 0, "Disable scrub I/O");
TUNABLE_INT("vfs.zfs.no_scrub_prefetch", &zfs_no_scrub_prefetch);
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RW,
    &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
/* Highest DDT class that the DDT phase of a scrub will walk by default. */
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;

/* True iff the current scan function is a scrub or a resilver. */
#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)

extern int zfs_txg_timeout;

/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
	NULL,			/* POOL_SCAN_NONE */
	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
};
00121 
00122 int
00123 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
00124 {
00125         int err;
00126         dsl_scan_t *scn;
00127         spa_t *spa = dp->dp_spa;
00128         uint64_t f;
00129 
00130         scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
00131         scn->scn_dp = dp;
00132 
00133         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
00134             "scrub_func", sizeof (uint64_t), 1, &f);
00135         if (err == 0) {
00136                 /*
00137                  * There was an old-style scrub in progress.  Restart a
00138                  * new-style scrub from the beginning.
00139                  */
00140                 scn->scn_restart_txg = txg;
00141                 zfs_dbgmsg("old-style scrub was in progress; "
00142                     "restarting new-style scrub in txg %llu",
00143                     scn->scn_restart_txg);
00144 
00145                 /*
00146                  * Load the queue obj from the old location so that it
00147                  * can be freed by dsl_scan_done().
00148                  */
00149                 (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
00150                     "scrub_queue", sizeof (uint64_t), 1,
00151                     &scn->scn_phys.scn_queue_obj);
00152         } else {
00153                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
00154                     DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
00155                     &scn->scn_phys);
00156                 if (err == ENOENT)
00157                         return (0);
00158                 else if (err)
00159                         return (err);
00160 
00161                 if (scn->scn_phys.scn_state == DSS_SCANNING &&
00162                     spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
00163                         /*
00164                          * A new-type scrub was in progress on an old
00165                          * pool, and the pool was accessed by old
00166                          * software.  Restart from the beginning, since
00167                          * the old software may have changed the pool in
00168                          * the meantime.
00169                          */
00170                         scn->scn_restart_txg = txg;
00171                         zfs_dbgmsg("new-style scrub was modified "
00172                             "by old software; restarting in txg %llu",
00173                             scn->scn_restart_txg);
00174                 }
00175         }
00176 
00177         spa_scan_stat_init(spa);
00178         return (0);
00179 }
00180 
00181 void
00182 dsl_scan_fini(dsl_pool_t *dp)
00183 {
00184         if (dp->dp_scan) {
00185                 kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
00186                 dp->dp_scan = NULL;
00187         }
00188 }
00189 
00190 /* ARGSUSED */
00191 static int
00192 dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
00193 {
00194         dsl_scan_t *scn = arg1;
00195 
00196         if (scn->scn_phys.scn_state == DSS_SCANNING)
00197                 return (EBUSY);
00198 
00199         return (0);
00200 }
00201 
/*
 * Sync-task function that starts a new scan of type *funcp (arg2).
 * Initializes the persistent scan state (scn_phys), posts the
 * resilver/scrub start event when appropriate, creates the scan queue
 * object, and writes the new state to the MOS in this txg.
 */
/* ARGSUSED */
static void
dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg1;
	pool_scan_func_t *funcp = arg2;
	dmu_object_type_t ot = 0;
	dsl_pool_t *dp = scn->scn_dp;
	spa_t *spa = dp->dp_spa;

	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
	/* Reset all persistent scan state before filling it in. */
	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
	scn->scn_phys.scn_func = *funcp;
	scn->scn_phys.scn_state = DSS_SCANNING;
	scn->scn_phys.scn_min_txg = 0;
	scn->scn_phys.scn_max_txg = tx->tx_txg;
	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
	scn->scn_phys.scn_start_time = gethrestime_sec();
	scn->scn_phys.scn_errors = 0;
	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
	scn->scn_restart_txg = 0;
	spa_scan_stat_init(spa);

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;

		/* rewrite all disk labels */
		vdev_config_dirty(spa->spa_root_vdev);

		/*
		 * vdev_resilver_needed() narrows [scn_min_txg, scn_max_txg]
		 * to the range that actually needs resilvering, and tells
		 * us which start event to post.
		 */
		if (vdev_resilver_needed(spa->spa_root_vdev,
		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
			spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
		} else {
			spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
		}

		spa->spa_scrub_started = B_TRUE;
		/*
		 * If this is an incremental scrub, limit the DDT scrub phase
		 * to just the auto-ditto class (for correctness); the rest
		 * of the scrub should go faster using top-down pruning.
		 */
		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;

	}

	/* back to the generic stuff */

	if (dp->dp_blkstats == NULL) {
		dp->dp_blkstats =
		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
	}
	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));

	/* Older pools cannot store a DMU_OT_SCAN_QUEUE-typed object. */
	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
		ot = DMU_OT_ZAP_OTHER;

	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);

	dsl_scan_sync_state(scn, tx);

	spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
	    "func=%u mintxg=%llu maxtxg=%llu",
	    *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
}
00270 
/*
 * Finish (complete == B_TRUE) or cancel (B_FALSE) the current scan:
 * clean up old-style scrub remnants and the queue object, record the
 * final state, and — for scrub/resilver — drain in-flight scrub I/O,
 * reassess DTLs, and notify interested parties.  Runs in syncing
 * context; the caller is responsible for dsl_scan_sync_state().
 */
/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
{
	static const char *old_names[] = {
		"scrub_bookmark",
		"scrub_ddt_bookmark",
		"scrub_ddt_class_max",
		"scrub_queue",
		"scrub_min_txg",
		"scrub_max_txg",
		"scrub_func",
		"scrub_errors",
		NULL
	};

	dsl_pool_t *dp = scn->scn_dp;
	spa_t *spa = dp->dp_spa;
	int i;

	/* Remove any remnants of an old-style scrub. */
	for (i = 0; old_names[i]; i++) {
		(void) zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
	}

	if (scn->scn_phys.scn_queue_obj != 0) {
		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, tx));
		scn->scn_phys.scn_queue_obj = 0;
	}

	/*
	 * If we were "restarted" from a stopped state, don't bother
	 * with anything else.
	 */
	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (complete)
		scn->scn_phys.scn_state = DSS_FINISHED;
	else
		scn->scn_phys.scn_state = DSS_CANCELED;

	spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
	    "complete=%u", complete);

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		/* Wait for all in-flight scrub I/O to drain. */
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight > 0) {
			cv_wait(&spa->spa_scrub_io_cv,
			    &spa->spa_scrub_lock);
		}
		mutex_exit(&spa->spa_scrub_lock);
		spa->spa_scrub_started = B_FALSE;
		spa->spa_scrub_active = B_FALSE;

		/*
		 * If the scrub/resilver completed, update all DTLs to
		 * reflect this.  Whether it succeeded or not, vacate
		 * all temporary scrub DTLs.
		 */
		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
		if (complete) {
			/* scn_min_txg != 0 means this was a resilver. */
			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
			    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
		}
		spa_errlog_rotate(spa);

		/*
		 * We may have finished replacing a device.
		 * Let the async thread assess this and handle the detach.
		 */
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
	}

	scn->scn_phys.scn_end_time = gethrestime_sec();
}
00350 
00351 /* ARGSUSED */
00352 static int
00353 dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
00354 {
00355         dsl_scan_t *scn = arg1;
00356 
00357         if (scn->scn_phys.scn_state != DSS_SCANNING)
00358                 return (ENOENT);
00359         return (0);
00360 }
00361 
/*
 * Sync-task function for canceling a scan: mark the scan canceled
 * (complete == B_FALSE) and then persist the updated state to the MOS.
 */
/* ARGSUSED */
static void
dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg1;

	dsl_scan_done(scn, B_FALSE, tx);
	dsl_scan_sync_state(scn, tx);
}
00371 
00372 int
00373 dsl_scan_cancel(dsl_pool_t *dp)
00374 {
00375         boolean_t complete = B_FALSE;
00376         int err;
00377 
00378         err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
00379             dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
00380         return (err);
00381 }
00382 
/* Forward declarations for the mutually recursive block/dnode walkers. */
static void dsl_scan_visitbp(blkptr_t *bp,
    const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
    dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
    dmu_tx_t *tx);
static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
    dmu_objset_type_t ostype,
    dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
00390 
/*
 * Free the block 'bp' in transaction group 'txg': thin wrapper around
 * zio_free().
 */
void
dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
{
	zio_free(dp->dp_spa, txg, bp);
}
00396 
/*
 * Free the block 'bpp' from syncing context, issuing (without waiting
 * for) the free zio as a child of 'pio'.  Must be called from the
 * pool's sync context (asserted).
 */
void
dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
{
	ASSERT(dsl_pool_sync_context(dp));
	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
	    pio->io_flags));
}
00404 
/*
 * Thin wrapper around arc_read(), forwarding all arguments unchanged.
 */
int
dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
    arc_done_func_t *done, void *private, int priority, int zio_flags,
    uint32_t *arc_flags, const zbookmark_t *zb)
{
	return (arc_read(pio, spa, bpp, pbuf, done, private,
	    priority, zio_flags, arc_flags, zb));
}
00413 
/*
 * Thin wrapper around arc_read_nolock(), forwarding all arguments
 * unchanged.
 */
int
dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
    arc_done_func_t *done, void *private, int priority, int zio_flags,
    uint32_t *arc_flags, const zbookmark_t *zb)
{
	return (arc_read_nolock(pio, spa, bpp, done, private,
	    priority, zio_flags, arc_flags, zb));
}
00422 
00423 static uint64_t
00424 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
00425 {
00426         uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
00427         if (dsl_dataset_is_snapshot(ds))
00428                 return (MIN(smt, ds->ds_phys->ds_creation_txg));
00429         return (smt);
00430 }
00431 
/*
 * Persist the in-core scan state (scn_phys) to the DMU_POOL_SCAN entry
 * of the MOS pool directory in this txg.  The update must succeed.
 */
static void
dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
{
	VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
	    &scn->scn_phys, tx));
}
00440 
/*
 * Decide whether the scan should pause at bookmark 'zb' (NULL when
 * called from the DDT phase).  Returns B_TRUE — recording 'zb' in
 * scn_phys so a later txg can resume — once this txg's time budget is
 * spent or the pool is shutting down; B_FALSE to keep scanning.
 */
static boolean_t
dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
{
	uint64_t elapsed_nanosecs;
	unsigned int mintime;

	/* we never skip user/group accounting objects */
	if (zb && (int64_t)zb->zb_object < 0)
		return (B_FALSE);

	if (scn->scn_pausing)
		return (B_TRUE); /* we're already pausing */

	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
		return (B_FALSE); /* we're resuming */

	/* We only know how to resume from level-0 blocks. */
	if (zb && zb->zb_level != 0)
		return (B_FALSE);

	mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
	/*
	 * Pause when we have exceeded the txg timeout, when the sync
	 * thread is waiting and we have run for at least 'mintime'
	 * milliseconds (ns / MICROSEC == ms), or when the pool is
	 * shutting down.
	 */
	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
	    (elapsed_nanosecs / MICROSEC > mintime &&
	    txg_sync_waiting(scn->scn_dp)) ||
	    spa_shutting_down(scn->scn_dp->dp_spa)) {
		if (zb) {
			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			scn->scn_phys.scn_bookmark = *zb;
		}
		dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor,
		scn->scn_pausing = B_TRUE;
		return (B_TRUE);
	}
	return (B_FALSE);
}
00486 
/* Context threaded through zil_parse() to the ZIL scan callbacks below. */
typedef struct zil_scan_arg {
	dsl_pool_t	*zsa_dp;	/* pool being scanned */
	zil_header_t	*zsa_zh;	/* header of the log being walked */
} zil_scan_arg_t;
00491 
/*
 * zil_parse() block callback: dispatch a claimed ZIL block to the scan
 * function, skipping blocks born before the scan's min txg or not yet
 * claimed.  Always returns 0 so the log walk continues.
 */
/* ARGSUSED */
static int
dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	zil_scan_arg_t *zsa = arg;
	dsl_pool_t *dp = zsa->zsa_dp;
	dsl_scan_t *scn = dp->dp_scan;
	zil_header_t *zh = zsa->zsa_zh;
	zbookmark_t zb;

	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
		return (0);

	/*
	 * One block ("stubby") can be allocated a long time ago; we
	 * want to visit that one because it has been allocated
	 * (on-disk) even if it hasn't been claimed (even though for
	 * scrub there's nothing to do to it).
	 */
	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
		return (0);

	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
	return (0);
}
00520 
/*
 * zil_parse() record callback: for TX_WRITE records, dispatch the
 * record's block pointer to the scan function, skipping blocks born
 * before the scan's min txg or before the claim txg.  Always returns 0.
 */
/* ARGSUSED */
static int
dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	if (lrc->lrc_txtype == TX_WRITE) {
		zil_scan_arg_t *zsa = arg;
		dsl_pool_t *dp = zsa->zsa_dp;
		dsl_scan_t *scn = dp->dp_scan;
		zil_header_t *zh = zsa->zsa_zh;
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_t zb;

		if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
			return (0);

		/*
		 * birth can be < claim_txg if this record's txg is
		 * already txg sync'ed (but this log block contains
		 * other records that are not synced)
		 */
		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    lr->lr_foid, ZB_ZIL_LEVEL,
		    lr->lr_offset / BP_GET_LSIZE(bp));

		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
	}
	return (0);
}
00553 
/*
 * Walk an objset's intent log described by 'zh', feeding each relevant
 * log block and write record to the scan callbacks above.
 */
static void
dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;
	zil_scan_arg_t zsa = { dp, zh };
	zilog_t *zilog;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed (or, in read-only mode, blocks that *would* be claimed).
	 */
	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
		return;

	zilog = zil_alloc(dp->dp_meta_objset, zh);

	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
	    claim_txg);

	zil_free(zilog);
}
00575 
/*
 * Issue an asynchronous ARC prefetch for 'bp' so the data is warm by
 * the time the scan visits it.  Skipped for holes, blocks born at or
 * before the scan's min txg, level-0 non-dnode blocks, and when
 * prefetch is disabled via zfs_no_scrub_prefetch.
 */
/* ARGSUSED */
static void
dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
    uint64_t objset, uint64_t object, uint64_t blkid)
{
	zbookmark_t czb;
	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;

	if (zfs_no_scrub_prefetch)
		return;

	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
		return;

	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);

	/*
	 * XXX need to make sure all of these arc_read() prefetches are
	 * done before setting xlateall (similar to dsl_read())
	 */
	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
}
00601 
/*
 * When resuming a paused scan, decide whether bookmark 'zb' was already
 * visited in a previous txg.  Returns B_TRUE to skip it; clears the
 * saved bookmark once we reach (or pass) the resume point so that pause
 * checking can begin again.
 */
static boolean_t
dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
    const zbookmark_t *zb)
{
	/*
	 * We never skip over user/group accounting objects (obj<0)
	 */
	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
	    (int64_t)zb->zb_object >= 0) {
		/*
		 * If we already visited this bp & everything below (in
		 * a prior txg sync), don't bother doing it again.
		 */
		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
			return (B_TRUE);

		/*
		 * If we found the block we're trying to resume from, or
		 * we went past it to a different object, zero it out to
		 * indicate that it's OK to start checking for pausing
		 * again.
		 */
		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
			dprintf("resuming at %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
		}
	}
	return (B_FALSE);
}
00636 
/*
 * Read the block 'bp' and recurse into its children according to its
 * type: indirect blocks visit each child bp (with a prefetch pass
 * first), dnode blocks visit every dnode they contain, and objset
 * blocks visit the meta-dnode plus, when present, the user/group
 * accounting dnodes.
 *
 * \param	bufp	return location for new buf to write out
 *
 * \return  nonzero on i/o error (after bumping scn_errors); 0 otherwise.
 */
static int
dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
    dnode_phys_t *dnp, const blkptr_t *bp,
    const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
{
	dsl_pool_t *dp = scn->scn_dp;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
	int err;

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		/* number of child block pointers per indirect block */
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, bufp,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
		/* Prefetch all children before visiting any of them. */
		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
			dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
			    zb->zb_object, zb->zb_blkid * epb + i);
		}
		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
			zbookmark_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			dsl_scan_visitbp(cbp, &czb, dnp,
			    *bufp, ds, scn, ostype, tx);
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
		uint32_t flags = ARC_WAIT;

		/* Leaf data: just read it so a scrub verifies it. */
		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, bufp,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		uint32_t flags = ARC_WAIT;
		dnode_phys_t *cdnp;
		int i, j;
		/* number of dnodes packed into this block */
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, bufp,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
		/* Prefetch every dnode's block pointers, then visit. */
		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
			for (j = 0; j < cdnp->dn_nblkptr; j++) {
				blkptr_t *cbp = &cdnp->dn_blkptr[j];
				dsl_scan_prefetch(scn, *bufp, cbp,
				    zb->zb_objset, zb->zb_blkid * epb + i, j);
			}
		}
		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
			dsl_scan_visitdnode(scn, ds, ostype,
			    cdnp, *bufp, zb->zb_blkid * epb + i, tx);
		}

	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		uint32_t flags = ARC_WAIT;
		objset_phys_t *osp;

		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, bufp,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}

		osp = (*bufp)->b_data;

		dsl_scan_visitdnode(scn, ds, osp->os_type,
		    &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);

		if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
			/*
			 * We also always visit user/group accounting
			 * objects, and never skip them, even if we are
			 * pausing.  This is necessary so that the space
			 * deltas from this txg get integrated.
			 */
			dsl_scan_visitdnode(scn, ds, osp->os_type,
			    &osp->os_groupused_dnode, *bufp,
			    DMU_GROUPUSED_OBJECT, tx);
			dsl_scan_visitdnode(scn, ds, osp->os_type,
			    &osp->os_userused_dnode, *bufp,
			    DMU_USERUSED_OBJECT, tx);
		}
	}

	return (0);
}
00747 
00748 static void
00749 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
00750     dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
00751     uint64_t object, dmu_tx_t *tx)
00752 {
00753         int j;
00754 
00755         for (j = 0; j < dnp->dn_nblkptr; j++) {
00756                 zbookmark_t czb;
00757 
00758                 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
00759                     dnp->dn_nlevels - 1, j);
00760                 dsl_scan_visitbp(&dnp->dn_blkptr[j],
00761                     &czb, dnp, buf, ds, scn, ostype, tx);
00762         }
00763 
00764         if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
00765                 zbookmark_t czb;
00766                 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
00767                     0, DMU_SPILL_BLKID);
00768                 dsl_scan_visitbp(&dnp->dn_spill,
00769                     &czb, dnp, buf, ds, scn, ostype, tx);
00770         }
00771 }
00772 
/*
 * The arguments are in this order because mdb can only print the
 * first 5; we want them to be useful.
 */
/*
 * Visit a single block pointer during a scan: honor the pause and
 * resume bookmarks, recurse into the block's children via
 * dsl_scan_recurse(), and issue the scrub/resilver callback for the
 * block itself unless the DDT walk has already covered it.
 */
static void
dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
    dnode_phys_t *dnp, arc_buf_t *pbuf,
    dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
    dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	arc_buf_t *buf = NULL;
	/* Local copy of the bp; the recursion reads this, not *bp. */
	blkptr_t bp_toread = *bp;

	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */

	/* Stop here if this sync pass has used up its budget. */
	if (dsl_scan_check_pause(scn, zb))
		return;

	/* Skip bookmarks already covered before the saved resume point. */
	if (dsl_scan_check_resume(scn, dnp, zb))
		return;

	/* A hole (never-written bp) has nothing in or under it. */
	if (bp->blk_birth == 0)
		return;

	scn->scn_visited_this_txg++;

	dprintf_bp(bp,
	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
	    ds, ds ? ds->ds_object : 0,
	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
	    pbuf, bp);

	/* Blocks born at or before cur_min_txg were handled previously. */
	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
		return;

	/* Recurse into children; on error (e.g. read failure) bail out. */
	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
	    &buf) != 0)
		return;

	/*
	 * If dsl_scan_ddt() has already visited this block, it will have
	 * already done any translations or scrubbing, so don't call the
	 * callback again.
	 */
	if (ddt_class_contains(dp->dp_spa,
	    scn->scn_phys.scn_ddt_class_max, bp)) {
		ASSERT(buf == NULL);
		return;
	}

	/*
	 * If this block is from the future (after cur_max_txg), then we
	 * are doing this on behalf of a deleted snapshot, and we will
	 * revisit the future block on the next pass of this dataset.
	 * Don't scan it now unless we need to because something
	 * under it was modified.
	 */
	if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
	}
	if (buf)
		(void) arc_buf_remove_ref(buf, &buf);
}
00837 
00838 static void
00839 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
00840     dmu_tx_t *tx)
00841 {
00842         zbookmark_t zb;
00843 
00844         SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
00845             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
00846         dsl_scan_visitbp(bp, &zb, NULL, NULL,
00847             ds, scn, DMU_OST_NONE, tx);
00848 
00849         dprintf_ds(ds, "finished scan%s", "");
00850 }
00851 
/*
 * Called when a dataset is destroyed while a scan is in progress.
 * Adjust the scan state so the traversal is not left pointing at the
 * dead dataset: redirect the in-progress bookmark, or substitute the
 * next snapshot for the destroyed one in the work queue.
 */
void
dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
		/* We were in the middle of traversing this dataset. */
		if (dsl_dataset_is_snapshot(ds)) {
			/* Note, scn_cur_{min,max}_txg stays the same. */
			scn->scn_phys.scn_bookmark.zb_objset =
			    ds->ds_phys->ds_next_snap_obj;
			zfs_dbgmsg("destroying ds %llu; currently traversing; "
			    "reset zb_objset to %llu",
			    (u_longlong_t)ds->ds_object,
			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
		} else {
			/*
			 * Mark the bookmark destroyed (-1 objset); the
			 * scan will pick a fresh dataset from the queue.
			 */
			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
			    ZB_DESTROYED_OBJSET, 0, 0, 0);
			zfs_dbgmsg("destroying ds %llu; currently traversing; "
			    "reset bookmark to -1,0,0,0",
			    (u_longlong_t)ds->ds_object);
		}
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
		/* The dataset was queued but not yet visited. */
		ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
		if (dsl_dataset_is_snapshot(ds)) {
			/*
			 * We keep the same mintxg; it could be >
			 * ds_creation_txg if the previous snapshot was
			 * deleted too.
			 */
			VERIFY(zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj,
			    ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
			zfs_dbgmsg("destroying ds %llu; in queue; "
			    "replacing with %llu",
			    (u_longlong_t)ds->ds_object,
			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
		} else {
			zfs_dbgmsg("destroying ds %llu; in queue; removing",
			    (u_longlong_t)ds->ds_object);
		}
	} else {
		/* Not being traversed and not queued: nothing to fix up. */
		zfs_dbgmsg("destroying ds %llu; ignoring",
		    (u_longlong_t)ds->ds_object);
	}

	/*
	 * dsl_scan_sync() should be called after this, and should sync
	 * out our changed state, but just to be safe, do it here.
	 */
	dsl_scan_sync_state(scn, tx);
}
00912 
/*
 * Called when a dataset is snapshotted while a scan is in progress.
 * The blocks being traversed are now reachable via the new snapshot
 * (the dataset's ds_prev_snap_obj), so redirect the bookmark or the
 * queue entry from the dataset to that snapshot.
 */
void
dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);

	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
		/* Currently traversing this dataset: retarget the bookmark. */
		scn->scn_phys.scn_bookmark.zb_objset =
		    ds->ds_phys->ds_prev_snap_obj;
		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds->ds_object,
		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
		/* Queued: replace the entry, preserving its mintxg. */
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj,
		    ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
		zfs_dbgmsg("snapshotting ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds->ds_object,
		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
	}
	dsl_scan_sync_state(scn, tx);
}
00946 
/*
 * Called when the contents of two datasets are swapped while a scan is
 * in progress.  Any bookmark or queue entry referring to one dataset
 * object must now refer to the other, since the data moved between them.
 */
void
dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	/* If we were traversing either dataset, retarget the bookmark. */
	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds1->ds_object,
		    (u_longlong_t)ds2->ds_object);
	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds2->ds_object,
		    (u_longlong_t)ds1->ds_object);
	}

	/* Swap queue entries, preserving the recorded mintxg values. */
	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
	    ds1->ds_object, &mintxg) == 0) {
		int err;

		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
		err = zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
		VERIFY(err == 0 || err == EEXIST);
		if (err == EEXIST) {
			/* Both were there to begin with */
			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj,
			    ds1->ds_object, mintxg, tx));
		}
		zfs_dbgmsg("clone_swap ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds1->ds_object,
		    (u_longlong_t)ds2->ds_object);
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
		zfs_dbgmsg("clone_swap ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds2->ds_object,
		    (u_longlong_t)ds1->ds_object);
	}

	dsl_scan_sync_state(scn, tx);
}
01008 
/* Arguments passed through dmu_objset_find_spa() to enqueue_clones_cb(). */
struct enqueue_clones_arg {
	dmu_tx_t *tx;		/* open transaction for queue updates */
	uint64_t originobj;	/* object number of the origin snapshot */
};
01013 
/*
 * dmu_objset_find_spa() callback: if the given dataset is a clone of
 * eca->originobj, walk back through its snapshot chain to the snapshot
 * whose previous snapshot is the origin, and enqueue that snapshot for
 * scanning with its prev_snap_txg as the mintxg.
 */
/* ARGSUSED */
static int
enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	struct enqueue_clones_arg *eca = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_pool_t *dp = spa->spa_dsl_pool;
	dsl_scan_t *scn = dp->dp_scan;

	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
	if (err)
		return (err);

	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
		/* Step back until the previous snapshot is the origin. */
		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
			dsl_dataset_t *prev;
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);

			/* Release the current hold before moving on. */
			dsl_dataset_rele(ds, FTAG);
			if (err)
				return (err);
			ds = prev;
		}
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object,
		    ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
	}
	dsl_dataset_rele(ds, FTAG);
	return (0);
}
01046 
/*
 * Scan one dataset: traverse its ZIL (head datasets only, during
 * scrub/resilver) and its root block pointer; then, if this pass
 * completed without pausing, enqueue the next snapshot and any clones
 * so they get visited later.
 */
static void
dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	dsl_dataset_t *ds;
	objset_t *os;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	/* If the objset can't be opened, skip the dataset entirely. */
	if (dmu_objset_from_ds(ds, &os))
		goto out;

	/*
	 * Only the ZIL in the head (non-snapshot) is valid.  Even though
	 * snapshots can have ZIL block pointers (which may be the same
	 * BP as in the head), they must be ignored.  So we traverse the
	 * ZIL here, rather than in scan_recurse(), because the regular
	 * snapshot block-sharing rules don't apply to it.
	 */
	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
		dsl_scan_zil(dp, &os->os_zil_header);

	/*
	 * Iterate over the bps in this ds.
	 */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);

	char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
	dsl_dataset_name(ds, dsname);
	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
	    "pausing=%u",
	    (longlong_t)dsobj, dsname,
	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
	    (int)scn->scn_pausing);
	kmem_free(dsname, ZFS_MAXNAMELEN);

	/* If we paused mid-dataset, don't enqueue descendents yet. */
	if (scn->scn_pausing)
		goto out;

	/*
	 * We've finished this pass over this dataset.
	 */

	/*
	 * If we did not completely visit this dataset, do another pass.
	 */
	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
		zfs_dbgmsg("incomplete pass; visiting again");
		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object,
		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
		goto out;
	}

	/*
	 * Add descendent datasets to work queue.
	 */
	if (ds->ds_phys->ds_next_snap_obj != 0) {
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
		    ds->ds_phys->ds_creation_txg, tx) == 0);
	}
	if (ds->ds_phys->ds_num_children > 1) {
		boolean_t usenext = B_FALSE;
		if (ds->ds_phys->ds_next_clones_obj != 0) {
			uint64_t count;
			/*
			 * A bug in a previous version of the code could
			 * cause upgrade_clones_cb() to not set
			 * ds_next_snap_obj when it should, leading to a
			 * missing entry.  Therefore we can only use the
			 * next_clones_obj when its count is correct.
			 */
			int err = zap_count(dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj, &count);
			if (err == 0 &&
			    count == ds->ds_phys->ds_num_children - 1)
				usenext = B_TRUE;
		}

		if (usenext) {
			/* Enqueue all clones directly from next_clones_obj. */
			VERIFY(zap_join_key(dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj,
			    scn->scn_phys.scn_queue_obj,
			    ds->ds_phys->ds_creation_txg, tx) == 0);
		} else {
			/* Fall back to searching all datasets for clones. */
			struct enqueue_clones_arg eca;
			eca.tx = tx;
			eca.originobj = ds->ds_object;

			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
		}
	}

out:
	dsl_dataset_rele(ds, FTAG);
}
01148 
/*
 * dmu_objset_find_spa() callback used to populate the scan queue on
 * older (pre-DSL_SCRUB) pools: walk from the given dataset back to its
 * oldest direct-ancestor snapshot and enqueue that, so the chain is
 * scanned from the beginning.  Clones are skipped here; they are
 * reached through their origin instead.
 */
/* ARGSUSED */
static int
enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_pool_t *dp = spa->spa_dsl_pool;
	dsl_scan_t *scn = dp->dp_scan;

	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
	if (err)
		return (err);

	/* Step backwards along the snapshot chain. */
	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		dsl_dataset_t *prev;
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		/*
		 * If this is a clone, we don't need to worry about it for now.
		 */
		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			dsl_dataset_rele(prev, FTAG);
			return (0);
		}
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
	}

	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
	    ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}
01189 
/*
 * Walk the dedup table (DDT) up through scn_ddt_class_max, issuing the
 * scan callback for each entry's block pointers.  Blocks covered here
 * are later skipped by dsl_scan_visitbp() via ddt_class_contains().
 * The ddt bookmark in scn_phys records where to resume after a pause.
 */
static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
	ddt_entry_t dde = { 0 };
	int error;
	uint64_t n = 0;

	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
		ddt_t *ddt;

		/* Stop once the walk passes the configured class limit. */
		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
			break;
		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
		    (longlong_t)ddb->ddb_class,
		    (longlong_t)ddb->ddb_type,
		    (longlong_t)ddb->ddb_checksum,
		    (longlong_t)ddb->ddb_cursor);

		/* There should be no pending changes to the dedup table */
		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
		ASSERT(avl_first(&ddt->ddt_tree) == NULL);

		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
		n++;

		/* Yield if this sync pass has used up its budget. */
		if (dsl_scan_check_pause(scn, NULL))
			break;
	}

	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
	    (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
	    (int)scn->scn_pausing);

	/* ENOENT means the walk ran off the end of the table. */
	ASSERT(error == 0 || error == ENOENT);
	ASSERT(error != ENOENT ||
	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
}
01261 
01262 /* ARGSUSED */
01263 void
01264 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
01265     ddt_entry_t *dde, dmu_tx_t *tx)
01266 {
01267         const ddt_key_t *ddk = &dde->dde_key;
01268         ddt_phys_t *ddp = dde->dde_phys;
01269         blkptr_t bp;
01270         zbookmark_t zb = { 0 };
01271 
01272         if (scn->scn_phys.scn_state != DSS_SCANNING)
01273                 return;
01274 
01275         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
01276                 if (ddp->ddp_phys_birth == 0 ||
01277                     ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
01278                         continue;
01279                 ddt_bp_create(checksum, ddk, ddp, &bp);
01280 
01281                 scn->scn_visited_this_txg++;
01282                 scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
01283         }
01284 }
01285 
/*
 * Top-level traversal for one sync pass: scan the DDT classes first,
 * then the MOS and origin (or resume a paused dataset), then drain the
 * zap-object work queue one dataset at a time.  Returns early whenever
 * scn_pausing is set so state can be synced out.
 */
static void
dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	zap_cursor_t zc;
	zap_attribute_t za;

	/* Finish (or resume) the DDT walk before any dataset traversal. */
	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
	    scn->scn_phys.scn_ddt_class_max) {
		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
		dsl_scan_ddt(scn, tx);
		if (scn->scn_pausing)
			return;
	}

	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
		/* First do the MOS & ORIGIN */

		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
		dsl_scan_visit_rootbp(scn, NULL,
		    &dp->dp_meta_rootbp, tx);
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
		if (scn->scn_pausing)
			return;

		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
			/* Old pools: enumerate every dataset into the queue. */
			VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
		} else {
			dsl_scan_visitds(scn,
			    dp->dp_origin_snap->ds_object, tx);
		}
		ASSERT(!scn->scn_pausing);
	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
	    ZB_DESTROYED_OBJSET) {
		/*
		 * If we were paused, continue from here.  Note if the
		 * ds we were paused on was deleted, the zb_objset may
		 * be -1, so we will skip this and find a new objset
		 * below.
		 */
		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
		if (scn->scn_pausing)
			return;
	}

	/*
	 * In case we were paused right at the end of the ds, zero the
	 * bookmark so we don't think that we're still trying to resume.
	 */
	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));

	/* keep pulling things out of the zap-object-as-queue */
	/*
	 * The cursor is re-initialized each iteration (comma operator)
	 * because dsl_scan_visitds() may add or remove queue entries.
	 */
	while (zap_cursor_init(&zc, dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj),
	    zap_cursor_retrieve(&zc, &za) == 0) {
		dsl_dataset_t *ds;
		uint64_t dsobj;

		dsobj = strtonum(za.za_name, NULL);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, dsobj, tx));

		/* Set up min/max txg */
		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		if (za.za_first_integer != 0) {
			scn->scn_phys.scn_cur_min_txg =
			    MAX(scn->scn_phys.scn_min_txg,
			    za.za_first_integer);
		} else {
			scn->scn_phys.scn_cur_min_txg =
			    MAX(scn->scn_phys.scn_min_txg,
			    ds->ds_phys->ds_prev_snap_txg);
		}
		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
		dsl_dataset_rele(ds, FTAG);

		dsl_scan_visitds(scn, dsobj, tx);
		zap_cursor_fini(&zc);
		if (scn->scn_pausing)
			return;
	}
	zap_cursor_fini(&zc);
}
01372 
01373 static boolean_t
01374 dsl_scan_free_should_pause(dsl_scan_t *scn)
01375 {
01376         uint64_t elapsed_nanosecs;
01377 
01378         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
01379         return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
01380             (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
01381             txg_sync_waiting(scn->scn_dp)) ||
01382             spa_shutting_down(scn->scn_dp->dp_spa));
01383 }
01384 
/*
 * bpobj/bptree iteration callback that frees one block pointer.
 * Checks the pause condition first (returning ERESTART suspends the
 * iteration so it can be resumed next txg), then issues an async free
 * and subtracts the freed space from the free dir's accounting.
 */
static int
dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg;

	/*
	 * For bptree traversals, only consider pausing at level-0,
	 * non-objset blocks.
	 */
	if (!scn->scn_is_bptree ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
		if (dsl_scan_free_should_pause(scn))
			return (ERESTART);
	}

	/* Issue the free asynchronously under the scan's root zio. */
	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
	    dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
	scn->scn_visited_this_txg++;
	return (0);
}
01404 
01405 boolean_t
01406 dsl_scan_active(dsl_scan_t *scn)
01407 {
01408         spa_t *spa = scn->scn_dp->dp_spa;
01409         uint64_t used = 0, comp, uncomp;
01410 
01411         if (spa->spa_load_state != SPA_LOAD_NONE)
01412                 return (B_FALSE);
01413         if (spa_shutting_down(spa))
01414                 return (B_FALSE);
01415 
01416         if (scn->scn_phys.scn_state == DSS_SCANNING)
01417                 return (B_TRUE);
01418 
01419         if (spa_feature_is_active(spa,
01420             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
01421                 return (B_TRUE);
01422         }
01423         if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
01424                 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
01425                     &used, &comp, &uncomp);
01426         }
01427         return (used != 0);
01428 }
01429 
01430 void
01431 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
01432 {
01433         dsl_scan_t *scn = dp->dp_scan;
01434         spa_t *spa = dp->dp_spa;
01435         int err;
01436 
01437         /*
01438          * Check for scn_restart_txg before checking spa_load_state, so
01439          * that we can restart an old-style scan while the pool is being
01440          * imported (see dsl_scan_init).
01441          */
01442         if (scn->scn_restart_txg != 0 &&
01443             scn->scn_restart_txg <= tx->tx_txg) {
01444                 pool_scan_func_t func = POOL_SCAN_SCRUB;
01445                 dsl_scan_done(scn, B_FALSE, tx);
01446                 if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
01447                         func = POOL_SCAN_RESILVER;
01448                 zfs_dbgmsg("restarting scan func=%u txg=%llu",
01449                     func, tx->tx_txg);
01450                 dsl_scan_setup_sync(scn, &func, tx);
01451         }
01452 
01453         if (!dsl_scan_active(scn) ||
01454             spa_sync_pass(dp->dp_spa) > 1)
01455                 return;
01456 
01457         scn->scn_visited_this_txg = 0;
01458         scn->scn_pausing = B_FALSE;
01459         scn->scn_sync_start_time = gethrtime();
01460         spa->spa_scrub_active = B_TRUE;
01461 
01462         /*
01463          * First process the free list.  If we pause the free, don't do
01464          * any scanning.  This ensures that there is no free list when
01465          * we are scanning, so the scan code doesn't have to worry about
01466          * traversing it.
01467          */
01468         if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
01469                 scn->scn_is_bptree = B_FALSE;
01470                 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
01471                     NULL, ZIO_FLAG_MUSTSUCCEED);
01472                 err = bpobj_iterate(&dp->dp_free_bpobj,
01473                     dsl_scan_free_block_cb, scn, tx);
01474                 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
01475 
01476                 if (err == 0 && spa_feature_is_active(spa,
01477                     &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
01478                         scn->scn_is_bptree = B_TRUE;
01479                         scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
01480                             NULL, ZIO_FLAG_MUSTSUCCEED);
01481                         err = bptree_iterate(dp->dp_meta_objset,
01482                             dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
01483                             scn, tx);
01484                         VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
01485                         if (err != 0)
01486                                 return;
01487 
01488                         /* disable async destroy feature */
01489                         spa_feature_decr(spa,
01490                             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
01491                         ASSERT(!spa_feature_is_active(spa,
01492                             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
01493                         VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
01494                             DMU_POOL_DIRECTORY_OBJECT,
01495                             DMU_POOL_BPTREE_OBJ, tx));
01496                         VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
01497                             dp->dp_bptree_obj, tx));
01498                         dp->dp_bptree_obj = 0;
01499                 }
01500                 if (scn->scn_visited_this_txg) {
01501                         zfs_dbgmsg("freed %llu blocks in %llums from "
01502                             "free_bpobj/bptree txg %llu",
01503                             (longlong_t)scn->scn_visited_this_txg,
01504                             (longlong_t)
01505                             (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
01506                             (longlong_t)tx->tx_txg);
01507                         scn->scn_visited_this_txg = 0;
01508                         /*
01509                          * Re-sync the ddt so that we can further modify
01510                          * it when doing bprewrite.
01511                          */
01512                         ddt_sync(spa, tx->tx_txg);
01513                 }
01514                 if (err == ERESTART)
01515                         return;
01516         }
01517 
01518         if (scn->scn_phys.scn_state != DSS_SCANNING)
01519                 return;
01520 
01521         if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
01522             scn->scn_phys.scn_ddt_class_max) {
01523                 zfs_dbgmsg("doing scan sync txg %llu; "
01524                     "ddt bm=%llu/%llu/%llu/%llx",
01525                     (longlong_t)tx->tx_txg,
01526                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
01527                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
01528                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
01529                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
01530                 ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
01531                 ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
01532                 ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
01533                 ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
01534         } else {
01535                 zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
01536                     (longlong_t)tx->tx_txg,
01537                     (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
01538                     (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
01539                     (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
01540                     (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
01541         }
01542 
01543         scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
01544             NULL, ZIO_FLAG_CANFAIL);
01545         dsl_scan_visit(scn, tx);
01546         (void) zio_wait(scn->scn_zio_root);
01547         scn->scn_zio_root = NULL;
01548 
01549         zfs_dbgmsg("visited %llu blocks in %llums",
01550             (longlong_t)scn->scn_visited_this_txg,
01551             (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
01552 
01553         if (!scn->scn_pausing) {
01554                 /* finished with scan. */
01555                 zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
01556                 dsl_scan_done(scn, B_TRUE, tx);
01557         }
01558 
01559         if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
01560                 mutex_enter(&spa->spa_scrub_lock);
01561                 while (spa->spa_scrub_inflight > 0) {
01562                         cv_wait(&spa->spa_scrub_io_cv,
01563                             &spa->spa_scrub_lock);
01564                 }
01565                 mutex_exit(&spa->spa_scrub_lock);
01566         }
01567 
01568         dsl_scan_sync_state(scn, tx);
01569 }
01570 
01574 void
01575 dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
01576 {
01577         if (txg == 0) {
01578                 dmu_tx_t *tx;
01579                 tx = dmu_tx_create_dd(dp->dp_mos_dir);
01580                 VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
01581 
01582                 txg = dmu_tx_get_txg(tx);
01583                 dp->dp_scan->scn_restart_txg = txg;
01584                 dmu_tx_commit(tx);
01585         } else {
01586                 dp->dp_scan->scn_restart_txg = txg;
01587         }
01588         zfs_dbgmsg("restarting resilver txg=%llu", txg);
01589 }
01590 
01591 boolean_t
01592 dsl_scan_resilvering(dsl_pool_t *dp)
01593 {
01594         return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
01595             dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
01596 }
01597 
01598 /*
01599  * scrub consumers
01600  */
01601 
/*
 * Fold one examined block pointer into the pool-wide scrub statistics
 * (zab), accumulating counts/sizes both for the block's own
 * (level, type) bucket and for the aggregate "total" buckets.
 */
static void
count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
        int i;

        /*
         * If we resume after a reboot, zab will be NULL; don't record
         * incomplete stats in that case.
         */
        if (zab == NULL)
                return;

        for (i = 0; i < 4; i++) {
                /*
                 * The four iterations cover the cross product of
                 * {this block's level, DN_MAX_LEVELS (all-levels total)} x
                 * {this block's type, DMU_OT_TOTAL (all-types total)}.
                 */
                int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
                int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
                /* collapse feature-flag ("new") object types into OTHER */
                if (t & DMU_OT_NEWTYPE)
                        t = DMU_OT_OTHER;
                zfs_blkstat_t *zb = &zab->zab_type[l][t];
                int equal;

                zb->zb_count++;
                zb->zb_asize += BP_GET_ASIZE(bp);
                zb->zb_lsize += BP_GET_LSIZE(bp);
                zb->zb_psize += BP_GET_PSIZE(bp);
                zb->zb_gangs += BP_COUNT_GANG(bp);

                /* ditto accounting: note copies that landed on the same vdev */
                switch (BP_GET_NDVAS(bp)) {
                case 2:
                        if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
                            DVA_GET_VDEV(&bp->blk_dva[1]))
                                zb->zb_ditto_2_of_2_samevdev++;
                        break;
                case 3:
                        /*
                         * Count how many of the three DVA pairs share a
                         * vdev: 1 => exactly two copies colocated,
                         * 3 => all three on one vdev.
                         */
                        equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
                            DVA_GET_VDEV(&bp->blk_dva[1])) +
                            (DVA_GET_VDEV(&bp->blk_dva[0]) ==
                            DVA_GET_VDEV(&bp->blk_dva[2])) +
                            (DVA_GET_VDEV(&bp->blk_dva[1]) ==
                            DVA_GET_VDEV(&bp->blk_dva[2]));
                        if (equal == 1)
                                zb->zb_ditto_2_of_3_samevdev++;
                        else if (equal == 3)
                                zb->zb_ditto_3_of_3_samevdev++;
                        break;
                }
        }
}
01649 
01650 static void
01651 dsl_scan_scrub_done(zio_t *zio)
01652 {
01653         spa_t *spa = zio->io_spa;
01654 
01655         zio_data_buf_free(zio->io_data, zio->io_size);
01656 
01657         mutex_enter(&spa->spa_scrub_lock);
01658         spa->spa_scrub_inflight--;
01659         cv_broadcast(&spa->spa_scrub_io_cv);
01660 
01661         if (zio->io_error && (zio->io_error != ECKSUM ||
01662             !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
01663                 spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
01664         }
01665         mutex_exit(&spa->spa_scrub_lock);
01666 }
01667 
01668 static int
01669 dsl_scan_scrub_cb(dsl_pool_t *dp,
01670     const blkptr_t *bp, const zbookmark_t *zb)
01671 {
01672         dsl_scan_t *scn = dp->dp_scan;
01673         size_t size = BP_GET_PSIZE(bp);
01674         spa_t *spa = dp->dp_spa;
01675         uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
01676         boolean_t needs_io;
01677         int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
01678         int zio_priority;
01679         unsigned int scan_delay = 0;
01680 
01681         if (phys_birth <= scn->scn_phys.scn_min_txg ||
01682             phys_birth >= scn->scn_phys.scn_max_txg)
01683                 return (0);
01684 
01685         count_block(dp->dp_blkstats, bp);
01686 
01687         ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
01688         if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
01689                 zio_flags |= ZIO_FLAG_SCRUB;
01690                 zio_priority = ZIO_PRIORITY_SCRUB;
01691                 needs_io = B_TRUE;
01692                 scan_delay = zfs_scrub_delay;
01693         } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
01694                 zio_flags |= ZIO_FLAG_RESILVER;
01695                 zio_priority = ZIO_PRIORITY_RESILVER;
01696                 needs_io = B_FALSE;
01697                 scan_delay = zfs_resilver_delay;
01698         }
01699 
01700         /* If it's an intent log block, failure is expected. */
01701         if (zb->zb_level == ZB_ZIL_LEVEL)
01702                 zio_flags |= ZIO_FLAG_SPECULATIVE;
01703 
01704         for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
01705                 vdev_t *vd = vdev_lookup_top(spa,
01706                     DVA_GET_VDEV(&bp->blk_dva[d]));
01707 
01708                 /*
01709                  * Keep track of how much data we've examined so that
01710                  * zpool(1M) status can make useful progress reports.
01711                  */
01712                 scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
01713                 spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
01714 
01715                 /* if it's a resilver, this may not be in the target range */
01716                 if (!needs_io) {
01717                         if (DVA_GET_GANG(&bp->blk_dva[d])) {
01718                                 /*
01719                                  * Gang members may be spread across multiple
01720                                  * vdevs, so the best estimate we have is the
01721                                  * scrub range, which has already been checked.
01722                                  * XXX -- it would be better to change our
01723                                  * allocation policy to ensure that all
01724                                  * gang members reside on the same vdev.
01725                                  */
01726                                 needs_io = B_TRUE;
01727                         } else {
01728                                 needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
01729                                     phys_birth, 1);
01730                         }
01731                 }
01732         }
01733 
01734         if (needs_io && !zfs_no_scrub_io) {
01735                 vdev_t *rvd = spa->spa_root_vdev;
01736                 uint64_t maxinflight = rvd->vdev_children *
01737                     MAX(zfs_top_maxinflight, 1);
01738                 void *data = zio_data_buf_alloc(size);
01739 
01740                 mutex_enter(&spa->spa_scrub_lock);
01741                 while (spa->spa_scrub_inflight >= maxinflight)
01742                         cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
01743                 spa->spa_scrub_inflight++;
01744                 mutex_exit(&spa->spa_scrub_lock);
01745 
01746                 /*
01747                  * If we're seeing recent (zfs_scan_idle) "important" I/Os
01748                  * then throttle our workload to limit the impact of a scan.
01749                  */
01750                 if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
01751                         delay(MAX((int)scan_delay, 0));
01752 
01753                 zio_nowait(zio_read(NULL, spa, bp, data, size,
01754                     dsl_scan_scrub_done, NULL, zio_priority,
01755                     zio_flags, zb));
01756         }
01757 
01758         /* do not relocate this block */
01759         return (0);
01760 }
01761 
01762 int
01763 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
01764 {
01765         spa_t *spa = dp->dp_spa;
01766 
01767         /*
01768          * Purge all vdev caches and probe all devices.  We do this here
01769          * rather than in sync context because this requires a writer lock
01770          * on the spa_config lock, which we can't do from sync context.  The
01771          * spa_scrub_reopen flag indicates that vdev_open() should not
01772          * attempt to start another scrub.
01773          */
01774         spa_vdev_state_enter(spa, SCL_NONE);
01775         spa->spa_scrub_reopen = B_TRUE;
01776         vdev_reopen(spa->spa_root_vdev);
01777         spa->spa_scrub_reopen = B_FALSE;
01778         (void) spa_vdev_state_exit(spa, NULL, 0);
01779 
01780         return (dsl_sync_task_do(dp, dsl_scan_setup_check,
01781             dsl_scan_setup_sync, dp->dp_scan, &func, 0));
01782 }
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines