FreeBSD ZFS
The Zettabyte File System
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/dsl_scan.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif

typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);

static scan_cb_t dsl_scan_defrag_cb;
static scan_cb_t dsl_scan_scrub_cb;
static scan_cb_t dsl_scan_remove_cb;
static dsl_syncfunc_t dsl_scan_cancel_sync;
static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);

unsigned int zfs_top_maxinflight = 32;	/* maximum I/Os per top-level vdev */
unsigned int zfs_resilver_delay = 2;	/* number of ticks to delay resilver */
unsigned int zfs_scrub_delay = 4;	/* number of ticks to delay scrub */
unsigned int zfs_scan_idle = 50;	/* idle scan window in clock ticks */
unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
unsigned int zfs_resilver_min_time_ms = 3000; /* min msecs to resilver per txg */
boolean_t zfs_no_scrub_io = B_FALSE;	/* set to disable scrub I/O */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.top_maxinflight", &zfs_top_maxinflight);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RW,
    &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev");
TUNABLE_INT("vfs.zfs.resilver_delay", &zfs_resilver_delay);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RW,
    &zfs_resilver_delay, 0, "Number of ticks to delay resilver");
TUNABLE_INT("vfs.zfs.scrub_delay", &zfs_scrub_delay);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RW,
    &zfs_scrub_delay, 0, "Number of ticks to delay scrub");
TUNABLE_INT("vfs.zfs.scan_idle", &zfs_scan_idle);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RW,
    &zfs_scan_idle, 0, "Idle scan window in clock ticks");
TUNABLE_INT("vfs.zfs.scan_min_time_ms", &zfs_scan_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RW,
    &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
TUNABLE_INT("vfs.zfs.free_min_time_ms", &zfs_free_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RW,
    &zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
TUNABLE_INT("vfs.zfs.resilver_min_time_ms", &zfs_resilver_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RW,
    &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
TUNABLE_INT("vfs.zfs.no_scrub_io", &zfs_no_scrub_io);
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RW,
    &zfs_no_scrub_io, 0, "Disable scrub I/O");
TUNABLE_INT("vfs.zfs.no_scrub_prefetch", &zfs_no_scrub_prefetch);
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RW,
    &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");

enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max DDT class to scrub */

#define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)

extern int zfs_txg_timeout;

/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
	NULL,
	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
};
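/*
 * Illustrative sketch (not part of the original file): scan_funcs is a
 * dispatch table indexed by pool_scan_func_t, so every entry must have the
 * scan_cb_t signature declared above.  A hypothetical callback that merely
 * counted visited blocks could look like the following; "example_count_cb"
 * and its counter are invented names for illustration only.
 */
#if 0	/* example only */
static uint64_t example_blocks_visited;

static int
example_count_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_t *zb)
{
	example_blocks_visited++;	/* tally each block handed to us */
	return (0);			/* nonzero would report an error */
}
#endif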
int
dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
{
	int err;
	dsl_scan_t *scn;
	spa_t *spa = dp->dp_spa;
	uint64_t f;

	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
	scn->scn_dp = dp;

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    "scrub_func", sizeof (uint64_t), 1, &f);
	if (err == 0) {
		/*
		 * There was an old-style scrub in progress.  Restart a
		 * new-style scrub from the beginning.
		 */
		scn->scn_restart_txg = txg;
		zfs_dbgmsg("old-style scrub was in progress; "
		    "restarting new-style scrub in txg %llu",
		    scn->scn_restart_txg);

		/*
		 * Load the queue obj from the old location so that it
		 * can be freed by dsl_scan_done().
		 */
		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    "scrub_queue", sizeof (uint64_t), 1,
		    &scn->scn_phys.scn_queue_obj);
	} else {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
		    &scn->scn_phys);
		if (err == ENOENT)
			return (0);
		else if (err)
			return (err);

		if (scn->scn_phys.scn_state == DSS_SCANNING &&
		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
			/*
			 * A new-type scrub was in progress on an old
			 * pool, and the pool was accessed by old
			 * software.  Restart from the beginning, since
			 * the old software may have changed the pool in
			 * the meantime.
			 */
			scn->scn_restart_txg = txg;
			zfs_dbgmsg("new-style scrub was modified "
			    "by old software; restarting in txg %llu",
			    scn->scn_restart_txg);
		}
	}

	spa_scan_stat_init(spa);
	return (0);
}

void
dsl_scan_fini(dsl_pool_t *dp)
{
	if (dp->dp_scan) {
		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
		dp->dp_scan = NULL;
	}
}
/* ARGSUSED */
static int
dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg1;

	if (scn->scn_phys.scn_state == DSS_SCANNING)
		return (EBUSY);

	return (0);
}

/* ARGSUSED */
static void
dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg1;
	pool_scan_func_t *funcp = arg2;
	dmu_object_type_t ot = 0;
	dsl_pool_t *dp = scn->scn_dp;
	spa_t *spa = dp->dp_spa;

	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
	scn->scn_phys.scn_func = *funcp;
	scn->scn_phys.scn_state = DSS_SCANNING;
	scn->scn_phys.scn_min_txg = 0;
	scn->scn_phys.scn_max_txg = tx->tx_txg;
	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
	scn->scn_phys.scn_start_time = gethrestime_sec();
	scn->scn_phys.scn_errors = 0;
	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
	scn->scn_restart_txg = 0;
	spa_scan_stat_init(spa);

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;

		/* rewrite all disk labels */
		vdev_config_dirty(spa->spa_root_vdev);

		if (vdev_resilver_needed(spa->spa_root_vdev,
		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
			spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
		} else {
			spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
		}

		spa->spa_scrub_started = B_TRUE;
		/*
		 * If this is an incremental scrub, limit the DDT scrub phase
		 * to just the auto-ditto class (for correctness); the rest
		 * of the scrub should go faster using top-down pruning.
		 */
		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
	}

	/* back to the generic stuff */

	if (dp->dp_blkstats == NULL) {
		dp->dp_blkstats =
		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
	}
	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));

	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
		ot = DMU_OT_ZAP_OTHER;

	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);

	dsl_scan_sync_state(scn, tx);

	spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
	    "func=%u mintxg=%llu maxtxg=%llu",
	    *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
}
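/*
 * Illustrative sketch (not part of the original file): the check/sync pair
 * above runs as a synctask.  dsl_scan() at the bottom of this file drives it
 * exactly this way; here it is shown in isolation, with a scrub as the scan
 * function and "example_start_scrub" as an invented wrapper name.
 */
#if 0	/* example only */
static int
example_start_scrub(dsl_pool_t *dp)
{
	pool_scan_func_t func = POOL_SCAN_SCRUB;

	/* dsl_scan_setup_check runs first; EBUSY if a scan is in progress */
	return (dsl_sync_task_do(dp, dsl_scan_setup_check,
	    dsl_scan_setup_sync, dp->dp_scan, &func, 0));
}
#endif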
/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
{
	static const char *old_names[] = {
		"scrub_bookmark",
		"scrub_ddt_bookmark",
		"scrub_ddt_class_max",
		"scrub_queue",
		"scrub_min_txg",
		"scrub_max_txg",
		"scrub_func",
		"scrub_errors",
		NULL
	};

	dsl_pool_t *dp = scn->scn_dp;
	spa_t *spa = dp->dp_spa;
	int i;

	/* Remove any remnants of an old-style scrub. */
	for (i = 0; old_names[i]; i++) {
		(void) zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
	}

	if (scn->scn_phys.scn_queue_obj != 0) {
		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, tx));
		scn->scn_phys.scn_queue_obj = 0;
	}

	/*
	 * If we were "restarted" from a stopped state, don't bother
	 * with anything else.
	 */
	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (complete)
		scn->scn_phys.scn_state = DSS_FINISHED;
	else
		scn->scn_phys.scn_state = DSS_CANCELED;

	spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
	    "complete=%u", complete);

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight > 0) {
			cv_wait(&spa->spa_scrub_io_cv,
			    &spa->spa_scrub_lock);
		}
		mutex_exit(&spa->spa_scrub_lock);
		spa->spa_scrub_started = B_FALSE;
		spa->spa_scrub_active = B_FALSE;

		/*
		 * If the scrub/resilver completed, update all DTLs to
		 * reflect this.  Whether it succeeded or not, vacate
		 * all temporary scrub DTLs.
		 */
		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
		if (complete) {
			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
			    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
		}
		spa_errlog_rotate(spa);

		/*
		 * We may have finished replacing a device.
		 * Let the async thread assess this and handle the detach.
		 */
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
	}

	scn->scn_phys.scn_end_time = gethrestime_sec();
}
/* ARGSUSED */
static int
dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg1;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return (ENOENT);
	return (0);
}

/* ARGSUSED */
static void
dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg1;

	dsl_scan_done(scn, B_FALSE, tx);
	dsl_scan_sync_state(scn, tx);
}

int
dsl_scan_cancel(dsl_pool_t *dp)
{
	boolean_t complete = B_FALSE;
	int err;

	err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
	    dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
	return (err);
}

static void dsl_scan_visitbp(blkptr_t *bp,
    const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
    dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
    dmu_tx_t *tx);
static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
    dmu_objset_type_t ostype,
    dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);

void
dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
{
	zio_free(dp->dp_spa, txg, bp);
}

void
dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
{
	ASSERT(dsl_pool_sync_context(dp));
	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
	    pio->io_flags));
}

int
dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
    arc_done_func_t *done, void *private, int priority, int zio_flags,
    uint32_t *arc_flags, const zbookmark_t *zb)
{
	return (arc_read(pio, spa, bpp, pbuf, done, private,
	    priority, zio_flags, arc_flags, zb));
}

int
dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
    arc_done_func_t *done, void *private, int priority, int zio_flags,
    uint32_t *arc_flags, const zbookmark_t *zb)
{
	return (arc_read_nolock(pio, spa, bpp, done, private,
	    priority, zio_flags, arc_flags, zb));
}

static uint64_t
dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
{
	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
	if (dsl_dataset_is_snapshot(ds))
		return (MIN(smt, ds->ds_phys->ds_creation_txg));
	return (smt);
}

static void
dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
{
	VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
	    &scn->scn_phys, tx));
}
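/*
 * Illustrative sketch (not part of the original file): dsl_scan_cancel() is
 * the public entry point for aborting an in-flight scan.  A hypothetical
 * caller (the name "example_abort_scan" is invented) just needs the pool:
 */
#if 0	/* example only */
static void
example_abort_scan(dsl_pool_t *dp)
{
	int err = dsl_scan_cancel(dp);

	/* ENOENT simply means no scan was running (dsl_scan_cancel_check) */
	if (err != 0 && err != ENOENT)
		zfs_dbgmsg("scan cancel failed: %d", err);
}
#endif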
static boolean_t
dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
{
	uint64_t elapsed_nanosecs;
	unsigned int mintime;

	/* we never skip user/group accounting objects */
	if (zb && (int64_t)zb->zb_object < 0)
		return (B_FALSE);

	if (scn->scn_pausing)
		return (B_TRUE); /* we're already pausing */

	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
		return (B_FALSE); /* we're resuming */

	/* We only know how to resume from level-0 blocks. */
	if (zb && zb->zb_level != 0)
		return (B_FALSE);

	mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
	    (elapsed_nanosecs / MICROSEC > mintime &&
	    txg_sync_waiting(scn->scn_dp)) ||
	    spa_shutting_down(scn->scn_dp->dp_spa)) {
		if (zb) {
			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			scn->scn_phys.scn_bookmark = *zb;
		}
		dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
		scn->scn_pausing = B_TRUE;
		return (B_TRUE);
	}
	return (B_FALSE);
}

typedef struct zil_scan_arg {
	dsl_pool_t	*zsa_dp;
	zil_header_t	*zsa_zh;
} zil_scan_arg_t;

/* ARGSUSED */
static int
dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	zil_scan_arg_t *zsa = arg;
	dsl_pool_t *dp = zsa->zsa_dp;
	dsl_scan_t *scn = dp->dp_scan;
	zil_header_t *zh = zsa->zsa_zh;
	zbookmark_t zb;

	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
		return (0);

	/*
	 * One block ("stubby") can be allocated a long time ago; we
	 * want to visit that one because it has been allocated
	 * (on-disk) even if it hasn't been claimed (even though for
	 * scrub there's nothing to do to it).
	 */
	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
		return (0);

	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
	return (0);
}
/* ARGSUSED */
static int
dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	if (lrc->lrc_txtype == TX_WRITE) {
		zil_scan_arg_t *zsa = arg;
		dsl_pool_t *dp = zsa->zsa_dp;
		dsl_scan_t *scn = dp->dp_scan;
		zil_header_t *zh = zsa->zsa_zh;
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_t zb;

		if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
			return (0);

		/*
		 * birth can be < claim_txg if this record's txg is
		 * already txg sync'ed (but this log block contains
		 * other records that are not synced)
		 */
		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    lr->lr_foid, ZB_ZIL_LEVEL,
		    lr->lr_offset / BP_GET_LSIZE(bp));

		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
	}
	return (0);
}

static void
dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;
	zil_scan_arg_t zsa = { dp, zh };
	zilog_t *zilog;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed (or, in read-only mode, blocks that *would* be claimed).
	 */
	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
		return;

	zilog = zil_alloc(dp->dp_meta_objset, zh);

	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
	    claim_txg);

	zil_free(zilog);
}

/* ARGSUSED */
static void
dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
    uint64_t objset, uint64_t object, uint64_t blkid)
{
	zbookmark_t czb;
	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;

	if (zfs_no_scrub_prefetch)
		return;

	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
		return;

	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);

	/*
	 * XXX need to make sure all of these arc_read() prefetches are
	 * done before setting xlateall (similar to dsl_read())
	 */
	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
}

static boolean_t
dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
    const zbookmark_t *zb)
{
	/*
	 * We never skip over user/group accounting objects (obj<0)
	 */
	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
	    (int64_t)zb->zb_object >= 0) {
		/*
		 * If we already visited this bp & everything below (in
		 * a prior txg sync), don't bother doing it again.
		 */
		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
			return (B_TRUE);

		/*
		 * If we found the block we're trying to resume from, or
		 * we went past it to a different object, zero it out to
		 * indicate that it's OK to start checking for pausing
		 * again.
		 */
		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
			dprintf("resuming at %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
		}
	}
	return (B_FALSE);
}
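/*
 * Illustrative sketch (not part of the original file): the pause/resume
 * handshake above reduces to "store a bookmark when time runs out, skip
 * work up to that bookmark on the next pass, then clear it".  A simplified
 * model with invented types and names ("example_*") shows the round trip:
 */
#if 0	/* example only */
typedef struct example_scan {
	uint64_t es_bookmark;	/* 0 means "not paused" (cf. ZB_IS_ZERO) */
	boolean_t es_pausing;
} example_scan_t;

static boolean_t
example_check_pause(example_scan_t *es, uint64_t pos, boolean_t out_of_time)
{
	if (es->es_bookmark != 0)
		return (B_FALSE);		/* resuming; don't re-pause */
	if (out_of_time) {
		es->es_bookmark = pos;		/* remember where we stopped */
		es->es_pausing = B_TRUE;
		return (B_TRUE);
	}
	return (B_FALSE);
}

static boolean_t
example_check_resume(example_scan_t *es, uint64_t pos)
{
	if (es->es_bookmark == 0)
		return (B_FALSE);		/* not resuming */
	if (pos < es->es_bookmark)
		return (B_TRUE);		/* already visited; skip */
	es->es_bookmark = 0;			/* reached it; scan normally */
	return (B_FALSE);
}
#endif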
/*
 * \param bufp	return location for new buf to write out
 *
 * \return	nonzero on i/o error.
 */
static int
dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
    dnode_phys_t *dnp, const blkptr_t *bp,
    const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
{
	dsl_pool_t *dp = scn->scn_dp;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
	int err;

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, bufp,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
			dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
			    zb->zb_object, zb->zb_blkid * epb + i);
		}
		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
			zbookmark_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			dsl_scan_visitbp(cbp, &czb, dnp,
			    *bufp, ds, scn, ostype, tx);
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
		uint32_t flags = ARC_WAIT;

		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, bufp,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		uint32_t flags = ARC_WAIT;
		dnode_phys_t *cdnp;
		int i, j;
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, bufp,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
			for (j = 0; j < cdnp->dn_nblkptr; j++) {
				blkptr_t *cbp = &cdnp->dn_blkptr[j];
				dsl_scan_prefetch(scn, *bufp, cbp,
				    zb->zb_objset, zb->zb_blkid * epb + i, j);
			}
		}
		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
			dsl_scan_visitdnode(scn, ds, ostype,
			    cdnp, *bufp, zb->zb_blkid * epb + i, tx);
		}

	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		uint32_t flags = ARC_WAIT;
		objset_phys_t *osp;

		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, bufp,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}

		osp = (*bufp)->b_data;

		dsl_scan_visitdnode(scn, ds, osp->os_type,
		    &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);

		if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
			/*
			 * We also always visit user/group accounting
			 * objects, and never skip them, even if we are
			 * pausing.  This is necessary so that the space
			 * deltas from this txg get integrated.
			 */
			dsl_scan_visitdnode(scn, ds, osp->os_type,
			    &osp->os_groupused_dnode, *bufp,
			    DMU_GROUPUSED_OBJECT, tx);
			dsl_scan_visitdnode(scn, ds, osp->os_type,
			    &osp->os_userused_dnode, *bufp,
			    DMU_USERUSED_OBJECT, tx);
		}
	}

	return (0);
}
static void
dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
    dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
    uint64_t object, dmu_tx_t *tx)
{
	int j;

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		zbookmark_t czb;

		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
		    dnp->dn_nlevels - 1, j);
		dsl_scan_visitbp(&dnp->dn_blkptr[j],
		    &czb, dnp, buf, ds, scn, ostype, tx);
	}

	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		zbookmark_t czb;
		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
		    0, DMU_SPILL_BLKID);
		dsl_scan_visitbp(&dnp->dn_spill,
		    &czb, dnp, buf, ds, scn, ostype, tx);
	}
}

/*
 * The arguments are in this order because mdb can only print the
 * first 5; we want them to be useful.
 */
static void
dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
    dnode_phys_t *dnp, arc_buf_t *pbuf,
    dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
    dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	arc_buf_t *buf = NULL;
	blkptr_t bp_toread = *bp;

	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */

	if (dsl_scan_check_pause(scn, zb))
		return;

	if (dsl_scan_check_resume(scn, dnp, zb))
		return;

	if (bp->blk_birth == 0)
		return;

	scn->scn_visited_this_txg++;

	dprintf_bp(bp,
	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
	    ds, ds ? ds->ds_object : 0,
	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
	    pbuf, bp);

	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
		return;

	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
	    &buf) != 0)
		return;

	/*
	 * If dsl_scan_ddt() has already visited this block, it will have
	 * already done any translations or scrubbing, so don't call the
	 * callback again.
	 */
	if (ddt_class_contains(dp->dp_spa,
	    scn->scn_phys.scn_ddt_class_max, bp)) {
		ASSERT(buf == NULL);
		return;
	}

	/*
	 * If this block is from the future (after cur_max_txg), then we
	 * are doing this on behalf of a deleted snapshot, and we will
	 * revisit the future block on the next pass of this dataset.
	 * Don't scan it now unless we need to because something
	 * under it was modified.
	 */
	if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
	}
	if (buf)
		(void) arc_buf_remove_ref(buf, &buf);
}

static void
dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_tx_t *tx)
{
	zbookmark_t zb;

	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	dsl_scan_visitbp(bp, &zb, NULL, NULL,
	    ds, scn, DMU_OST_NONE, tx);

	dprintf_ds(ds, "finished scan%s", "");
}
void
dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
		if (dsl_dataset_is_snapshot(ds)) {
			/* Note, scn_cur_{min,max}_txg stays the same. */
			scn->scn_phys.scn_bookmark.zb_objset =
			    ds->ds_phys->ds_next_snap_obj;
			zfs_dbgmsg("destroying ds %llu; currently traversing; "
			    "reset zb_objset to %llu",
			    (u_longlong_t)ds->ds_object,
			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
		} else {
			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
			    ZB_DESTROYED_OBJSET, 0, 0, 0);
			zfs_dbgmsg("destroying ds %llu; currently traversing; "
			    "reset bookmark to -1,0,0,0",
			    (u_longlong_t)ds->ds_object);
		}
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
		ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
		if (dsl_dataset_is_snapshot(ds)) {
			/*
			 * We keep the same mintxg; it could be >
			 * ds_creation_txg if the previous snapshot was
			 * deleted too.
			 */
			VERIFY(zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj,
			    ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
			zfs_dbgmsg("destroying ds %llu; in queue; "
			    "replacing with %llu",
			    (u_longlong_t)ds->ds_object,
			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
		} else {
			zfs_dbgmsg("destroying ds %llu; in queue; removing",
			    (u_longlong_t)ds->ds_object);
		}
	} else {
		zfs_dbgmsg("destroying ds %llu; ignoring",
		    (u_longlong_t)ds->ds_object);
	}

	/*
	 * dsl_scan_sync() should be called after this, and should sync
	 * out our changed state, but just to be safe, do it here.
	 */
	dsl_scan_sync_state(scn, tx);
}
void
dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);

	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset =
		    ds->ds_phys->ds_prev_snap_obj;
		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds->ds_object,
		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj,
		    ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
		zfs_dbgmsg("snapshotting ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds->ds_object,
		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
	}
	dsl_scan_sync_state(scn, tx);
}

void
dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds1->ds_object,
		    (u_longlong_t)ds2->ds_object);
	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds2->ds_object,
		    (u_longlong_t)ds1->ds_object);
	}

	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
	    ds1->ds_object, &mintxg) == 0) {
		int err;

		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
		err = zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
		VERIFY(err == 0 || err == EEXIST);
		if (err == EEXIST) {
			/* Both were there to begin with */
			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj,
			    ds1->ds_object, mintxg, tx));
		}
		zfs_dbgmsg("clone_swap ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds1->ds_object,
		    (u_longlong_t)ds2->ds_object);
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
		zfs_dbgmsg("clone_swap ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds2->ds_object,
		    (u_longlong_t)ds1->ds_object);
	}

	dsl_scan_sync_state(scn, tx);
}
struct enqueue_clones_arg {
	dmu_tx_t *tx;
	uint64_t originobj;
};

/* ARGSUSED */
static int
enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	struct enqueue_clones_arg *eca = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_pool_t *dp = spa->spa_dsl_pool;
	dsl_scan_t *scn = dp->dp_scan;

	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
	if (err)
		return (err);

	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
			dsl_dataset_t *prev;
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);

			dsl_dataset_rele(ds, FTAG);
			if (err)
				return (err);
			ds = prev;
		}
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object,
		    ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
	}
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	dsl_dataset_t *ds;
	objset_t *os;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	if (dmu_objset_from_ds(ds, &os))
		goto out;

	/*
	 * Only the ZIL in the head (non-snapshot) is valid.  Even though
	 * snapshots can have ZIL block pointers (which may be the same
	 * BP as in the head), they must be ignored.  So we traverse the
	 * ZIL here, rather than in scan_recurse(), because the regular
	 * snapshot block-sharing rules don't apply to it.
	 */
	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
		dsl_scan_zil(dp, &os->os_zil_header);

	/*
	 * Iterate over the bps in this ds.
	 */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);

	char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
	dsl_dataset_name(ds, dsname);
	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
	    "pausing=%u",
	    (longlong_t)dsobj, dsname,
	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
	    (int)scn->scn_pausing);
	kmem_free(dsname, ZFS_MAXNAMELEN);

	if (scn->scn_pausing)
		goto out;

	/*
	 * We've finished this pass over this dataset.
	 */

	/*
	 * If we did not completely visit this dataset, do another pass.
	 */
	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
		zfs_dbgmsg("incomplete pass; visiting again");
		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object,
		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
		goto out;
	}

	/*
	 * Add descendent datasets to work queue.
	 */
	if (ds->ds_phys->ds_next_snap_obj != 0) {
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
		    ds->ds_phys->ds_creation_txg, tx) == 0);
	}
	if (ds->ds_phys->ds_num_children > 1) {
		boolean_t usenext = B_FALSE;
		if (ds->ds_phys->ds_next_clones_obj != 0) {
			uint64_t count;
			/*
			 * A bug in a previous version of the code could
			 * cause upgrade_clones_cb() to not set
			 * ds_next_snap_obj when it should, leading to a
			 * missing entry.  Therefore we can only use the
			 * next_clones_obj when its count is correct.
			 */
			int err = zap_count(dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj, &count);
			if (err == 0 &&
			    count == ds->ds_phys->ds_num_children - 1)
				usenext = B_TRUE;
		}

		if (usenext) {
			VERIFY(zap_join_key(dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj,
			    scn->scn_phys.scn_queue_obj,
			    ds->ds_phys->ds_creation_txg, tx) == 0);
		} else {
			struct enqueue_clones_arg eca;
			eca.tx = tx;
			eca.originobj = ds->ds_object;

			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
		}
	}

out:
	dsl_dataset_rele(ds, FTAG);
}
/* ARGSUSED */
static int
enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_pool_t *dp = spa->spa_dsl_pool;
	dsl_scan_t *scn = dp->dp_scan;

	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
	if (err)
		return (err);

	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		dsl_dataset_t *prev;
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		/*
		 * If this is a clone, we don't need to worry about it for now.
		 */
		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			dsl_dataset_rele(prev, FTAG);
			return (0);
		}
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
	}

	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
	    ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}
static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
	ddt_entry_t dde = { 0 };
	int error;
	uint64_t n = 0;

	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
		ddt_t *ddt;

		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
			break;
		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
		    (longlong_t)ddb->ddb_class,
		    (longlong_t)ddb->ddb_type,
		    (longlong_t)ddb->ddb_checksum,
		    (longlong_t)ddb->ddb_cursor);

		/* There should be no pending changes to the dedup table */
		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
		ASSERT(avl_first(&ddt->ddt_tree) == NULL);

		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
		n++;

		if (dsl_scan_check_pause(scn, NULL))
			break;
	}

	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
	    (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
	    (int)scn->scn_pausing);

	ASSERT(error == 0 || error == ENOENT);
	ASSERT(error != ENOENT ||
	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
}

/* ARGSUSED */
void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
	const ddt_key_t *ddk = &dde->dde_key;
	ddt_phys_t *ddp = dde->dde_phys;
	blkptr_t bp;
	zbookmark_t zb = { 0 };

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (ddp->ddp_phys_birth == 0 ||
		    ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
			continue;
		ddt_bp_create(checksum, ddk, ddp, &bp);

		scn->scn_visited_this_txg++;
		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
	}
}
static void
dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	zap_cursor_t zc;
	zap_attribute_t za;

	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
	    scn->scn_phys.scn_ddt_class_max) {
		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
		dsl_scan_ddt(scn, tx);
		if (scn->scn_pausing)
			return;
	}

	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
		/* First do the MOS & ORIGIN */

		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
		dsl_scan_visit_rootbp(scn, NULL,
		    &dp->dp_meta_rootbp, tx);
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
		if (scn->scn_pausing)
			return;

		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
			VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
		} else {
			dsl_scan_visitds(scn,
			    dp->dp_origin_snap->ds_object, tx);
		}
		ASSERT(!scn->scn_pausing);
	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
	    ZB_DESTROYED_OBJSET) {
		/*
		 * If we were paused, continue from here.  Note if the
		 * ds we were paused on was deleted, the zb_objset may
		 * be -1, so we will skip this and find a new objset
		 * below.
		 */
		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
		if (scn->scn_pausing)
			return;
	}

	/*
	 * In case we were paused right at the end of the ds, zero the
	 * bookmark so we don't think that we're still trying to resume.
	 */
	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));

	/* keep pulling things out of the zap-object-as-queue */
	while (zap_cursor_init(&zc, dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj),
	    zap_cursor_retrieve(&zc, &za) == 0) {
		dsl_dataset_t *ds;
		uint64_t dsobj;

		dsobj = strtonum(za.za_name, NULL);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, dsobj, tx));

		/* Set up min/max txg */
		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		if (za.za_first_integer != 0) {
			scn->scn_phys.scn_cur_min_txg =
			    MAX(scn->scn_phys.scn_min_txg,
			    za.za_first_integer);
		} else {
			scn->scn_phys.scn_cur_min_txg =
			    MAX(scn->scn_phys.scn_min_txg,
			    ds->ds_phys->ds_prev_snap_txg);
		}
		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
		dsl_dataset_rele(ds, FTAG);

		dsl_scan_visitds(scn, dsobj, tx);
		zap_cursor_fini(&zc);
		if (scn->scn_pausing)
			return;
	}
	zap_cursor_fini(&zc);
}

static boolean_t
dsl_scan_free_should_pause(dsl_scan_t *scn)
{
	uint64_t elapsed_nanosecs;

	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
	    (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
	    txg_sync_waiting(scn->scn_dp)) ||
	    spa_shutting_down(scn->scn_dp->dp_spa));
}

static int
dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg;

	if (!scn->scn_is_bptree ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
		if (dsl_scan_free_should_pause(scn))
			return (ERESTART);
	}

	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
	    dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
	scn->scn_visited_this_txg++;
	return (0);
}

boolean_t
dsl_scan_active(dsl_scan_t *scn)
{
	spa_t *spa = scn->scn_dp->dp_spa;
	uint64_t used = 0, comp, uncomp;

	if (spa->spa_load_state != SPA_LOAD_NONE)
		return (B_FALSE);
	if (spa_shutting_down(spa))
		return (B_FALSE);

	if (scn->scn_phys.scn_state == DSS_SCANNING)
		return (B_TRUE);

	if (spa_feature_is_active(spa,
	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
		return (B_TRUE);
	}
	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
		    &used, &comp, &uncomp);
	}
	return (used != 0);
}
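/*
 * Illustrative sketch (not part of the original file): dsl_scan_free_block_cb
 * above signals "stop iterating, resume next txg" by returning ERESTART,
 * which bpobj_iterate()/bptree_iterate() propagate back to dsl_scan_sync().
 * The same pattern in miniature, with invented names:
 */
#if 0	/* example only */
static int
example_restartable_cb(void *arg, uint64_t item)
{
	example_scan_t *es = arg;	/* from the earlier sketch */

	if (es->es_pausing)
		return (ERESTART);	/* iterator unwinds; txg can sync */
	/* ... process item ... */
	return (0);
}
#endif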
void
dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dp->dp_scan;
	spa_t *spa = dp->dp_spa;
	int err;

	/*
	 * Check for scn_restart_txg before checking spa_load_state, so
	 * that we can restart an old-style scan while the pool is being
	 * imported (see dsl_scan_init).
	 */
	if (scn->scn_restart_txg != 0 &&
	    scn->scn_restart_txg <= tx->tx_txg) {
		pool_scan_func_t func = POOL_SCAN_SCRUB;
		dsl_scan_done(scn, B_FALSE, tx);
		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
			func = POOL_SCAN_RESILVER;
		zfs_dbgmsg("restarting scan func=%u txg=%llu",
		    func, tx->tx_txg);
		dsl_scan_setup_sync(scn, &func, tx);
	}

	if (!dsl_scan_active(scn) ||
	    spa_sync_pass(dp->dp_spa) > 1)
		return;

	scn->scn_visited_this_txg = 0;
	scn->scn_pausing = B_FALSE;
	scn->scn_sync_start_time = gethrtime();
	spa->spa_scrub_active = B_TRUE;

	/*
	 * First process the free list.  If we pause the free, don't do
	 * any scanning.  This ensures that there is no free list when
	 * we are scanning, so the scan code doesn't have to worry about
	 * traversing it.
	 */
	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		scn->scn_is_bptree = B_FALSE;
		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
		    NULL, ZIO_FLAG_MUSTSUCCEED);
		err = bpobj_iterate(&dp->dp_free_bpobj,
		    dsl_scan_free_block_cb, scn, tx);
		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));

		if (err == 0 && spa_feature_is_active(spa,
		    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
			scn->scn_is_bptree = B_TRUE;
			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
			    NULL, ZIO_FLAG_MUSTSUCCEED);
			err = bptree_iterate(dp->dp_meta_objset,
			    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
			    scn, tx);
			VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
			if (err != 0)
				return;

			/* disable async destroy feature */
			spa_feature_decr(spa,
			    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
			ASSERT(!spa_feature_is_active(spa,
			    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
			VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_BPTREE_OBJ, tx));
			VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
			    dp->dp_bptree_obj, tx));
			dp->dp_bptree_obj = 0;
		}
		if (scn->scn_visited_this_txg) {
			zfs_dbgmsg("freed %llu blocks in %llums from "
			    "free_bpobj/bptree txg %llu",
			    (longlong_t)scn->scn_visited_this_txg,
			    (longlong_t)
			    (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
			    (longlong_t)tx->tx_txg);
			scn->scn_visited_this_txg = 0;
			/*
			 * Re-sync the ddt so that we can further modify
			 * it when doing bprewrite.
			 */
			ddt_sync(spa, tx->tx_txg);
		}
		if (err == ERESTART)
			return;
	}

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
	    scn->scn_phys.scn_ddt_class_max) {
		zfs_dbgmsg("doing scan sync txg %llu; "
		    "ddt bm=%llu/%llu/%llu/%llx",
		    (longlong_t)tx->tx_txg,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
	} else {
		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
		    (longlong_t)tx->tx_txg,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
	}

	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
	    NULL, ZIO_FLAG_CANFAIL);
	dsl_scan_visit(scn, tx);
	(void) zio_wait(scn->scn_zio_root);
	scn->scn_zio_root = NULL;

	zfs_dbgmsg("visited %llu blocks in %llums",
	    (longlong_t)scn->scn_visited_this_txg,
	    (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);

	if (!scn->scn_pausing) {
		/* finished with scan. */
		zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
		dsl_scan_done(scn, B_TRUE, tx);
	}

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight > 0) {
			cv_wait(&spa->spa_scrub_io_cv,
			    &spa->spa_scrub_lock);
		}
		mutex_exit(&spa->spa_scrub_lock);
	}

	dsl_scan_sync_state(scn, tx);
}
void
dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
{
	if (txg == 0) {
		dmu_tx_t *tx;
		tx = dmu_tx_create_dd(dp->dp_mos_dir);
		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));

		txg = dmu_tx_get_txg(tx);
		dp->dp_scan->scn_restart_txg = txg;
		dmu_tx_commit(tx);
	} else {
		dp->dp_scan->scn_restart_txg = txg;
	}
	zfs_dbgmsg("restarting resilver txg=%llu", txg);
}

boolean_t
dsl_scan_resilvering(dsl_pool_t *dp)
{
	return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
}

/*
 * scrub consumers
 */

static void
count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
	int i;

	/*
	 * If we resume after a reboot, zab will be NULL; don't record
	 * incomplete stats in that case.
	 */
	if (zab == NULL)
		return;

	for (i = 0; i < 4; i++) {
		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
		if (t & DMU_OT_NEWTYPE)
			t = DMU_OT_OTHER;
		zfs_blkstat_t *zb = &zab->zab_type[l][t];
		int equal;

		zb->zb_count++;
		zb->zb_asize += BP_GET_ASIZE(bp);
		zb->zb_lsize += BP_GET_LSIZE(bp);
		zb->zb_psize += BP_GET_PSIZE(bp);
		zb->zb_gangs += BP_COUNT_GANG(bp);

		switch (BP_GET_NDVAS(bp)) {
		case 2:
			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1]))
				zb->zb_ditto_2_of_2_samevdev++;
			break;
		case 3:
			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) +
			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2])) +
			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2]));
			if (equal == 1)
				zb->zb_ditto_2_of_3_samevdev++;
			else if (equal == 3)
				zb->zb_ditto_3_of_3_samevdev++;
			break;
		}
	}
}
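/*
 * Illustrative sketch (not part of the original file): dsl_resilver_restart()
 * takes either an explicit txg or 0 ("assign one for me").  A hypothetical
 * caller reacting to a device reattach (the name "example_kick_resilver" is
 * invented) might do:
 */
#if 0	/* example only */
static void
example_kick_resilver(dsl_pool_t *dp)
{
	/* txg 0: let the function assign a txg from a new MOS transaction */
	dsl_resilver_restart(dp, 0);

	/*
	 * The restart takes effect when dsl_scan_sync() next notices
	 * scn_restart_txg; afterwards dsl_scan_resilvering() reports
	 * whether the active scan is a resilver.
	 */
}
#endif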
static void
dsl_scan_scrub_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);

	if (zio->io_error && (zio->io_error != ECKSUM ||
	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
	}
	mutex_exit(&spa->spa_scrub_lock);
}

static int
dsl_scan_scrub_cb(dsl_pool_t *dp,
    const blkptr_t *bp, const zbookmark_t *zb)
{
	dsl_scan_t *scn = dp->dp_scan;
	size_t size = BP_GET_PSIZE(bp);
	spa_t *spa = dp->dp_spa;
	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
	boolean_t needs_io;
	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
	int zio_priority;
	unsigned int scan_delay = 0;

	if (phys_birth <= scn->scn_phys.scn_min_txg ||
	    phys_birth >= scn->scn_phys.scn_max_txg)
		return (0);

	count_block(dp->dp_blkstats, bp);

	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
		zio_flags |= ZIO_FLAG_SCRUB;
		zio_priority = ZIO_PRIORITY_SCRUB;
		needs_io = B_TRUE;
		scan_delay = zfs_scrub_delay;
	} else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
		zio_flags |= ZIO_FLAG_RESILVER;
		zio_priority = ZIO_PRIORITY_RESILVER;
		needs_io = B_FALSE;
		scan_delay = zfs_resilver_delay;
	}

	/* If it's an intent log block, failure is expected. */
	if (zb->zb_level == ZB_ZIL_LEVEL)
		zio_flags |= ZIO_FLAG_SPECULATIVE;

	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
		vdev_t *vd = vdev_lookup_top(spa,
		    DVA_GET_VDEV(&bp->blk_dva[d]));

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);

		/* if it's a resilver, this may not be in the target range */
		if (!needs_io) {
			if (DVA_GET_GANG(&bp->blk_dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best estimate we have is the
				 * scrub range, which has already been checked.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that all
				 * gang members reside on the same vdev.
				 */
				needs_io = B_TRUE;
			} else {
				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
				    phys_birth, 1);
			}
		}
	}

	if (needs_io && !zfs_no_scrub_io) {
		vdev_t *rvd = spa->spa_root_vdev;
		uint64_t maxinflight = rvd->vdev_children *
		    MAX(zfs_top_maxinflight, 1);
		void *data = zio_data_buf_alloc(size);

		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight >= maxinflight)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_scrub_inflight++;
		mutex_exit(&spa->spa_scrub_lock);

		/*
		 * If we're seeing recent (zfs_scan_idle) "important" I/Os
		 * then throttle our workload to limit the impact of a scan.
		 */
		if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
			delay(MAX((int)scan_delay, 0));

		zio_nowait(zio_read(NULL, spa, bp, data, size,
		    dsl_scan_scrub_done, NULL, zio_priority,
		    zio_flags, zb));
	}

	/* do not relocate this block */
	return (0);
}
int
dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
{
	spa_t *spa = dp->dp_spa;

	/*
	 * Purge all vdev caches and probe all devices.  We do this here
	 * rather than in sync context because this requires a writer lock
	 * on the spa_config lock, which we can't do from sync context.  The
	 * spa_scrub_reopen flag indicates that vdev_open() should not
	 * attempt to start another scrub.
	 */
	spa_vdev_state_enter(spa, SCL_NONE);
	spa->spa_scrub_reopen = B_TRUE;
	vdev_reopen(spa->spa_root_vdev);
	spa->spa_scrub_reopen = B_FALSE;
	(void) spa_vdev_state_exit(spa, NULL, 0);

	return (dsl_sync_task_do(dp, dsl_scan_setup_check,
	    dsl_scan_setup_sync, dp->dp_scan, &func, 0));
}
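/*
 * Illustrative sketch (not part of the original file): dsl_scan() above is
 * the public entry point; a pool scrub is simply a scan with func set to
 * POOL_SCAN_SCRUB.  The invented wrapper below shows the start/cancel pair:
 */
#if 0	/* example only */
static int
example_scrub(dsl_pool_t *dp, boolean_t stop)
{
	if (stop)
		return (dsl_scan_cancel(dp));	/* ENOENT if nothing to stop */
	return (dsl_scan(dp, POOL_SCAN_SCRUB));	/* EBUSY if already scanning */
}
#endif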