FreeBSD ZFS
The Zettabyte File System
|
00001 /* 00002 * CDDL HEADER START 00003 * 00004 * The contents of this file are subject to the terms of the 00005 * Common Development and Distribution License (the "License"). 00006 * You may not use this file except in compliance with the License. 00007 * 00008 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 00009 * or http://www.opensolaris.org/os/licensing. 00010 * See the License for the specific language governing permissions 00011 * and limitations under the License. 00012 * 00013 * When distributing Covered Code, include this CDDL HEADER in each 00014 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 00015 * If applicable, add the following below this CDDL HEADER, with the 00016 * fields enclosed by brackets "[]" replaced with your own identifying 00017 * information: Portions Copyright [yyyy] [name of copyright owner] 00018 * 00019 * CDDL HEADER END 00020 */ 00021 00022 /* 00023 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 00024 * Copyright (c) 2012 by Delphix. All rights reserved. 00025 */ 00026 00027 #include <sys/zfs_context.h> 00028 #include <sys/spa.h> 00029 #include <sys/spa_impl.h> 00030 #include <sys/zio.h> 00031 #include <sys/ddt.h> 00032 #include <sys/zap.h> 00033 #include <sys/dmu_tx.h> 00034 #include <sys/arc.h> 00035 #include <sys/dsl_pool.h> 00036 #include <sys/zio_checksum.h> 00037 #include <sys/zio_compress.h> 00038 #include <sys/dsl_scan.h> 00039 00045 int zfs_dedup_prefetch = 1; 00046 00047 SYSCTL_DECL(_vfs_zfs); 00048 SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP"); 00049 TUNABLE_INT("vfs.zfs.dedup.prefetch", &zfs_dedup_prefetch); 00050 SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RW, &zfs_dedup_prefetch, 00051 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed"); 00052 00053 static const ddt_ops_t *ddt_ops[DDT_TYPES] = { 00054 &ddt_zap_ops, 00055 }; 00056 00057 static const char *ddt_class_name[DDT_CLASSES] = { 00058 "ditto", 00059 "duplicate", 00060 "unique", 00061 }; 00062 00063 static void 00064 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 00065 dmu_tx_t *tx) 00066 { 00067 spa_t *spa = ddt->ddt_spa; 00068 objset_t *os = ddt->ddt_os; 00069 uint64_t *objectp = &ddt->ddt_object[type][class]; 00070 boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; 00071 char name[DDT_NAMELEN]; 00072 00073 ddt_object_name(ddt, type, class, name); 00074 00075 ASSERT(*objectp == 0); 00076 VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); 00077 ASSERT(*objectp != 0); 00078 00079 VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, 00080 sizeof (uint64_t), 1, objectp, tx) == 0); 00081 00082 VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, 00083 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 00084 &ddt->ddt_histogram[type][class], tx) == 0); 00085 } 00086 00087 static void 00088 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 00089 dmu_tx_t *tx) 00090 { 00091 spa_t *spa = ddt->ddt_spa; 00092 objset_t *os = ddt->ddt_os; 00093 uint64_t *objectp = &ddt->ddt_object[type][class]; 00094 char name[DDT_NAMELEN]; 00095 00096 ddt_object_name(ddt, type, class, name); 00097 00098 ASSERT(*objectp != 0); 00099 ASSERT(ddt_object_count(ddt, type, class) == 0); 00100 ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); 00101 VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); 00102 VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); 00103 VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); 00104 bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); 00105 00106 *objectp = 0; 00107 } 00108 00109 static int 00110 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 00111 { 00112 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; 00113 dmu_object_info_t doi; 00114 char name[DDT_NAMELEN]; 00115 int error; 00116 00117 ddt_object_name(ddt, type, class, name); 00118 00119 error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, 00120 sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); 00121 00122 if (error) 00123 return (error); 00124 00125 error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 00126 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 00127 &ddt->ddt_histogram[type][class]); 00128 00129 /* 00130 * Seed the cached statistics. 00131 */ 00132 VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); 00133 00134 ddo->ddo_count = ddt_object_count(ddt, type, class); 00135 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; 00136 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; 00137 00138 ASSERT(error == 0); 00139 return (error); 00140 } 00141 00142 static void 00143 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 00144 dmu_tx_t *tx) 00145 { 00146 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; 00147 dmu_object_info_t doi; 00148 char name[DDT_NAMELEN]; 00149 00150 ddt_object_name(ddt, type, class, name); 00151 00152 VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 00153 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 00154 &ddt->ddt_histogram[type][class], tx) == 0); 00155 00156 /* 00157 * Cache DDT statistics; this is the only time they'll change. 00158 */ 00159 VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); 00160 00161 ddo->ddo_count = ddt_object_count(ddt, type, class); 00162 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; 00163 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; 00164 } 00165 00166 static int 00167 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 00168 ddt_entry_t *dde) 00169 { 00170 if (!ddt_object_exists(ddt, type, class)) 00171 return (ENOENT); 00172 00173 return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, 00174 ddt->ddt_object[type][class], dde)); 00175 } 00176 00177 static void 00178 ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 00179 ddt_entry_t *dde) 00180 { 00181 if (!ddt_object_exists(ddt, type, class)) 00182 return; 00183 00184 ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, 00185 ddt->ddt_object[type][class], dde); 00186 } 00187 00188 int 00189 ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 00190 ddt_entry_t *dde, dmu_tx_t *tx) 00191 { 00192 ASSERT(ddt_object_exists(ddt, type, class)); 00193 00194 return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, 00195 ddt->ddt_object[type][class], dde, tx)); 00196 } 00197 00198 static int 00199 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 00200 ddt_entry_t *dde, dmu_tx_t *tx) 00201 { 00202 ASSERT(ddt_object_exists(ddt, type, class)); 00203 00204 return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, 00205 ddt->ddt_object[type][class], dde, tx)); 00206 } 00207 00208 int 00209 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 00210 uint64_t *walk, ddt_entry_t *dde) 00211 { 00212 ASSERT(ddt_object_exists(ddt, type, class)); 00213 00214 return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, 00215 ddt->ddt_object[type][class], dde, walk)); 00216 } 00217 00218 uint64_t 00219 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 00220 { 00221 ASSERT(ddt_object_exists(ddt, type, class)); 00222 00223 return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, 00224 ddt->ddt_object[type][class])); 00225 } 00226 00227 int 00228 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 00229 dmu_object_info_t *doi) 00230 { 00231 if (!ddt_object_exists(ddt, type, class)) 00232 return (ENOENT); 00233 00234 return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], 00235 doi)); 00236 } 00237 00238 boolean_t 00239 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 00240 { 00241 return (!!ddt->ddt_object[type][class]); 00242 } 00243 00244 void 00245 ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 00246 char *name) 00247 { 00248 (void) sprintf(name, DMU_POOL_DDT, 00249 zio_checksum_table[ddt->ddt_checksum].ci_name, 00250 ddt_ops[type]->ddt_op_name, ddt_class_name[class]); 00251 } 00252 00253 void 00254 ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) 00255 { 00256 ASSERT(txg != 0); 00257 00258 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 00259 bp->blk_dva[d] = ddp->ddp_dva[d]; 00260 BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); 00261 } 00262 00263 void 00264 ddt_bp_create(enum zio_checksum checksum, 00265 const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) 00266 { 00267 BP_ZERO(bp); 00268 00269 if (ddp != NULL) 00270 ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); 00271 00272 bp->blk_cksum = ddk->ddk_cksum; 00273 bp->blk_fill = 1; 00274 00275 BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); 00276 BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); 00277 BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); 00278 BP_SET_CHECKSUM(bp, checksum); 00279 BP_SET_TYPE(bp, DMU_OT_DEDUP); 00280 BP_SET_LEVEL(bp, 0); 00281 BP_SET_DEDUP(bp, 0); 00282 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 00283 } 00284 00285 void 00286 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) 00287 { 00288 ddk->ddk_cksum = bp->blk_cksum; 00289 ddk->ddk_prop = 0; 00290 00291 DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); 00292 DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); 00293 DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); 00294 } 00295 00296 void 00297 ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) 00298 { 00299 ASSERT(ddp->ddp_phys_birth == 0); 00300 00301 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 00302 ddp->ddp_dva[d] = bp->blk_dva[d]; 00303 ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); 00304 } 00305 00306 void 00307 ddt_phys_clear(ddt_phys_t *ddp) 00308 { 00309 bzero(ddp, sizeof (*ddp)); 00310 } 00311 00312 void 00313 ddt_phys_addref(ddt_phys_t *ddp) 00314 { 00315 ddp->ddp_refcnt++; 00316 } 00317 00318 void 00319 ddt_phys_decref(ddt_phys_t *ddp) 00320 { 00321 ASSERT((int64_t)ddp->ddp_refcnt > 0); 00322 ddp->ddp_refcnt--; 00323 } 00324 00325 void 00326 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) 00327 { 00328 blkptr_t blk; 00329 00330 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); 00331 ddt_phys_clear(ddp); 00332 zio_free(ddt->ddt_spa, txg, &blk); 00333 } 00334 00335 ddt_phys_t * 00336 ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) 00337 { 00338 ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; 00339 00340 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 00341 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && 00342 BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) 00343 return (ddp); 00344 } 00345 return (NULL); 00346 } 00347 00348 uint64_t 00349 ddt_phys_total_refcnt(const ddt_entry_t *dde) 00350 { 00351 uint64_t refcnt = 0; 00352 00353 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) 00354 refcnt += dde->dde_phys[p].ddp_refcnt; 00355 00356 return (refcnt); 00357 } 00358 00359 static void 00360 ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) 00361 { 00362 spa_t *spa = ddt->ddt_spa; 00363 ddt_phys_t *ddp = dde->dde_phys; 00364 ddt_key_t *ddk = &dde->dde_key; 00365 uint64_t lsize = DDK_GET_LSIZE(ddk); 00366 uint64_t psize = DDK_GET_PSIZE(ddk); 00367 00368 bzero(dds, sizeof (*dds)); 00369 00370 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 00371 uint64_t dsize = 0; 00372 uint64_t refcnt = ddp->ddp_refcnt; 00373 00374 if (ddp->ddp_phys_birth == 0) 00375 continue; 00376 00377 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 00378 dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); 00379 00380 dds->dds_blocks += 1; 00381 dds->dds_lsize += lsize; 00382 dds->dds_psize += psize; 00383 dds->dds_dsize += dsize; 00384 00385 dds->dds_ref_blocks += refcnt; 00386 dds->dds_ref_lsize += lsize * refcnt; 00387 dds->dds_ref_psize += psize * refcnt; 00388 dds->dds_ref_dsize += dsize * refcnt; 00389 } 00390 } 00391 00392 void 00393 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) 00394 { 00395 const uint64_t *s = (const uint64_t *)src; 00396 uint64_t *d = (uint64_t *)dst; 00397 uint64_t *d_end = (uint64_t *)(dst + 1); 00398 00399 ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ 00400 00401 while (d < d_end) 00402 *d++ += (*s++ ^ neg) - neg; 00403 } 00404 00405 static void 00406 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) 00407 { 00408 ddt_stat_t dds; 00409 ddt_histogram_t *ddh; 00410 int bucket; 00411 00412 ddt_stat_generate(ddt, dde, &dds); 00413 00414 bucket = highbit(dds.dds_ref_blocks) - 1; 00415 ASSERT(bucket >= 0); 00416 00417 ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; 00418 00419 ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); 00420 } 00421 00422 void 00423 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) 00424 { 00425 for (int h = 0; h < 64; h++) 00426 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); 00427 } 00428 00429 void 00430 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) 00431 { 00432 bzero(dds, sizeof (*dds)); 00433 00434 for (int h = 0; h < 64; h++) 00435 ddt_stat_add(dds, &ddh->ddh_stat[h], 0); 00436 } 00437 00438 boolean_t 00439 ddt_histogram_empty(const ddt_histogram_t *ddh) 00440 { 00441 const uint64_t *s = (const uint64_t *)ddh; 00442 const uint64_t *s_end = (const uint64_t *)(ddh + 1); 00443 00444 while (s < s_end) 00445 if (*s++ != 0) 00446 return (B_FALSE); 00447 00448 return (B_TRUE); 00449 } 00450 00451 void 00452 ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) 00453 { 00454 /* Sum the statistics we cached in ddt_object_sync(). */ 00455 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 00456 ddt_t *ddt = spa->spa_ddt[c]; 00457 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 00458 for (enum ddt_class class = 0; class < DDT_CLASSES; 00459 class++) { 00460 ddt_object_t *ddo = 00461 &ddt->ddt_object_stats[type][class]; 00462 ddo_total->ddo_count += ddo->ddo_count; 00463 ddo_total->ddo_dspace += ddo->ddo_dspace; 00464 ddo_total->ddo_mspace += ddo->ddo_mspace; 00465 } 00466 } 00467 } 00468 00469 /* ... and compute the averages. */ 00470 if (ddo_total->ddo_count != 0) { 00471 ddo_total->ddo_dspace /= ddo_total->ddo_count; 00472 ddo_total->ddo_mspace /= ddo_total->ddo_count; 00473 } 00474 } 00475 00476 void 00477 ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) 00478 { 00479 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 00480 ddt_t *ddt = spa->spa_ddt[c]; 00481 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 00482 for (enum ddt_class class = 0; class < DDT_CLASSES; 00483 class++) { 00484 ddt_histogram_add(ddh, 00485 &ddt->ddt_histogram_cache[type][class]); 00486 } 00487 } 00488 } 00489 } 00490 00491 void 00492 ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) 00493 { 00494 ddt_histogram_t *ddh_total; 00495 00496 ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); 00497 ddt_get_dedup_histogram(spa, ddh_total); 00498 ddt_histogram_stat(dds_total, ddh_total); 00499 kmem_free(ddh_total, sizeof (ddt_histogram_t)); 00500 } 00501 00502 uint64_t 00503 ddt_get_dedup_dspace(spa_t *spa) 00504 { 00505 ddt_stat_t dds_total = { 0 }; 00506 00507 ddt_get_dedup_stats(spa, &dds_total); 00508 return (dds_total.dds_ref_dsize - dds_total.dds_dsize); 00509 } 00510 00511 uint64_t 00512 ddt_get_pool_dedup_ratio(spa_t *spa) 00513 { 00514 ddt_stat_t dds_total = { 0 }; 00515 00516 ddt_get_dedup_stats(spa, &dds_total); 00517 if (dds_total.dds_dsize == 0) 00518 return (100); 00519 00520 return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); 00521 } 00522 00523 int 00524 ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) 00525 { 00526 spa_t *spa = ddt->ddt_spa; 00527 uint64_t total_refcnt = 0; 00528 uint64_t ditto = spa->spa_dedup_ditto; 00529 int total_copies = 0; 00530 int desired_copies = 0; 00531 00532 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 00533 ddt_phys_t *ddp = &dde->dde_phys[p]; 00534 zio_t *zio = dde->dde_lead_zio[p]; 00535 uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ 00536 if (zio != NULL) 00537 refcnt += zio->io_parent_count; /* pending refs */ 00538 if (ddp == ddp_willref) 00539 refcnt++; /* caller's ref */ 00540 if (refcnt != 0) { 00541 total_refcnt += refcnt; 00542 total_copies += p; 00543 } 00544 } 00545 00546 if (ditto == 0 || ditto > UINT32_MAX) 00547 ditto = UINT32_MAX; 00548 00549 if (total_refcnt >= 1) 00550 desired_copies++; 00551 if (total_refcnt >= ditto) 00552 desired_copies++; 00553 if (total_refcnt >= ditto * ditto) 00554 desired_copies++; 00555 00556 return (MAX(desired_copies, total_copies) - total_copies); 00557 } 00558 00559 int 00560 ddt_ditto_copies_present(ddt_entry_t *dde) 00561 { 00562 ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; 00563 dva_t *dva = ddp->ddp_dva; 00564 int copies = 0 - DVA_GET_GANG(dva); 00565 00566 for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) 00567 if (DVA_IS_VALID(dva)) 00568 copies++; 00569 00570 ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); 00571 00572 return (copies); 00573 } 00574 00575 size_t 00576 ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) 00577 { 00578 uchar_t *version = dst++; 00579 int cpfunc = ZIO_COMPRESS_ZLE; 00580 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 00581 size_t c_len; 00582 00583 ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ 00584 00585 c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); 00586 00587 if (c_len == s_len) { 00588 cpfunc = ZIO_COMPRESS_OFF; 00589 bcopy(src, dst, s_len); 00590 } 00591 00592 *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; 00593 00594 return (c_len + 1); 00595 } 00596 00597 void 00598 ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) 00599 { 00600 uchar_t version = *src++; 00601 int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; 00602 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 00603 00604 if (ci->ci_decompress != NULL) 00605 (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); 00606 else 00607 bcopy(src, dst, d_len); 00608 00609 if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) 00610 byteswap_uint64_array(dst, d_len); 00611 } 00612 00613 ddt_t * 00614 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) 00615 { 00616 return (spa->spa_ddt[c]); 00617 } 00618 00619 ddt_t * 00620 ddt_select(spa_t *spa, const blkptr_t *bp) 00621 { 00622 return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); 00623 } 00624 00625 void 00626 ddt_enter(ddt_t *ddt) 00627 { 00628 mutex_enter(&ddt->ddt_lock); 00629 } 00630 00631 void 00632 ddt_exit(ddt_t *ddt) 00633 { 00634 mutex_exit(&ddt->ddt_lock); 00635 } 00636 00637 static ddt_entry_t * 00638 ddt_alloc(const ddt_key_t *ddk) 00639 { 00640 ddt_entry_t *dde; 00641 00642 dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); 00643 cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); 00644 00645 dde->dde_key = *ddk; 00646 00647 return (dde); 00648 } 00649 00650 static void 00651 ddt_free(ddt_entry_t *dde) 00652 { 00653 ASSERT(!dde->dde_loading); 00654 00655 for (int p = 0; p < DDT_PHYS_TYPES; p++) 00656 ASSERT(dde->dde_lead_zio[p] == NULL); 00657 00658 if (dde->dde_repair_data != NULL) 00659 zio_buf_free(dde->dde_repair_data, 00660 DDK_GET_PSIZE(&dde->dde_key)); 00661 00662 cv_destroy(&dde->dde_cv); 00663 kmem_free(dde, sizeof (*dde)); 00664 } 00665 00666 void 00667 ddt_remove(ddt_t *ddt, ddt_entry_t *dde) 00668 { 00669 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 00670 00671 avl_remove(&ddt->ddt_tree, dde); 00672 ddt_free(dde); 00673 } 00674 00675 ddt_entry_t * 00676 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) 00677 { 00678 ddt_entry_t *dde, dde_search; 00679 enum ddt_type type; 00680 enum ddt_class class; 00681 avl_index_t where; 00682 int error; 00683 00684 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 00685 00686 ddt_key_fill(&dde_search.dde_key, bp); 00687 00688 dde = avl_find(&ddt->ddt_tree, &dde_search, &where); 00689 if (dde == NULL) { 00690 if (!add) 00691 return (NULL); 00692 dde = ddt_alloc(&dde_search.dde_key); 00693 avl_insert(&ddt->ddt_tree, dde, where); 00694 } 00695 00696 while (dde->dde_loading) 00697 cv_wait(&dde->dde_cv, &ddt->ddt_lock); 00698 00699 if (dde->dde_loaded) 00700 return (dde); 00701 00702 dde->dde_loading = B_TRUE; 00703 00704 ddt_exit(ddt); 00705 00706 error = ENOENT; 00707 00708 for (type = 0; type < DDT_TYPES; type++) { 00709 for (class = 0; class < DDT_CLASSES; class++) { 00710 error = ddt_object_lookup(ddt, type, class, dde); 00711 if (error != ENOENT) 00712 break; 00713 } 00714 if (error != ENOENT) 00715 break; 00716 } 00717 00718 ASSERT(error == 0 || error == ENOENT); 00719 00720 ddt_enter(ddt); 00721 00722 ASSERT(dde->dde_loaded == B_FALSE); 00723 ASSERT(dde->dde_loading == B_TRUE); 00724 00725 dde->dde_type = type; /* will be DDT_TYPES if no entry found */ 00726 dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ 00727 dde->dde_loaded = B_TRUE; 00728 dde->dde_loading = B_FALSE; 00729 00730 if (error == 0) 00731 ddt_stat_update(ddt, dde, -1ULL); 00732 00733 cv_broadcast(&dde->dde_cv); 00734 00735 return (dde); 00736 } 00737 00738 void 00739 ddt_prefetch(spa_t *spa, const blkptr_t *bp) 00740 { 00741 ddt_t *ddt; 00742 ddt_entry_t dde; 00743 00744 if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) 00745 return; 00746 00747 /* 00748 * We only remove the DDT once all tables are empty and only 00749 * prefetch dedup blocks when there are entries in the DDT. 00750 * Thus no locking is required as the DDT can't disappear on us. 00751 */ 00752 ddt = ddt_select(spa, bp); 00753 ddt_key_fill(&dde.dde_key, bp); 00754 00755 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 00756 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 00757 ddt_object_prefetch(ddt, type, class, &dde); 00758 } 00759 } 00760 } 00761 00762 int 00763 ddt_entry_compare(const void *x1, const void *x2) 00764 { 00765 const ddt_entry_t *dde1 = x1; 00766 const ddt_entry_t *dde2 = x2; 00767 const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; 00768 const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; 00769 00770 for (int i = 0; i < DDT_KEY_WORDS; i++) { 00771 if (u1[i] < u2[i]) 00772 return (-1); 00773 if (u1[i] > u2[i]) 00774 return (1); 00775 } 00776 00777 return (0); 00778 } 00779 00780 static ddt_t * 00781 ddt_table_alloc(spa_t *spa, enum zio_checksum c) 00782 { 00783 ddt_t *ddt; 00784 00785 ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); 00786 00787 mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); 00788 avl_create(&ddt->ddt_tree, ddt_entry_compare, 00789 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 00790 avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, 00791 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 00792 ddt->ddt_checksum = c; 00793 ddt->ddt_spa = spa; 00794 ddt->ddt_os = spa->spa_meta_objset; 00795 00796 return (ddt); 00797 } 00798 00799 static void 00800 ddt_table_free(ddt_t *ddt) 00801 { 00802 ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); 00803 ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); 00804 avl_destroy(&ddt->ddt_tree); 00805 avl_destroy(&ddt->ddt_repair_tree); 00806 mutex_destroy(&ddt->ddt_lock); 00807 kmem_free(ddt, sizeof (*ddt)); 00808 } 00809 00810 void 00811 ddt_create(spa_t *spa) 00812 { 00813 spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; 00814 00815 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) 00816 spa->spa_ddt[c] = ddt_table_alloc(spa, c); 00817 } 00818 00819 int 00820 ddt_load(spa_t *spa) 00821 { 00822 int error; 00823 00824 ddt_create(spa); 00825 00826 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 00827 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, 00828 &spa->spa_ddt_stat_object); 00829 00830 if (error) 00831 return (error == ENOENT ? 0 : error); 00832 00833 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 00834 ddt_t *ddt = spa->spa_ddt[c]; 00835 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 00836 for (enum ddt_class class = 0; class < DDT_CLASSES; 00837 class++) { 00838 error = ddt_object_load(ddt, type, class); 00839 if (error != 0 && error != ENOENT) 00840 return (error); 00841 } 00842 } 00843 00844 /* 00845 * Seed the cached histograms. 00846 */ 00847 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, 00848 sizeof (ddt->ddt_histogram)); 00849 } 00850 00851 return (0); 00852 } 00853 00854 void 00855 ddt_unload(spa_t *spa) 00856 { 00857 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 00858 if (spa->spa_ddt[c]) { 00859 ddt_table_free(spa->spa_ddt[c]); 00860 spa->spa_ddt[c] = NULL; 00861 } 00862 } 00863 } 00864 00865 boolean_t 00866 ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) 00867 { 00868 ddt_t *ddt; 00869 ddt_entry_t dde; 00870 00871 if (!BP_GET_DEDUP(bp)) 00872 return (B_FALSE); 00873 00874 if (max_class == DDT_CLASS_UNIQUE) 00875 return (B_TRUE); 00876 00877 ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; 00878 00879 ddt_key_fill(&dde.dde_key, bp); 00880 00881 for (enum ddt_type type = 0; type < DDT_TYPES; type++) 00882 for (enum ddt_class class = 0; class <= max_class; class++) 00883 if (ddt_object_lookup(ddt, type, class, &dde) == 0) 00884 return (B_TRUE); 00885 00886 return (B_FALSE); 00887 } 00888 00889 ddt_entry_t * 00890 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) 00891 { 00892 ddt_key_t ddk; 00893 ddt_entry_t *dde; 00894 00895 ddt_key_fill(&ddk, bp); 00896 00897 dde = ddt_alloc(&ddk); 00898 00899 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 00900 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 00901 /* 00902 * We can only do repair if there are multiple copies 00903 * of the block. For anything in the UNIQUE class, 00904 * there's definitely only one copy, so don't even try. 00905 */ 00906 if (class != DDT_CLASS_UNIQUE && 00907 ddt_object_lookup(ddt, type, class, dde) == 0) 00908 return (dde); 00909 } 00910 } 00911 00912 bzero(dde->dde_phys, sizeof (dde->dde_phys)); 00913 00914 return (dde); 00915 } 00916 00917 void 00918 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) 00919 { 00920 avl_index_t where; 00921 00922 ddt_enter(ddt); 00923 00924 if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && 00925 avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) 00926 avl_insert(&ddt->ddt_repair_tree, dde, where); 00927 else 00928 ddt_free(dde); 00929 00930 ddt_exit(ddt); 00931 } 00932 00933 static void 00934 ddt_repair_entry_done(zio_t *zio) 00935 { 00936 ddt_entry_t *rdde = zio->io_private; 00937 00938 ddt_free(rdde); 00939 } 00940 00941 static void 00942 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) 00943 { 00944 ddt_phys_t *ddp = dde->dde_phys; 00945 ddt_phys_t *rddp = rdde->dde_phys; 00946 ddt_key_t *ddk = &dde->dde_key; 00947 ddt_key_t *rddk = &rdde->dde_key; 00948 zio_t *zio; 00949 blkptr_t blk; 00950 00951 zio = zio_null(rio, rio->io_spa, NULL, 00952 ddt_repair_entry_done, rdde, rio->io_flags); 00953 00954 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { 00955 if (ddp->ddp_phys_birth == 0 || 00956 ddp->ddp_phys_birth != rddp->ddp_phys_birth || 00957 bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) 00958 continue; 00959 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); 00960 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, 00961 rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, 00962 ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); 00963 } 00964 00965 zio_nowait(zio); 00966 } 00967 00968 static void 00969 ddt_repair_table(ddt_t *ddt, zio_t *rio) 00970 { 00971 spa_t *spa = ddt->ddt_spa; 00972 ddt_entry_t *dde, *rdde_next, *rdde; 00973 avl_tree_t *t = &ddt->ddt_repair_tree; 00974 blkptr_t blk; 00975 00976 if (spa_sync_pass(spa) > 1) 00977 return; 00978 00979 ddt_enter(ddt); 00980 for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { 00981 rdde_next = AVL_NEXT(t, rdde); 00982 avl_remove(&ddt->ddt_repair_tree, rdde); 00983 ddt_exit(ddt); 00984 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); 00985 dde = ddt_repair_start(ddt, &blk); 00986 ddt_repair_entry(ddt, dde, rdde, rio); 00987 ddt_repair_done(ddt, dde); 00988 ddt_enter(ddt); 00989 } 00990 ddt_exit(ddt); 00991 } 00992 00993 static void 00994 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) 00995 { 00996 dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; 00997 ddt_phys_t *ddp = dde->dde_phys; 00998 ddt_key_t *ddk = &dde->dde_key; 00999 enum ddt_type otype = dde->dde_type; 01000 enum ddt_type ntype = DDT_TYPE_CURRENT; 01001 enum ddt_class oclass = dde->dde_class; 01002 enum ddt_class nclass; 01003 uint64_t total_refcnt = 0; 01004 01005 ASSERT(dde->dde_loaded); 01006 ASSERT(!dde->dde_loading); 01007 01008 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 01009 ASSERT(dde->dde_lead_zio[p] == NULL); 01010 ASSERT((int64_t)ddp->ddp_refcnt >= 0); 01011 if (ddp->ddp_phys_birth == 0) { 01012 ASSERT(ddp->ddp_refcnt == 0); 01013 continue; 01014 } 01015 if (p == DDT_PHYS_DITTO) { 01016 if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) 01017 ddt_phys_free(ddt, ddk, ddp, txg); 01018 continue; 01019 } 01020 if (ddp->ddp_refcnt == 0) 01021 ddt_phys_free(ddt, ddk, ddp, txg); 01022 total_refcnt += ddp->ddp_refcnt; 01023 } 01024 01025 if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) 01026 nclass = DDT_CLASS_DITTO; 01027 else if (total_refcnt > 1) 01028 nclass = DDT_CLASS_DUPLICATE; 01029 else 01030 nclass = DDT_CLASS_UNIQUE; 01031 01032 if (otype != DDT_TYPES && 01033 (otype != ntype || oclass != nclass || total_refcnt == 0)) { 01034 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); 01035 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); 01036 } 01037 01038 if (total_refcnt != 0) { 01039 dde->dde_type = ntype; 01040 dde->dde_class = nclass; 01041 ddt_stat_update(ddt, dde, 0); 01042 if (!ddt_object_exists(ddt, ntype, nclass)) 01043 ddt_object_create(ddt, ntype, nclass, tx); 01044 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); 01045 01046 /* 01047 * If the class changes, the order that we scan this bp 01048 * changes. If it decreases, we could miss it, so 01049 * scan it right now. (This covers both class changing 01050 * while we are doing ddt_walk(), and when we are 01051 * traversing.) 01052 */ 01053 if (nclass < oclass) { 01054 dsl_scan_ddt_entry(dp->dp_scan, 01055 ddt->ddt_checksum, dde, tx); 01056 } 01057 } 01058 } 01059 01060 static void 01061 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) 01062 { 01063 spa_t *spa = ddt->ddt_spa; 01064 ddt_entry_t *dde; 01065 void *cookie = NULL; 01066 01067 if (avl_numnodes(&ddt->ddt_tree) == 0) 01068 return; 01069 01070 ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); 01071 01072 if (spa->spa_ddt_stat_object == 0) { 01073 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, 01074 DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, 01075 DMU_POOL_DDT_STATS, tx); 01076 } 01077 01078 while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { 01079 ddt_sync_entry(ddt, dde, tx, txg); 01080 ddt_free(dde); 01081 } 01082 01083 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 01084 uint64_t count = 0; 01085 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 01086 if (ddt_object_exists(ddt, type, class)) { 01087 ddt_object_sync(ddt, type, class, tx); 01088 count += ddt_object_count(ddt, type, class); 01089 } 01090 } 01091 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 01092 if (count == 0 && ddt_object_exists(ddt, type, class)) 01093 ddt_object_destroy(ddt, type, class, tx); 01094 } 01095 } 01096 01097 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, 01098 sizeof (ddt->ddt_histogram)); 01099 } 01100 01101 void 01102 ddt_sync(spa_t *spa, uint64_t txg) 01103 { 01104 dmu_tx_t *tx; 01105 zio_t *rio = zio_root(spa, NULL, NULL, 01106 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 01107 01108 ASSERT(spa_syncing_txg(spa) == txg); 01109 01110 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 01111 01112 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 01113 ddt_t *ddt = spa->spa_ddt[c]; 01114 if (ddt == NULL) 01115 continue; 01116 ddt_sync_table(ddt, tx, txg); 01117 ddt_repair_table(ddt, rio); 01118 } 01119 01120 (void) zio_wait(rio); 01121 01122 dmu_tx_commit(tx); 01123 } 01124 01125 int 01126 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) 01127 { 01128 do { 01129 do { 01130 do { 01131 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; 01132 int error = ENOENT; 01133 if (ddt_object_exists(ddt, ddb->ddb_type, 01134 ddb->ddb_class)) { 01135 error = ddt_object_walk(ddt, 01136 ddb->ddb_type, ddb->ddb_class, 01137 &ddb->ddb_cursor, dde); 01138 } 01139 dde->dde_type = ddb->ddb_type; 01140 dde->dde_class = ddb->ddb_class; 01141 if (error == 0) 01142 return (0); 01143 if (error != ENOENT) 01144 return (error); 01145 ddb->ddb_cursor = 0; 01146 } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); 01147 ddb->ddb_checksum = 0; 01148 } while (++ddb->ddb_type < DDT_TYPES); 01149 ddb->ddb_type = 0; 01150 } while (++ddb->ddb_class < DDT_CLASSES); 01151 01152 return (ENOENT); 01153 }