FreeBSD ZFS
The Zettabyte File System
|
00001 /* 00002 * CDDL HEADER START 00003 * 00004 * The contents of this file are subject to the terms of the 00005 * Common Development and Distribution License (the "License"). 00006 * You may not use this file except in compliance with the License. 00007 * 00008 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 00009 * or http://www.opensolaris.org/os/licensing. 00010 * See the License for the specific language governing permissions 00011 * and limitations under the License. 00012 * 00013 * When distributing Covered Code, include this CDDL HEADER in each 00014 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 00015 * If applicable, add the following below this CDDL HEADER, with the 00016 * fields enclosed by brackets "[]" replaced with your own identifying 00017 * information: Portions Copyright [yyyy] [name of copyright owner] 00018 * 00019 * CDDL HEADER END 00020 */ 00021 /* 00022 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 00023 * Use is subject to license terms. 00024 */ 00025 00026 /* 00027 * Copyright (c) 2012 by Delphix. All rights reserved. 00028 */ 00029 00030 #include <sys/zfs_context.h> 00031 #include <sys/spa.h> 00032 #include <sys/vdev_impl.h> 00033 #include <sys/zio.h> 00034 #include <sys/fs/zfs.h> 00035 00041 typedef struct mirror_child { 00042 vdev_t *mc_vd; 00043 uint64_t mc_offset; 00044 int mc_error; 00045 uint8_t mc_tried; 00046 uint8_t mc_skipped; 00047 uint8_t mc_speculative; 00048 } mirror_child_t; 00049 00050 typedef struct mirror_map { 00051 int mm_children; 00052 int mm_replacing; 00053 int mm_preferred; 00054 int mm_root; 00055 mirror_child_t mm_child[1]; 00056 } mirror_map_t; 00057 00058 int vdev_mirror_shift = 21; 00059 00060 static void 00061 vdev_mirror_map_free(zio_t *zio) 00062 { 00063 mirror_map_t *mm = zio->io_vsd; 00064 00065 kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); 00066 } 00067 00068 static const zio_vsd_ops_t vdev_mirror_vsd_ops = { 00069 vdev_mirror_map_free, 00070 zio_vsd_default_cksum_report 00071 }; 00072 00073 static mirror_map_t * 00074 vdev_mirror_map_alloc(zio_t *zio) 00075 { 00076 mirror_map_t *mm = NULL; 00077 mirror_child_t *mc; 00078 vdev_t *vd = zio->io_vd; 00079 int c, d; 00080 00081 if (vd == NULL) { 00082 dva_t *dva = zio->io_bp->blk_dva; 00083 spa_t *spa = zio->io_spa; 00084 00085 c = BP_GET_NDVAS(zio->io_bp); 00086 00087 mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); 00088 mm->mm_children = c; 00089 mm->mm_replacing = B_FALSE; 00090 mm->mm_preferred = spa_get_random(c); 00091 mm->mm_root = B_TRUE; 00092 00093 /* 00094 * Check the other, lower-index DVAs to see if they're on 00095 * the same vdev as the child we picked. If they are, use 00096 * them since they are likely to have been allocated from 00097 * the primary metaslab in use at the time, and hence are 00098 * more likely to have locality with single-copy data. 00099 */ 00100 for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { 00101 if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) 00102 mm->mm_preferred = d; 00103 } 00104 00105 for (c = 0; c < mm->mm_children; c++) { 00106 mc = &mm->mm_child[c]; 00107 00108 mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); 00109 mc->mc_offset = DVA_GET_OFFSET(&dva[c]); 00110 } 00111 } else { 00112 c = vd->vdev_children; 00113 00114 mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); 00115 mm->mm_children = c; 00116 mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || 00117 vd->vdev_ops == &vdev_spare_ops); 00118 mm->mm_preferred = mm->mm_replacing ? 0 : 00119 (zio->io_offset >> vdev_mirror_shift) % c; 00120 mm->mm_root = B_FALSE; 00121 00122 for (c = 0; c < mm->mm_children; c++) { 00123 mc = &mm->mm_child[c]; 00124 mc->mc_vd = vd->vdev_child[c]; 00125 mc->mc_offset = zio->io_offset; 00126 } 00127 } 00128 00129 zio->io_vsd = mm; 00130 zio->io_vsd_ops = &vdev_mirror_vsd_ops; 00131 return (mm); 00132 } 00133 00134 static int 00135 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 00136 uint64_t *ashift) 00137 { 00138 int numerrors = 0; 00139 int lasterror = 0; 00140 00141 if (vd->vdev_children == 0) { 00142 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 00143 return (EINVAL); 00144 } 00145 00146 vdev_open_children(vd); 00147 00148 for (int c = 0; c < vd->vdev_children; c++) { 00149 vdev_t *cvd = vd->vdev_child[c]; 00150 00151 if (cvd->vdev_open_error) { 00152 lasterror = cvd->vdev_open_error; 00153 numerrors++; 00154 continue; 00155 } 00156 00157 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 00158 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 00159 *ashift = MAX(*ashift, cvd->vdev_ashift); 00160 } 00161 00162 if (numerrors == vd->vdev_children) { 00163 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 00164 return (lasterror); 00165 } 00166 00167 return (0); 00168 } 00169 00170 static void 00171 vdev_mirror_close(vdev_t *vd) 00172 { 00173 for (int c = 0; c < vd->vdev_children; c++) 00174 vdev_close(vd->vdev_child[c]); 00175 } 00176 00177 static void 00178 vdev_mirror_child_done(zio_t *zio) 00179 { 00180 mirror_child_t *mc = zio->io_private; 00181 00182 mc->mc_error = zio->io_error; 00183 mc->mc_tried = 1; 00184 mc->mc_skipped = 0; 00185 } 00186 00187 static void 00188 vdev_mirror_scrub_done(zio_t *zio) 00189 { 00190 mirror_child_t *mc = zio->io_private; 00191 00192 if (zio->io_error == 0) { 00193 zio_t *pio; 00194 00195 mutex_enter(&zio->io_lock); 00196 while ((pio = zio_walk_parents(zio)) != NULL) { 00197 mutex_enter(&pio->io_lock); 00198 ASSERT3U(zio->io_size, >=, pio->io_size); 00199 bcopy(zio->io_data, pio->io_data, pio->io_size); 00200 mutex_exit(&pio->io_lock); 00201 } 00202 mutex_exit(&zio->io_lock); 00203 } 00204 00205 zio_buf_free(zio->io_data, zio->io_size); 00206 00207 mc->mc_error = zio->io_error; 00208 mc->mc_tried = 1; 00209 mc->mc_skipped = 0; 00210 } 00211 00216 static int 00217 vdev_mirror_child_select(zio_t *zio) 00218 { 00219 mirror_map_t *mm = zio->io_vsd; 00220 mirror_child_t *mc; 00221 uint64_t txg = zio->io_txg; 00222 int i, c; 00223 00224 ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); 00225 00226 /* 00227 * Try to find a child whose DTL doesn't contain the block to read. 00228 * If a child is known to be completely inaccessible (indicated by 00229 * vdev_readable() returning B_FALSE), don't even try. 00230 */ 00231 for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { 00232 if (c >= mm->mm_children) 00233 c = 0; 00234 mc = &mm->mm_child[c]; 00235 if (mc->mc_tried || mc->mc_skipped) 00236 continue; 00237 if (!vdev_readable(mc->mc_vd)) { 00238 mc->mc_error = ENXIO; 00239 mc->mc_tried = 1; /* don't even try */ 00240 mc->mc_skipped = 1; 00241 continue; 00242 } 00243 if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) 00244 return (c); 00245 mc->mc_error = ESTALE; 00246 mc->mc_skipped = 1; 00247 mc->mc_speculative = 1; 00248 } 00249 00250 /* 00251 * Every device is either missing or has this txg in its DTL. 00252 * Look for any child we haven't already tried before giving up. 00253 */ 00254 for (c = 0; c < mm->mm_children; c++) 00255 if (!mm->mm_child[c].mc_tried) 00256 return (c); 00257 00258 /* 00259 * Every child failed. There's no place left to look. 00260 */ 00261 return (-1); 00262 } 00263 00264 static int 00265 vdev_mirror_io_start(zio_t *zio) 00266 { 00267 mirror_map_t *mm; 00268 mirror_child_t *mc; 00269 int c, children; 00270 00271 mm = vdev_mirror_map_alloc(zio); 00272 00273 if (zio->io_type == ZIO_TYPE_READ) { 00274 if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { 00275 /* 00276 * For scrubbing reads we need to allocate a read 00277 * buffer for each child and issue reads to all 00278 * children. If any child succeeds, it will copy its 00279 * data into zio->io_data in vdev_mirror_scrub_done. 00280 */ 00281 for (c = 0; c < mm->mm_children; c++) { 00282 mc = &mm->mm_child[c]; 00283 zio_nowait(zio_vdev_child_io(zio, zio->io_bp, 00284 mc->mc_vd, mc->mc_offset, 00285 zio_buf_alloc(zio->io_size), zio->io_size, 00286 zio->io_type, zio->io_priority, 0, 00287 vdev_mirror_scrub_done, mc)); 00288 } 00289 return (ZIO_PIPELINE_CONTINUE); 00290 } 00291 /* 00292 * For normal reads just pick one child. 00293 */ 00294 c = vdev_mirror_child_select(zio); 00295 children = (c >= 0); 00296 } else { 00297 ASSERT(zio->io_type == ZIO_TYPE_WRITE || 00298 zio->io_type == ZIO_TYPE_FREE); 00299 00300 /* 00301 * Writes and frees go to all children. 00302 */ 00303 c = 0; 00304 children = mm->mm_children; 00305 } 00306 00307 while (children--) { 00308 mc = &mm->mm_child[c]; 00309 zio_nowait(zio_vdev_child_io(zio, zio->io_bp, 00310 mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, 00311 zio->io_type, zio->io_priority, 0, 00312 vdev_mirror_child_done, mc)); 00313 c++; 00314 } 00315 00316 return (ZIO_PIPELINE_CONTINUE); 00317 } 00318 00319 static int 00320 vdev_mirror_worst_error(mirror_map_t *mm) 00321 { 00322 int error[2] = { 0, 0 }; 00323 00324 for (int c = 0; c < mm->mm_children; c++) { 00325 mirror_child_t *mc = &mm->mm_child[c]; 00326 int s = mc->mc_speculative; 00327 error[s] = zio_worst_error(error[s], mc->mc_error); 00328 } 00329 00330 return (error[0] ? error[0] : error[1]); 00331 } 00332 00333 static void 00334 vdev_mirror_io_done(zio_t *zio) 00335 { 00336 mirror_map_t *mm = zio->io_vsd; 00337 mirror_child_t *mc; 00338 int c; 00339 int good_copies = 0; 00340 int unexpected_errors = 0; 00341 00342 for (c = 0; c < mm->mm_children; c++) { 00343 mc = &mm->mm_child[c]; 00344 00345 if (mc->mc_error) { 00346 if (!mc->mc_skipped) 00347 unexpected_errors++; 00348 } else if (mc->mc_tried) { 00349 good_copies++; 00350 } 00351 } 00352 00353 if (zio->io_type == ZIO_TYPE_WRITE) { 00354 /* 00355 * XXX -- for now, treat partial writes as success. 00356 * 00357 * Now that we support write reallocation, it would be better 00358 * to treat partial failure as real failure unless there are 00359 * no non-degraded top-level vdevs left, and not update DTLs 00360 * if we intend to reallocate. 00361 */ 00362 /* XXPOLICY */ 00363 if (good_copies != mm->mm_children) { 00364 /* 00365 * Always require at least one good copy. 00366 * 00367 * For ditto blocks (io_vd == NULL), require 00368 * all copies to be good. 00369 * 00370 * XXX -- for replacing vdevs, there's no great answer. 00371 * If the old device is really dead, we may not even 00372 * be able to access it -- so we only want to 00373 * require good writes to the new device. But if 00374 * the new device turns out to be flaky, we want 00375 * to be able to detach it -- which requires all 00376 * writes to the old device to have succeeded. 00377 */ 00378 if (good_copies == 0 || zio->io_vd == NULL) 00379 zio->io_error = vdev_mirror_worst_error(mm); 00380 } 00381 return; 00382 } else if (zio->io_type == ZIO_TYPE_FREE) { 00383 return; 00384 } 00385 00386 ASSERT(zio->io_type == ZIO_TYPE_READ); 00387 00388 /* 00389 * If we don't have a good copy yet, keep trying other children. 00390 */ 00391 /* XXPOLICY */ 00392 if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { 00393 ASSERT(c >= 0 && c < mm->mm_children); 00394 mc = &mm->mm_child[c]; 00395 zio_vdev_io_redone(zio); 00396 zio_nowait(zio_vdev_child_io(zio, zio->io_bp, 00397 mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, 00398 ZIO_TYPE_READ, zio->io_priority, 0, 00399 vdev_mirror_child_done, mc)); 00400 return; 00401 } 00402 00403 /* XXPOLICY */ 00404 if (good_copies == 0) { 00405 zio->io_error = vdev_mirror_worst_error(mm); 00406 ASSERT(zio->io_error != 0); 00407 } 00408 00409 if (good_copies && spa_writeable(zio->io_spa) && 00410 (unexpected_errors || 00411 (zio->io_flags & ZIO_FLAG_RESILVER) || 00412 ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { 00413 /* 00414 * Use the good data we have in hand to repair damaged children. 00415 */ 00416 for (c = 0; c < mm->mm_children; c++) { 00417 /* 00418 * Don't rewrite known good children. 00419 * Not only is it unnecessary, it could 00420 * actually be harmful: if the system lost 00421 * power while rewriting the only good copy, 00422 * there would be no good copies left! 00423 */ 00424 mc = &mm->mm_child[c]; 00425 00426 if (mc->mc_error == 0) { 00427 if (mc->mc_tried) 00428 continue; 00429 if (!(zio->io_flags & ZIO_FLAG_SCRUB) && 00430 !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, 00431 zio->io_txg, 1)) 00432 continue; 00433 mc->mc_error = ESTALE; 00434 } 00435 00436 zio_nowait(zio_vdev_child_io(zio, zio->io_bp, 00437 mc->mc_vd, mc->mc_offset, 00438 zio->io_data, zio->io_size, 00439 ZIO_TYPE_WRITE, zio->io_priority, 00440 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 00441 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 00442 } 00443 } 00444 } 00445 00446 static void 00447 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) 00448 { 00449 if (faulted == vd->vdev_children) 00450 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 00451 VDEV_AUX_NO_REPLICAS); 00452 else if (degraded + faulted != 0) 00453 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 00454 else 00455 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 00456 } 00457 00458 vdev_ops_t vdev_mirror_ops = { 00459 vdev_mirror_open, 00460 vdev_mirror_close, 00461 vdev_default_asize, 00462 vdev_mirror_io_start, 00463 vdev_mirror_io_done, 00464 vdev_mirror_state_change, 00465 NULL, 00466 NULL, 00467 VDEV_TYPE_MIRROR, /* name of this vdev type */ 00468 B_FALSE /* not a leaf vdev */ 00469 }; 00470 00471 vdev_ops_t vdev_replacing_ops = { 00472 vdev_mirror_open, 00473 vdev_mirror_close, 00474 vdev_default_asize, 00475 vdev_mirror_io_start, 00476 vdev_mirror_io_done, 00477 vdev_mirror_state_change, 00478 NULL, 00479 NULL, 00480 VDEV_TYPE_REPLACING, /* name of this vdev type */ 00481 B_FALSE /* not a leaf vdev */ 00482 }; 00483 00484 vdev_ops_t vdev_spare_ops = { 00485 vdev_mirror_open, 00486 vdev_mirror_close, 00487 vdev_default_asize, 00488 vdev_mirror_io_start, 00489 vdev_mirror_io_done, 00490 vdev_mirror_state_change, 00491 NULL, 00492 NULL, 00493 VDEV_TYPE_SPARE, /* name of this vdev type */ 00494 B_FALSE /* not a leaf vdev */ 00495 };