FreeBSD ZFS
The Zettabyte File System

vdev_mirror.c

Go to the documentation of this file.
00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 /*
00022  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
00023  * Use is subject to license terms.
00024  */
00025 
00026 /*
00027  * Copyright (c) 2012 by Delphix. All rights reserved.
00028  */
00029 
00030 #include <sys/zfs_context.h>
00031 #include <sys/spa.h>
00032 #include <sys/vdev_impl.h>
00033 #include <sys/zio.h>
00034 #include <sys/fs/zfs.h>
00035 
00041 typedef struct mirror_child {
00042         vdev_t          *mc_vd;
00043         uint64_t        mc_offset;
00044         int             mc_error;
00045         uint8_t         mc_tried;
00046         uint8_t         mc_skipped;
00047         uint8_t         mc_speculative;
00048 } mirror_child_t;
00049 
00050 typedef struct mirror_map {
00051         int             mm_children;
00052         int             mm_replacing;
00053         int             mm_preferred;
00054         int             mm_root;
00055         mirror_child_t  mm_child[1];
00056 } mirror_map_t;
00057 
/*
 * Shift applied to the I/O offset when choosing the preferred child for a
 * read on a mirror vdev: the preferred index is
 * (io_offset >> vdev_mirror_shift) % children.  With the default of 21,
 * every 2^21-byte (2 MiB) region of the vdev maps to a single preferred
 * child.
 */
int vdev_mirror_shift = 21;
00059 
00060 static void
00061 vdev_mirror_map_free(zio_t *zio)
00062 {
00063         mirror_map_t *mm = zio->io_vsd;
00064 
00065         kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
00066 }
00067 
00068 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
00069         vdev_mirror_map_free,
00070         zio_vsd_default_cksum_report
00071 };
00072 
00073 static mirror_map_t *
00074 vdev_mirror_map_alloc(zio_t *zio)
00075 {
00076         mirror_map_t *mm = NULL;
00077         mirror_child_t *mc;
00078         vdev_t *vd = zio->io_vd;
00079         int c, d;
00080 
00081         if (vd == NULL) {
00082                 dva_t *dva = zio->io_bp->blk_dva;
00083                 spa_t *spa = zio->io_spa;
00084 
00085                 c = BP_GET_NDVAS(zio->io_bp);
00086 
00087                 mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
00088                 mm->mm_children = c;
00089                 mm->mm_replacing = B_FALSE;
00090                 mm->mm_preferred = spa_get_random(c);
00091                 mm->mm_root = B_TRUE;
00092 
00093                 /*
00094                  * Check the other, lower-index DVAs to see if they're on
00095                  * the same vdev as the child we picked.  If they are, use
00096                  * them since they are likely to have been allocated from
00097                  * the primary metaslab in use at the time, and hence are
00098                  * more likely to have locality with single-copy data.
00099                  */
00100                 for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
00101                         if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
00102                                 mm->mm_preferred = d;
00103                 }
00104 
00105                 for (c = 0; c < mm->mm_children; c++) {
00106                         mc = &mm->mm_child[c];
00107 
00108                         mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
00109                         mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
00110                 }
00111         } else {
00112                 c = vd->vdev_children;
00113 
00114                 mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
00115                 mm->mm_children = c;
00116                 mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
00117                     vd->vdev_ops == &vdev_spare_ops);
00118                 mm->mm_preferred = mm->mm_replacing ? 0 :
00119                     (zio->io_offset >> vdev_mirror_shift) % c;
00120                 mm->mm_root = B_FALSE;
00121 
00122                 for (c = 0; c < mm->mm_children; c++) {
00123                         mc = &mm->mm_child[c];
00124                         mc->mc_vd = vd->vdev_child[c];
00125                         mc->mc_offset = zio->io_offset;
00126                 }
00127         }
00128 
00129         zio->io_vsd = mm;
00130         zio->io_vsd_ops = &vdev_mirror_vsd_ops;
00131         return (mm);
00132 }
00133 
00134 static int
00135 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
00136     uint64_t *ashift)
00137 {
00138         int numerrors = 0;
00139         int lasterror = 0;
00140 
00141         if (vd->vdev_children == 0) {
00142                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
00143                 return (EINVAL);
00144         }
00145 
00146         vdev_open_children(vd);
00147 
00148         for (int c = 0; c < vd->vdev_children; c++) {
00149                 vdev_t *cvd = vd->vdev_child[c];
00150 
00151                 if (cvd->vdev_open_error) {
00152                         lasterror = cvd->vdev_open_error;
00153                         numerrors++;
00154                         continue;
00155                 }
00156 
00157                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
00158                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
00159                 *ashift = MAX(*ashift, cvd->vdev_ashift);
00160         }
00161 
00162         if (numerrors == vd->vdev_children) {
00163                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
00164                 return (lasterror);
00165         }
00166 
00167         return (0);
00168 }
00169 
00170 static void
00171 vdev_mirror_close(vdev_t *vd)
00172 {
00173         for (int c = 0; c < vd->vdev_children; c++)
00174                 vdev_close(vd->vdev_child[c]);
00175 }
00176 
00177 static void
00178 vdev_mirror_child_done(zio_t *zio)
00179 {
00180         mirror_child_t *mc = zio->io_private;
00181 
00182         mc->mc_error = zio->io_error;
00183         mc->mc_tried = 1;
00184         mc->mc_skipped = 0;
00185 }
00186 
00187 static void
00188 vdev_mirror_scrub_done(zio_t *zio)
00189 {
00190         mirror_child_t *mc = zio->io_private;
00191 
00192         if (zio->io_error == 0) {
00193                 zio_t *pio;
00194 
00195                 mutex_enter(&zio->io_lock);
00196                 while ((pio = zio_walk_parents(zio)) != NULL) {
00197                         mutex_enter(&pio->io_lock);
00198                         ASSERT3U(zio->io_size, >=, pio->io_size);
00199                         bcopy(zio->io_data, pio->io_data, pio->io_size);
00200                         mutex_exit(&pio->io_lock);
00201                 }
00202                 mutex_exit(&zio->io_lock);
00203         }
00204 
00205         zio_buf_free(zio->io_data, zio->io_size);
00206 
00207         mc->mc_error = zio->io_error;
00208         mc->mc_tried = 1;
00209         mc->mc_skipped = 0;
00210 }
00211 
00216 static int
00217 vdev_mirror_child_select(zio_t *zio)
00218 {
00219         mirror_map_t *mm = zio->io_vsd;
00220         mirror_child_t *mc;
00221         uint64_t txg = zio->io_txg;
00222         int i, c;
00223 
00224         ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
00225 
00226         /*
00227          * Try to find a child whose DTL doesn't contain the block to read.
00228          * If a child is known to be completely inaccessible (indicated by
00229          * vdev_readable() returning B_FALSE), don't even try.
00230          */
00231         for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
00232                 if (c >= mm->mm_children)
00233                         c = 0;
00234                 mc = &mm->mm_child[c];
00235                 if (mc->mc_tried || mc->mc_skipped)
00236                         continue;
00237                 if (!vdev_readable(mc->mc_vd)) {
00238                         mc->mc_error = ENXIO;
00239                         mc->mc_tried = 1;       /* don't even try */
00240                         mc->mc_skipped = 1;
00241                         continue;
00242                 }
00243                 if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
00244                         return (c);
00245                 mc->mc_error = ESTALE;
00246                 mc->mc_skipped = 1;
00247                 mc->mc_speculative = 1;
00248         }
00249 
00250         /*
00251          * Every device is either missing or has this txg in its DTL.
00252          * Look for any child we haven't already tried before giving up.
00253          */
00254         for (c = 0; c < mm->mm_children; c++)
00255                 if (!mm->mm_child[c].mc_tried)
00256                         return (c);
00257 
00258         /*
00259          * Every child failed.  There's no place left to look.
00260          */
00261         return (-1);
00262 }
00263 
00264 static int
00265 vdev_mirror_io_start(zio_t *zio)
00266 {
00267         mirror_map_t *mm;
00268         mirror_child_t *mc;
00269         int c, children;
00270 
00271         mm = vdev_mirror_map_alloc(zio);
00272 
00273         if (zio->io_type == ZIO_TYPE_READ) {
00274                 if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
00275                         /*
00276                          * For scrubbing reads we need to allocate a read
00277                          * buffer for each child and issue reads to all
00278                          * children.  If any child succeeds, it will copy its
00279                          * data into zio->io_data in vdev_mirror_scrub_done.
00280                          */
00281                         for (c = 0; c < mm->mm_children; c++) {
00282                                 mc = &mm->mm_child[c];
00283                                 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
00284                                     mc->mc_vd, mc->mc_offset,
00285                                     zio_buf_alloc(zio->io_size), zio->io_size,
00286                                     zio->io_type, zio->io_priority, 0,
00287                                     vdev_mirror_scrub_done, mc));
00288                         }
00289                         return (ZIO_PIPELINE_CONTINUE);
00290                 }
00291                 /*
00292                  * For normal reads just pick one child.
00293                  */
00294                 c = vdev_mirror_child_select(zio);
00295                 children = (c >= 0);
00296         } else {
00297                 ASSERT(zio->io_type == ZIO_TYPE_WRITE ||
00298                     zio->io_type == ZIO_TYPE_FREE);
00299 
00300                 /*
00301                  * Writes and frees go to all children.
00302                  */
00303                 c = 0;
00304                 children = mm->mm_children;
00305         }
00306 
00307         while (children--) {
00308                 mc = &mm->mm_child[c];
00309                 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
00310                     mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
00311                     zio->io_type, zio->io_priority, 0,
00312                     vdev_mirror_child_done, mc));
00313                 c++;
00314         }
00315 
00316         return (ZIO_PIPELINE_CONTINUE);
00317 }
00318 
00319 static int
00320 vdev_mirror_worst_error(mirror_map_t *mm)
00321 {
00322         int error[2] = { 0, 0 };
00323 
00324         for (int c = 0; c < mm->mm_children; c++) {
00325                 mirror_child_t *mc = &mm->mm_child[c];
00326                 int s = mc->mc_speculative;
00327                 error[s] = zio_worst_error(error[s], mc->mc_error);
00328         }
00329 
00330         return (error[0] ? error[0] : error[1]);
00331 }
00332 
00333 static void
00334 vdev_mirror_io_done(zio_t *zio)
00335 {
00336         mirror_map_t *mm = zio->io_vsd;
00337         mirror_child_t *mc;
00338         int c;
00339         int good_copies = 0;
00340         int unexpected_errors = 0;
00341 
00342         for (c = 0; c < mm->mm_children; c++) {
00343                 mc = &mm->mm_child[c];
00344 
00345                 if (mc->mc_error) {
00346                         if (!mc->mc_skipped)
00347                                 unexpected_errors++;
00348                 } else if (mc->mc_tried) {
00349                         good_copies++;
00350                 }
00351         }
00352 
00353         if (zio->io_type == ZIO_TYPE_WRITE) {
00354                 /*
00355                  * XXX -- for now, treat partial writes as success.
00356                  *
00357                  * Now that we support write reallocation, it would be better
00358                  * to treat partial failure as real failure unless there are
00359                  * no non-degraded top-level vdevs left, and not update DTLs
00360                  * if we intend to reallocate.
00361                  */
00362                 /* XXPOLICY */
00363                 if (good_copies != mm->mm_children) {
00364                         /*
00365                          * Always require at least one good copy.
00366                          *
00367                          * For ditto blocks (io_vd == NULL), require
00368                          * all copies to be good.
00369                          *
00370                          * XXX -- for replacing vdevs, there's no great answer.
00371                          * If the old device is really dead, we may not even
00372                          * be able to access it -- so we only want to
00373                          * require good writes to the new device.  But if
00374                          * the new device turns out to be flaky, we want
00375                          * to be able to detach it -- which requires all
00376                          * writes to the old device to have succeeded.
00377                          */
00378                         if (good_copies == 0 || zio->io_vd == NULL)
00379                                 zio->io_error = vdev_mirror_worst_error(mm);
00380                 }
00381                 return;
00382         } else if (zio->io_type == ZIO_TYPE_FREE) {
00383                 return;
00384         }
00385 
00386         ASSERT(zio->io_type == ZIO_TYPE_READ);
00387 
00388         /*
00389          * If we don't have a good copy yet, keep trying other children.
00390          */
00391         /* XXPOLICY */
00392         if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
00393                 ASSERT(c >= 0 && c < mm->mm_children);
00394                 mc = &mm->mm_child[c];
00395                 zio_vdev_io_redone(zio);
00396                 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
00397                     mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
00398                     ZIO_TYPE_READ, zio->io_priority, 0,
00399                     vdev_mirror_child_done, mc));
00400                 return;
00401         }
00402 
00403         /* XXPOLICY */
00404         if (good_copies == 0) {
00405                 zio->io_error = vdev_mirror_worst_error(mm);
00406                 ASSERT(zio->io_error != 0);
00407         }
00408 
00409         if (good_copies && spa_writeable(zio->io_spa) &&
00410             (unexpected_errors ||
00411             (zio->io_flags & ZIO_FLAG_RESILVER) ||
00412             ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
00413                 /*
00414                  * Use the good data we have in hand to repair damaged children.
00415                  */
00416                 for (c = 0; c < mm->mm_children; c++) {
00417                         /*
00418                          * Don't rewrite known good children.
00419                          * Not only is it unnecessary, it could
00420                          * actually be harmful: if the system lost
00421                          * power while rewriting the only good copy,
00422                          * there would be no good copies left!
00423                          */
00424                         mc = &mm->mm_child[c];
00425 
00426                         if (mc->mc_error == 0) {
00427                                 if (mc->mc_tried)
00428                                         continue;
00429                                 if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
00430                                     !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
00431                                     zio->io_txg, 1))
00432                                         continue;
00433                                 mc->mc_error = ESTALE;
00434                         }
00435 
00436                         zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
00437                             mc->mc_vd, mc->mc_offset,
00438                             zio->io_data, zio->io_size,
00439                             ZIO_TYPE_WRITE, zio->io_priority,
00440                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
00441                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
00442                 }
00443         }
00444 }
00445 
00446 static void
00447 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
00448 {
00449         if (faulted == vd->vdev_children)
00450                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
00451                     VDEV_AUX_NO_REPLICAS);
00452         else if (degraded + faulted != 0)
00453                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
00454         else
00455                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
00456 }
00457 
00458 vdev_ops_t vdev_mirror_ops = {
00459         vdev_mirror_open,
00460         vdev_mirror_close,
00461         vdev_default_asize,
00462         vdev_mirror_io_start,
00463         vdev_mirror_io_done,
00464         vdev_mirror_state_change,
00465         NULL,
00466         NULL,
00467         VDEV_TYPE_MIRROR,       /* name of this vdev type */
00468         B_FALSE                 /* not a leaf vdev */
00469 };
00470 
00471 vdev_ops_t vdev_replacing_ops = {
00472         vdev_mirror_open,
00473         vdev_mirror_close,
00474         vdev_default_asize,
00475         vdev_mirror_io_start,
00476         vdev_mirror_io_done,
00477         vdev_mirror_state_change,
00478         NULL,
00479         NULL,
00480         VDEV_TYPE_REPLACING,    /* name of this vdev type */
00481         B_FALSE                 /* not a leaf vdev */
00482 };
00483 
00484 vdev_ops_t vdev_spare_ops = {
00485         vdev_mirror_open,
00486         vdev_mirror_close,
00487         vdev_default_asize,
00488         vdev_mirror_io_start,
00489         vdev_mirror_io_done,
00490         vdev_mirror_state_change,
00491         NULL,
00492         NULL,
00493         VDEV_TYPE_SPARE,        /* name of this vdev type */
00494         B_FALSE                 /* not a leaf vdev */
00495 };
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines