FreeBSD ZFS
The Zettabyte File System

vdev_raidz.c

Go to the documentation of this file.
00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 
00022 /*
00023  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
00024  * Copyright (c) 2012 by Delphix. All rights reserved.
00025  */
00026 
00027 #include <sys/zfs_context.h>
00028 #include <sys/spa.h>
00029 #include <sys/vdev_impl.h>
00030 #include <sys/zio.h>
00031 #include <sys/zio_checksum.h>
00032 #include <sys/fs/zfs.h>
00033 #include <sys/fm/fs/zfs.h>
00034 
/*
 * Per-column state of a RAID-Z map. One entry exists for every column of
 * raidz_map_t.rm_col[]; parity columns come first, data columns follow.
 */
00108 typedef struct raidz_col {
00109         uint64_t rc_devidx;             /* device (child) index this column maps to */
00110         uint64_t rc_offset;             /* byte offset of this column on that device */
00111         uint64_t rc_size;               /* size of this column in bytes (0 for skipped columns) */
00112         void *rc_data;                  /* buffer holding this column's data */
00113         void *rc_gdata;                 /* parity regenerated from known-good data (checksum reports) */
00114         int rc_error;                   /* initialised to 0; presumably the I/O error for this column */
00115         uint8_t rc_tried;               /* initialised to 0; presumably set once I/O was attempted */
00116         uint8_t rc_skipped;             /* initialised to 0; presumably set when I/O was skipped */
00117 } raidz_col_t;
00118 
/*
 * Layout of one logical I/O across the columns of a RAID-Z vdev. The
 * structure is allocated with room for rm_scols entries in rm_col[], so
 * its size must always be computed with offsetof(raidz_map_t, rm_col[n]).
 */
00119 typedef struct raidz_map {
00120         uint64_t rm_cols;               /* number of columns that carry data or parity */
00121         uint64_t rm_scols;              /* number of rm_col[] entries allocated (>= rm_cols) */
00122         uint64_t rm_bigcols;            /* number of leading columns holding one extra sector */
00123         uint64_t rm_asize;              /* total size in bytes, rounded up to (nparity + 1) sectors */
00124         uint64_t rm_missingdata;        /* initialised to 0; presumably a count of bad data columns */
00125         uint64_t rm_missingparity;      /* initialised to 0; presumably a count of bad parity columns */
00126         uint64_t rm_firstdatacol;       /* index of first data column (== number of parity columns) */
00127         uint64_t rm_nskip;              /* number of padding sectors added by the round-up */
00128         uint64_t rm_skipstart;          /* column index at which the padding sectors start */
00129         void *rm_datacopy;              /* private copy of the data columns made for checksum reports */
00130         uintptr_t rm_reports;           /* number of outstanding checksum reports referencing this map */
00131         uint8_t rm_freed;               /* set once the owning zio has released the map */
00132         uint8_t rm_ecksuminjected;      /* initialised to 0; presumably marks an injected checksum error */
00133         raidz_col_t rm_col[1];          /* variable-length array of rm_scols column descriptors */
00134 } raidz_map_t;
00135 
/* Indices of the P, Q and R parity columns within raidz_map_t.rm_col[]. */
00136 #define VDEV_RAIDZ_P            0
00137 #define VDEV_RAIDZ_Q            1
00138 #define VDEV_RAIDZ_R            2

/*
 * Multiply a single byte by 2 (resp. 4) in GF(2^8): shift left and, when the
 * high bit was set, reduce by XOR-ing in 0x1d. The result is not masked to
 * eight bits, so callers must store it in (or cast it to) a uint8_t.
 * The argument is evaluated more than once.
 */
00140 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
00141 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
00142 
/*
 * Multiply each of the eight bytes packed into the 64-bit lvalue x by 2 in
 * GF(2^8), in place. This is the word-wide equivalent of applying
 * VDEV_RAIDZ_MUL_2() to every byte and truncating each result to 8 bits.
 *
 * mask is a caller-supplied 64-bit scratch lvalue. It is first loaded with
 * the high bit of every byte of x, then expanded so that each byte whose
 * high bit was set becomes 0xff; that selects which bytes receive the 0x1d
 * reduction term.
 *
 * Both arguments are evaluated several times, so they must be free of side
 * effects. The body is wrapped in do { } while (0) so that an invocation
 * followed by a semicolon forms exactly one statement and can safely be
 * used as the body of an unbraced if/else.
 */
#define VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

/*
 * Multiply each byte packed into x by 4 in GF(2^8) by doubling twice.
 * Same argument requirements as VDEV_RAIDZ_64MUL_2().
 */
#define VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
00162 
/*
 * Global knob, zero by default. It is not referenced in this part of the
 * file; NOTE(review): by its name it presumably forces reconstruction to
 * use the general-purpose method -- confirm against the users further down.
 */
00166 int vdev_raidz_default_to_general;
00167 
/*
 * Power table: vdev_raidz_pow2[i] is 2 raised to the i-th power in GF(2^8)
 * using the same 0x1d reduction as VDEV_RAIDZ_MUL_2(). The sequence has
 * period 255, so the final entry (index 255) wraps around to 0x01 again.
 */
00169 static const uint8_t vdev_raidz_pow2[256] = {
00170         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
00171         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
00172         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
00173         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
00174         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
00175         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
00176         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
00177         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
00178         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
00179         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
00180         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
00181         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
00182         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
00183         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
00184         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
00185         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
00186         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
00187         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
00188         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
00189         0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
00190         0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
00191         0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
00192         0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
00193         0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
00194         0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
00195         0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
00196         0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
00197         0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
00198         0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
00199         0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
00200         0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
00201         0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
00202 };
/*
 * Logarithm table, the inverse of vdev_raidz_pow2[]: for a non-zero byte v,
 * vdev_raidz_log2[v] is the exponent e in 0..254 with vdev_raidz_pow2[e] == v.
 * Zero has no logarithm; entry 0 is only a placeholder (0x00) and callers
 * must treat a zero operand specially, as vdev_raidz_exp2() does.
 */
00204 static const uint8_t vdev_raidz_log2[256] = {
00205         0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
00206         0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
00207         0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
00208         0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
00209         0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
00210         0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
00211         0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
00212         0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
00213         0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
00214         0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
00215         0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
00216         0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
00217         0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
00218         0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
00219         0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
00220         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
00221         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
00222         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
00223         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
00224         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
00225         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
00226         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
00227         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
00228         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
00229         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
00230         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
00231         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
00232         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
00233         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
00234         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
00235         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
00236         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
00237 };
00238 
/*
 * Forward declaration: vdev_raidz_cksum_finish() calls the parity generator
 * before its definition appears further down in this file.
 */
00239 static void vdev_raidz_generate_parity(raidz_map_t *rm);
00240 
00244 static uint8_t
00245 vdev_raidz_exp2(uint_t a, int exp)
00246 {
00247         if (a == 0)
00248                 return (0);
00249 
00250         ASSERT(exp >= 0);
00251         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
00252 
00253         exp += vdev_raidz_log2[a];
00254         if (exp > 255)
00255                 exp -= 255;
00256 
00257         return (vdev_raidz_pow2[exp]);
00258 }
00259 
00260 static void
00261 vdev_raidz_map_free(raidz_map_t *rm)
00262 {
00263         int c;
00264         size_t size;
00265 
00266         for (c = 0; c < rm->rm_firstdatacol; c++) {
00267                 if (rm->rm_col[c].rc_data != NULL)
00268                         zio_buf_free(rm->rm_col[c].rc_data,
00269                             rm->rm_col[c].rc_size);
00270 
00271                 if (rm->rm_col[c].rc_gdata != NULL)
00272                         zio_buf_free(rm->rm_col[c].rc_gdata,
00273                             rm->rm_col[c].rc_size);
00274         }
00275 
00276         size = 0;
00277         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
00278                 size += rm->rm_col[c].rc_size;
00279 
00280         if (rm->rm_datacopy != NULL)
00281                 zio_buf_free(rm->rm_datacopy, size);
00282 
00283         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
00284 }
00285 
00286 static void
00287 vdev_raidz_map_free_vsd(zio_t *zio)
00288 {
00289         raidz_map_t *rm = zio->io_vsd;
00290 
00291         ASSERT0(rm->rm_freed);
00292         rm->rm_freed = 1;
00293 
00294         if (rm->rm_reports == 0)
00295                 vdev_raidz_map_free(rm);
00296 }
00297 
00298 /*ARGSUSED*/
00299 static void
00300 vdev_raidz_cksum_free(void *arg, size_t ignored)
00301 {
00302         raidz_map_t *rm = arg;
00303 
00304         ASSERT3U(rm->rm_reports, >, 0);
00305 
00306         if (--rm->rm_reports == 0 && rm->rm_freed != 0)
00307                 vdev_raidz_map_free(rm);
00308 }
00309 
00310 static void
00311 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
00312 {
00313         raidz_map_t *rm = zcr->zcr_cbdata;
00314         size_t c = zcr->zcr_cbinfo;
00315         size_t x;
00316 
00317         const char *good = NULL;
00318         const char *bad = rm->rm_col[c].rc_data;
00319 
00320         if (good_data == NULL) {
00321                 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
00322                 return;
00323         }
00324 
00325         if (c < rm->rm_firstdatacol) {
00326                 /*
00327                  * The first time through, calculate the parity blocks for
00328                  * the good data (this relies on the fact that the good
00329                  * data never changes for a given logical ZIO)
00330                  */
00331                 if (rm->rm_col[0].rc_gdata == NULL) {
00332                         char *bad_parity[VDEV_RAIDZ_MAXPARITY];
00333                         char *buf;
00334 
00335                         /*
00336                          * Set up the rm_col[]s to generate the parity for
00337                          * good_data, first saving the parity bufs and
00338                          * replacing them with buffers to hold the result.
00339                          */
00340                         for (x = 0; x < rm->rm_firstdatacol; x++) {
00341                                 bad_parity[x] = rm->rm_col[x].rc_data;
00342                                 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
00343                                     zio_buf_alloc(rm->rm_col[x].rc_size);
00344                         }
00345 
00346                         /* fill in the data columns from good_data */
00347                         buf = (char *)good_data;
00348                         for (; x < rm->rm_cols; x++) {
00349                                 rm->rm_col[x].rc_data = buf;
00350                                 buf += rm->rm_col[x].rc_size;
00351                         }
00352 
00353                         /*
00354                          * Construct the parity from the good data.
00355                          */
00356                         vdev_raidz_generate_parity(rm);
00357 
00358                         /* restore everything back to its original state */
00359                         for (x = 0; x < rm->rm_firstdatacol; x++)
00360                                 rm->rm_col[x].rc_data = bad_parity[x];
00361 
00362                         buf = rm->rm_datacopy;
00363                         for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
00364                                 rm->rm_col[x].rc_data = buf;
00365                                 buf += rm->rm_col[x].rc_size;
00366                         }
00367                 }
00368 
00369                 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
00370                 good = rm->rm_col[c].rc_gdata;
00371         } else {
00372                 /* adjust good_data to point at the start of our column */
00373                 good = good_data;
00374 
00375                 for (x = rm->rm_firstdatacol; x < c; x++)
00376                         good += rm->rm_col[x].rc_size;
00377         }
00378 
00379         /* we drop the ereport if it ends up that the data was good */
00380         zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
00381 }
00382 
00389 static void
00390 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
00391 {
00392         size_t c = (size_t)(uintptr_t)arg;
00393         caddr_t buf;
00394 
00395         raidz_map_t *rm = zio->io_vsd;
00396         size_t size;
00397 
00398         /* set up the report and bump the refcount  */
00399         zcr->zcr_cbdata = rm;
00400         zcr->zcr_cbinfo = c;
00401         zcr->zcr_finish = vdev_raidz_cksum_finish;
00402         zcr->zcr_free = vdev_raidz_cksum_free;
00403 
00404         rm->rm_reports++;
00405         ASSERT3U(rm->rm_reports, >, 0);
00406 
00407         if (rm->rm_datacopy != NULL)
00408                 return;
00409 
00410         /*
00411          * It's the first time we're called for this raidz_map_t, so we need
00412          * to copy the data aside; there's no guarantee that our zio's buffer
00413          * won't be re-used for something else.
00414          *
00415          * Our parity data is already in separate buffers, so there's no need
00416          * to copy them.
00417          */
00418 
00419         size = 0;
00420         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
00421                 size += rm->rm_col[c].rc_size;
00422 
00423         buf = rm->rm_datacopy = zio_buf_alloc(size);
00424 
00425         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
00426                 raidz_col_t *col = &rm->rm_col[c];
00427 
00428                 bcopy(col->rc_data, buf, col->rc_size);
00429                 col->rc_data = buf;
00430 
00431                 buf += col->rc_size;
00432         }
00433         ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
00434 }
00435 
/*
 * Callback table installed on a zio together with its raidz map (see
 * vdev_raidz_map_alloc(), which stores it in zio->io_vsd_ops).
 * NOTE(review): the member order of zio_vsd_ops_t is not visible here; the
 * entries are presumably the "free" hook followed by the "checksum report"
 * hook -- confirm against the type's declaration.
 */
00436 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
00437         vdev_raidz_map_free_vsd,
00438         vdev_raidz_cksum_report
00439 };
00440 
00446 static raidz_map_t *
00447 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
00448     uint64_t nparity)
00449 {
00450         raidz_map_t *rm;
00451         uint64_t b = zio->io_offset >> unit_shift;
00452         /* The zio's size in units of the vdev's preferred sector size */
00453         uint64_t s = zio->io_size >> unit_shift;
00454         uint64_t f = b % dcols;
00455         uint64_t o = (b / dcols) << unit_shift;
00456         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
00457 
00458         q = s / (dcols - nparity);
00459         r = s - q * (dcols - nparity);
00460         bc = (r == 0 ? 0 : r + nparity);
00461         tot = s + nparity * (q + (r == 0 ? 0 : 1));
00462 
00463         if (q == 0) {
00464                 acols = bc;
00465                 scols = MIN(dcols, roundup(bc, nparity + 1));
00466         } else {
00467                 acols = dcols;
00468                 scols = dcols;
00469         }
00470 
00471         ASSERT3U(acols, <=, scols);
00472 
00473         rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
00474 
00475         rm->rm_cols = acols;
00476         rm->rm_scols = scols;
00477         rm->rm_bigcols = bc;
00478         rm->rm_skipstart = bc;
00479         rm->rm_missingdata = 0;
00480         rm->rm_missingparity = 0;
00481         rm->rm_firstdatacol = nparity;
00482         rm->rm_datacopy = NULL;
00483         rm->rm_reports = 0;
00484         rm->rm_freed = 0;
00485         rm->rm_ecksuminjected = 0;
00486 
00487         asize = 0;
00488 
00489         for (c = 0; c < scols; c++) {
00490                 col = f + c;
00491                 coff = o;
00492                 if (col >= dcols) {
00493                         col -= dcols;
00494                         coff += 1ULL << unit_shift;
00495                 }
00496                 rm->rm_col[c].rc_devidx = col;
00497                 rm->rm_col[c].rc_offset = coff;
00498                 rm->rm_col[c].rc_data = NULL;
00499                 rm->rm_col[c].rc_gdata = NULL;
00500                 rm->rm_col[c].rc_error = 0;
00501                 rm->rm_col[c].rc_tried = 0;
00502                 rm->rm_col[c].rc_skipped = 0;
00503 
00504                 if (c >= acols)
00505                         rm->rm_col[c].rc_size = 0;
00506                 else if (c < bc)
00507                         rm->rm_col[c].rc_size = (q + 1) << unit_shift;
00508                 else
00509                         rm->rm_col[c].rc_size = q << unit_shift;
00510 
00511                 asize += rm->rm_col[c].rc_size;
00512         }
00513 
00514         ASSERT3U(asize, ==, tot << unit_shift);
00515         rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
00516         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
00517         ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
00518         ASSERT3U(rm->rm_nskip, <=, nparity);
00519 
00520         if (zio->io_type != ZIO_TYPE_FREE) {
00521                 for (c = 0; c < rm->rm_firstdatacol; c++) {
00522                         rm->rm_col[c].rc_data =
00523                             zio_buf_alloc(rm->rm_col[c].rc_size);
00524                 }
00525 
00526                 rm->rm_col[c].rc_data = zio->io_data;
00527 
00528                 for (c = c + 1; c < acols; c++) {
00529                         rm->rm_col[c].rc_data =
00530                             (char *)rm->rm_col[c - 1].rc_data +
00531                             rm->rm_col[c - 1].rc_size;
00532                 }
00533         }
00534 
00535         /*
00536          * If all data stored spans all columns, there's a danger that parity
00537          * will always be on the same device and, since parity isn't read
00538          * during normal operation, that that device's I/O bandwidth won't be
00539          * used effectively. We therefore switch the parity every 1MB.
00540          *
00541          * ... at least that was, ostensibly, the theory. As a practical
00542          * matter unless we juggle the parity between all devices evenly, we
00543          * won't see any benefit. Further, occasional writes that aren't a
00544          * multiple of the LCM of the number of children and the minimum
00545          * stripe width are sufficient to avoid pessimal behavior.
00546          * Unfortunately, this decision created an implicit on-disk format
00547          * requirement that we need to support for all eternity, but only
00548          * for single-parity RAID-Z.
00549          *
00550          * If we intend to skip a sector in the zeroth column for padding
00551          * we must make sure to note this swap. We will never intend to
00552          * skip the first column since at least one data and one parity
00553          * column must appear in each row.
00554          */
00555         ASSERT(rm->rm_cols >= 2);
00556         ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
00557 
00558         if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
00559                 devidx = rm->rm_col[0].rc_devidx;
00560                 o = rm->rm_col[0].rc_offset;
00561                 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
00562                 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
00563                 rm->rm_col[1].rc_devidx = devidx;
00564                 rm->rm_col[1].rc_offset = o;
00565 
00566                 if (rm->rm_skipstart == 0)
00567                         rm->rm_skipstart = 1;
00568         }
00569 
00570         zio->io_vsd = rm;
00571         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
00572         return (rm);
00573 }
00574 
00575 static void
00576 vdev_raidz_generate_parity_p(raidz_map_t *rm)
00577 {
00578         uint64_t *p, *src, pcount, ccount, i;
00579         int c;
00580 
00581         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
00582 
00583         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
00584                 src = rm->rm_col[c].rc_data;
00585                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
00586                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
00587 
00588                 if (c == rm->rm_firstdatacol) {
00589                         ASSERT(ccount == pcount);
00590                         for (i = 0; i < ccount; i++, src++, p++) {
00591                                 *p = *src;
00592                         }
00593                 } else {
00594                         ASSERT(ccount <= pcount);
00595                         for (i = 0; i < ccount; i++, src++, p++) {
00596                                 *p ^= *src;
00597                         }
00598                 }
00599         }
00600 }
00601 
00602 static void
00603 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
00604 {
00605         uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
00606         int c;
00607 
00608         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
00609         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
00610             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
00611 
00612         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
00613                 src = rm->rm_col[c].rc_data;
00614                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
00615                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
00616 
00617                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
00618 
00619                 if (c == rm->rm_firstdatacol) {
00620                         ASSERT(ccnt == pcnt || ccnt == 0);
00621                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
00622                                 *p = *src;
00623                                 *q = *src;
00624                         }
00625                         for (; i < pcnt; i++, src++, p++, q++) {
00626                                 *p = 0;
00627                                 *q = 0;
00628                         }
00629                 } else {
00630                         ASSERT(ccnt <= pcnt);
00631 
00632                         /*
00633                          * Apply the algorithm described above by multiplying
00634                          * the previous result and adding in the new value.
00635                          */
00636                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
00637                                 *p ^= *src;
00638 
00639                                 VDEV_RAIDZ_64MUL_2(*q, mask);
00640                                 *q ^= *src;
00641                         }
00642 
00643                         /*
00644                          * Treat short columns as though they are full of 0s.
00645                          * Note that there's therefore nothing needed for P.
00646                          */
00647                         for (; i < pcnt; i++, q++) {
00648                                 VDEV_RAIDZ_64MUL_2(*q, mask);
00649                         }
00650                 }
00651         }
00652 }
00653 
00654 static void
00655 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
00656 {
00657         uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
00658         int c;
00659 
00660         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
00661         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
00662             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
00663         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
00664             rm->rm_col[VDEV_RAIDZ_R].rc_size);
00665 
00666         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
00667                 src = rm->rm_col[c].rc_data;
00668                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
00669                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
00670                 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
00671 
00672                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
00673 
00674                 if (c == rm->rm_firstdatacol) {
00675                         ASSERT(ccnt == pcnt || ccnt == 0);
00676                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
00677                                 *p = *src;
00678                                 *q = *src;
00679                                 *r = *src;
00680                         }
00681                         for (; i < pcnt; i++, src++, p++, q++, r++) {
00682                                 *p = 0;
00683                                 *q = 0;
00684                                 *r = 0;
00685                         }
00686                 } else {
00687                         ASSERT(ccnt <= pcnt);
00688 
00689                         /*
00690                          * Apply the algorithm described above by multiplying
00691                          * the previous result and adding in the new value.
00692                          */
00693                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
00694                                 *p ^= *src;
00695 
00696                                 VDEV_RAIDZ_64MUL_2(*q, mask);
00697                                 *q ^= *src;
00698 
00699                                 VDEV_RAIDZ_64MUL_4(*r, mask);
00700                                 *r ^= *src;
00701                         }
00702 
00703                         /*
00704                          * Treat short columns as though they are full of 0s.
00705                          * Note that there's therefore nothing needed for P.
00706                          */
00707                         for (; i < pcnt; i++, q++, r++) {
00708                                 VDEV_RAIDZ_64MUL_2(*q, mask);
00709                                 VDEV_RAIDZ_64MUL_4(*r, mask);
00710                         }
00711                 }
00712         }
00713 }
00714 
00719 static void
00720 vdev_raidz_generate_parity(raidz_map_t *rm)
00721 {
00722         switch (rm->rm_firstdatacol) {
00723         case 1:
00724                 vdev_raidz_generate_parity_p(rm);
00725                 break;
00726         case 2:
00727                 vdev_raidz_generate_parity_pq(rm);
00728                 break;
00729         case 3:
00730                 vdev_raidz_generate_parity_pqr(rm);
00731                 break;
00732         default:
00733                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
00734         }
00735 }
00736 
00737 static int
00738 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
00739 {
00740         uint64_t *dst, *src, xcount, ccount, count, i;
00741         int x = tgts[0];
00742         int c;
00743 
00744         ASSERT(ntgts == 1);
00745         ASSERT(x >= rm->rm_firstdatacol);
00746         ASSERT(x < rm->rm_cols);
00747 
00748         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
00749         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
00750         ASSERT(xcount > 0);
00751 
00752         src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
00753         dst = rm->rm_col[x].rc_data;
00754         for (i = 0; i < xcount; i++, dst++, src++) {
00755                 *dst = *src;
00756         }
00757 
00758         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
00759                 src = rm->rm_col[c].rc_data;
00760                 dst = rm->rm_col[x].rc_data;
00761 
00762                 if (c == x)
00763                         continue;
00764 
00765                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
00766                 count = MIN(ccount, xcount);
00767 
00768                 for (i = 0; i < count; i++, dst++, src++) {
00769                         *dst ^= *src;
00770                 }
00771         }
00772 
00773         return (1 << VDEV_RAIDZ_P);
00774 }
00775 
00776 static int
00777 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
00778 {
00779         uint64_t *dst, *src, xcount, ccount, count, mask, i;
00780         uint8_t *b;
00781         int x = tgts[0];
00782         int c, j, exp;
00783 
00784         ASSERT(ntgts == 1);
00785 
00786         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
00787         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
00788 
00789         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
00790                 src = rm->rm_col[c].rc_data;
00791                 dst = rm->rm_col[x].rc_data;
00792 
00793                 if (c == x)
00794                         ccount = 0;
00795                 else
00796                         ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
00797 
00798                 count = MIN(ccount, xcount);
00799 
00800                 if (c == rm->rm_firstdatacol) {
00801                         for (i = 0; i < count; i++, dst++, src++) {
00802                                 *dst = *src;
00803                         }
00804                         for (; i < xcount; i++, dst++) {
00805                                 *dst = 0;
00806                         }
00807 
00808                 } else {
00809                         for (i = 0; i < count; i++, dst++, src++) {
00810                                 VDEV_RAIDZ_64MUL_2(*dst, mask);
00811                                 *dst ^= *src;
00812                         }
00813 
00814                         for (; i < xcount; i++, dst++) {
00815                                 VDEV_RAIDZ_64MUL_2(*dst, mask);
00816                         }
00817                 }
00818         }
00819 
00820         src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
00821         dst = rm->rm_col[x].rc_data;
00822         exp = 255 - (rm->rm_cols - 1 - x);
00823 
00824         for (i = 0; i < xcount; i++, dst++, src++) {
00825                 *dst ^= *src;
00826                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
00827                         *b = vdev_raidz_exp2(*b, exp);
00828                 }
00829         }
00830 
00831         return (1 << VDEV_RAIDZ_Q);
00832 }
00833 
00834 static int
00835 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
00836 {
00837         uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
00838         void *pdata, *qdata;
00839         uint64_t xsize, ysize, i;
00840         int x = tgts[0];
00841         int y = tgts[1];
00842 
00843         ASSERT(ntgts == 2);
00844         ASSERT(x < y);
00845         ASSERT(x >= rm->rm_firstdatacol);
00846         ASSERT(y < rm->rm_cols);
00847 
00848         ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
00849 
00850         /*
00851          * Move the parity data aside -- we're going to compute parity as
00852          * though columns x and y were full of zeros -- Pxy and Qxy. We want to
00853          * reuse the parity generation mechanism without trashing the actual
00854          * parity so we make those columns appear to be full of zeros by
00855          * setting their lengths to zero.
00856          */
00857         pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
00858         qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
00859         xsize = rm->rm_col[x].rc_size;
00860         ysize = rm->rm_col[y].rc_size;
00861 
00862         rm->rm_col[VDEV_RAIDZ_P].rc_data =
00863             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
00864         rm->rm_col[VDEV_RAIDZ_Q].rc_data =
00865             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
00866         rm->rm_col[x].rc_size = 0;
00867         rm->rm_col[y].rc_size = 0;
00868 
00869         vdev_raidz_generate_parity_pq(rm);
00870 
00871         rm->rm_col[x].rc_size = xsize;
00872         rm->rm_col[y].rc_size = ysize;
00873 
00874         p = pdata;
00875         q = qdata;
00876         pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
00877         qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
00878         xd = rm->rm_col[x].rc_data;
00879         yd = rm->rm_col[y].rc_data;
00880 
00881         /*
00882          * We now have:
00883          *      Pxy = P + D_x + D_y
00884          *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
00885          *
00886          * We can then solve for D_x:
00887          *      D_x = A * (P + Pxy) + B * (Q + Qxy)
00888          * where
00889          *      A = 2^(x - y) * (2^(x - y) + 1)^-1
00890          *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
00891          *
00892          * With D_x in hand, we can easily solve for D_y:
00893          *      D_y = P + Pxy + D_x
00894          */
00895 
00896         a = vdev_raidz_pow2[255 + x - y];
00897         b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
00898         tmp = 255 - vdev_raidz_log2[a ^ 1];
00899 
00900         aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
00901         bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
00902 
00903         for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
00904                 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
00905                     vdev_raidz_exp2(*q ^ *qxy, bexp);
00906 
00907                 if (i < ysize)
00908                         *yd = *p ^ *pxy ^ *xd;
00909         }
00910 
00911         zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
00912             rm->rm_col[VDEV_RAIDZ_P].rc_size);
00913         zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
00914             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
00915 
00916         /*
00917          * Restore the saved parity data.
00918          */
00919         rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
00920         rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
00921 
00922         return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
00923 }
00924 
01089 static void
01090 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
01091     uint8_t **rows)
01092 {
01093         int i, j;
01094         int pow;
01095 
01096         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
01097 
01098         /*
01099          * Fill in the missing rows of interest.
01100          */
01101         for (i = 0; i < nmap; i++) {
01102                 ASSERT3S(0, <=, map[i]);
01103                 ASSERT3S(map[i], <=, 2);
01104 
01105                 pow = map[i] * n;
01106                 if (pow > 255)
01107                         pow -= 255;
01108                 ASSERT(pow <= 255);
01109 
01110                 for (j = 0; j < n; j++) {
01111                         pow -= map[i];
01112                         if (pow < 0)
01113                                 pow += 255;
01114                         rows[i][j] = vdev_raidz_pow2[pow];
01115                 }
01116         }
01117 }
01118 
01119 static void
01120 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
01121     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
01122 {
01123         int i, j, ii, jj;
01124         uint8_t log;
01125 
01126         /*
01127          * Assert that the first nmissing entries from the array of used
01128          * columns correspond to parity columns and that subsequent entries
01129          * correspond to data columns.
01130          */
01131         for (i = 0; i < nmissing; i++) {
01132                 ASSERT3S(used[i], <, rm->rm_firstdatacol);
01133         }
01134         for (; i < n; i++) {
01135                 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
01136         }
01137 
01138         /*
01139          * First initialize the storage where we'll compute the inverse rows.
01140          */
01141         for (i = 0; i < nmissing; i++) {
01142                 for (j = 0; j < n; j++) {
01143                         invrows[i][j] = (i == j) ? 1 : 0;
01144                 }
01145         }
01146 
01147         /*
01148          * Subtract all trivial rows from the rows of consequence.
01149          */
01150         for (i = 0; i < nmissing; i++) {
01151                 for (j = nmissing; j < n; j++) {
01152                         ASSERT3U(used[j], >=, rm->rm_firstdatacol);
01153                         jj = used[j] - rm->rm_firstdatacol;
01154                         ASSERT3S(jj, <, n);
01155                         invrows[i][j] = rows[i][jj];
01156                         rows[i][jj] = 0;
01157                 }
01158         }
01159 
01160         /*
01161          * For each of the rows of interest, we must normalize it and subtract
01162          * a multiple of it from the other rows.
01163          */
01164         for (i = 0; i < nmissing; i++) {
01165                 for (j = 0; j < missing[i]; j++) {
01166                         ASSERT0(rows[i][j]);
01167                 }
01168                 ASSERT3U(rows[i][missing[i]], !=, 0);
01169 
01170                 /*
01171                  * Compute the inverse of the first element and multiply each
01172                  * element in the row by that value.
01173                  */
01174                 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
01175 
01176                 for (j = 0; j < n; j++) {
01177                         rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
01178                         invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
01179                 }
01180 
01181                 for (ii = 0; ii < nmissing; ii++) {
01182                         if (i == ii)
01183                                 continue;
01184 
01185                         ASSERT3U(rows[ii][missing[i]], !=, 0);
01186 
01187                         log = vdev_raidz_log2[rows[ii][missing[i]]];
01188 
01189                         for (j = 0; j < n; j++) {
01190                                 rows[ii][j] ^=
01191                                     vdev_raidz_exp2(rows[i][j], log);
01192                                 invrows[ii][j] ^=
01193                                     vdev_raidz_exp2(invrows[i][j], log);
01194                         }
01195                 }
01196         }
01197 
01198         /*
01199          * Verify that the data that is left in the rows are properly part of
01200          * an identity matrix.
01201          */
01202         for (i = 0; i < nmissing; i++) {
01203                 for (j = 0; j < n; j++) {
01204                         if (j == missing[i]) {
01205                                 ASSERT3U(rows[i][j], ==, 1);
01206                         } else {
01207                                 ASSERT0(rows[i][j]);
01208                         }
01209                 }
01210         }
01211 }
01212 
01213 static void
01214 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
01215     int *missing, uint8_t **invrows, const uint8_t *used)
01216 {
01217         int i, j, x, cc, c;
01218         uint8_t *src;
01219         uint64_t ccount;
01220         uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
01221         uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
01222         uint8_t log, val;
01223         int ll;
01224         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
01225         uint8_t *p, *pp;
01226         size_t psize;
01227 
01228         psize = sizeof (invlog[0][0]) * n * nmissing;
01229         p = kmem_alloc(psize, KM_SLEEP);
01230 
01231         for (pp = p, i = 0; i < nmissing; i++) {
01232                 invlog[i] = pp;
01233                 pp += n;
01234         }
01235 
01236         for (i = 0; i < nmissing; i++) {
01237                 for (j = 0; j < n; j++) {
01238                         ASSERT3U(invrows[i][j], !=, 0);
01239                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
01240                 }
01241         }
01242 
01243         for (i = 0; i < n; i++) {
01244                 c = used[i];
01245                 ASSERT3U(c, <, rm->rm_cols);
01246 
01247                 src = rm->rm_col[c].rc_data;
01248                 ccount = rm->rm_col[c].rc_size;
01249                 for (j = 0; j < nmissing; j++) {
01250                         cc = missing[j] + rm->rm_firstdatacol;
01251                         ASSERT3U(cc, >=, rm->rm_firstdatacol);
01252                         ASSERT3U(cc, <, rm->rm_cols);
01253                         ASSERT3U(cc, !=, c);
01254 
01255                         dst[j] = rm->rm_col[cc].rc_data;
01256                         dcount[j] = rm->rm_col[cc].rc_size;
01257                 }
01258 
01259                 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
01260 
01261                 for (x = 0; x < ccount; x++, src++) {
01262                         if (*src != 0)
01263                                 log = vdev_raidz_log2[*src];
01264 
01265                         for (cc = 0; cc < nmissing; cc++) {
01266                                 if (x >= dcount[cc])
01267                                         continue;
01268 
01269                                 if (*src == 0) {
01270                                         val = 0;
01271                                 } else {
01272                                         if ((ll = log + invlog[cc][i]) >= 255)
01273                                                 ll -= 255;
01274                                         val = vdev_raidz_pow2[ll];
01275                                 }
01276 
01277                                 if (i == 0)
01278                                         dst[cc][x] = val;
01279                                 else
01280                                         dst[cc][x] ^= val;
01281                         }
01282                 }
01283         }
01284 
01285         kmem_free(p, psize);
01286 }
01287 
01288 static int
01289 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
01290 {
01291         int n, i, c, t, tt;
01292         int nmissing_rows;
01293         int missing_rows[VDEV_RAIDZ_MAXPARITY];
01294         int parity_map[VDEV_RAIDZ_MAXPARITY];
01295 
01296         uint8_t *p, *pp;
01297         size_t psize;
01298 
01299         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
01300         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
01301         uint8_t *used;
01302 
01303         int code = 0;
01304 
01305 
01306         n = rm->rm_cols - rm->rm_firstdatacol;
01307 
01308         /*
01309          * Figure out which data columns are missing.
01310          */
01311         nmissing_rows = 0;
01312         for (t = 0; t < ntgts; t++) {
01313                 if (tgts[t] >= rm->rm_firstdatacol) {
01314                         missing_rows[nmissing_rows++] =
01315                             tgts[t] - rm->rm_firstdatacol;
01316                 }
01317         }
01318 
01319         /*
01320          * Figure out which parity columns to use to help generate the missing
01321          * data columns.
01322          */
01323         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
01324                 ASSERT(tt < ntgts);
01325                 ASSERT(c < rm->rm_firstdatacol);
01326 
01327                 /*
01328                  * Skip any targeted parity columns.
01329                  */
01330                 if (c == tgts[tt]) {
01331                         tt++;
01332                         continue;
01333                 }
01334 
01335                 code |= 1 << c;
01336 
01337                 parity_map[i] = c;
01338                 i++;
01339         }
01340 
01341         ASSERT(code != 0);
01342         ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
01343 
01344         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
01345             nmissing_rows * n + sizeof (used[0]) * n;
01346         p = kmem_alloc(psize, KM_SLEEP);
01347 
01348         for (pp = p, i = 0; i < nmissing_rows; i++) {
01349                 rows[i] = pp;
01350                 pp += n;
01351                 invrows[i] = pp;
01352                 pp += n;
01353         }
01354         used = pp;
01355 
01356         for (i = 0; i < nmissing_rows; i++) {
01357                 used[i] = parity_map[i];
01358         }
01359 
01360         for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
01361                 if (tt < nmissing_rows &&
01362                     c == missing_rows[tt] + rm->rm_firstdatacol) {
01363                         tt++;
01364                         continue;
01365                 }
01366 
01367                 ASSERT3S(i, <, n);
01368                 used[i] = c;
01369                 i++;
01370         }
01371 
01372         /*
01373          * Initialize the interesting rows of the matrix.
01374          */
01375         vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
01376 
01377         /*
01378          * Invert the matrix.
01379          */
01380         vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
01381             invrows, used);
01382 
01383         /*
01384          * Reconstruct the missing data using the generated matrix.
01385          */
01386         vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
01387             invrows, used);
01388 
01389         kmem_free(p, psize);
01390 
01391         return (code);
01392 }
01393 
01394 static int
01395 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
01396 {
01397         int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
01398         int ntgts;
01399         int i, c;
01400         int code;
01401         int nbadparity, nbaddata;
01402         int parity_valid[VDEV_RAIDZ_MAXPARITY];
01403 
01404         /*
01405          * The tgts list must already be sorted.
01406          */
01407         for (i = 1; i < nt; i++) {
01408                 ASSERT(t[i] > t[i - 1]);
01409         }
01410 
01411         nbadparity = rm->rm_firstdatacol;
01412         nbaddata = rm->rm_cols - nbadparity;
01413         ntgts = 0;
01414         for (i = 0, c = 0; c < rm->rm_cols; c++) {
01415                 if (c < rm->rm_firstdatacol)
01416                         parity_valid[c] = B_FALSE;
01417 
01418                 if (i < nt && c == t[i]) {
01419                         tgts[ntgts++] = c;
01420                         i++;
01421                 } else if (rm->rm_col[c].rc_error != 0) {
01422                         tgts[ntgts++] = c;
01423                 } else if (c >= rm->rm_firstdatacol) {
01424                         nbaddata--;
01425                 } else {
01426                         parity_valid[c] = B_TRUE;
01427                         nbadparity--;
01428                 }
01429         }
01430 
01431         ASSERT(ntgts >= nt);
01432         ASSERT(nbaddata >= 0);
01433         ASSERT(nbaddata + nbadparity == ntgts);
01434 
01435         dt = &tgts[nbadparity];
01436 
01437         /*
01438          * See if we can use any of our optimized reconstruction routines.
01439          */
01440         if (!vdev_raidz_default_to_general) {
01441                 switch (nbaddata) {
01442                 case 1:
01443                         if (parity_valid[VDEV_RAIDZ_P])
01444                                 return (vdev_raidz_reconstruct_p(rm, dt, 1));
01445 
01446                         ASSERT(rm->rm_firstdatacol > 1);
01447 
01448                         if (parity_valid[VDEV_RAIDZ_Q])
01449                                 return (vdev_raidz_reconstruct_q(rm, dt, 1));
01450 
01451                         ASSERT(rm->rm_firstdatacol > 2);
01452                         break;
01453 
01454                 case 2:
01455                         ASSERT(rm->rm_firstdatacol > 1);
01456 
01457                         if (parity_valid[VDEV_RAIDZ_P] &&
01458                             parity_valid[VDEV_RAIDZ_Q])
01459                                 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
01460 
01461                         ASSERT(rm->rm_firstdatacol > 2);
01462 
01463                         break;
01464                 }
01465         }
01466 
01467         code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
01468         ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
01469         ASSERT(code > 0);
01470         return (code);
01471 }
01472 
01476 static int
01477 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
01478     uint64_t *ashift)
01479 {
01480         vdev_t *cvd;
01481         uint64_t nparity = vd->vdev_nparity;
01482         int c;
01483         int lasterror = 0;
01484         int numerrors = 0;
01485 
01486         ASSERT(nparity > 0);
01487 
01488         if (nparity > VDEV_RAIDZ_MAXPARITY ||
01489             vd->vdev_children < nparity + 1) {
01490                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
01491                 return (EINVAL);
01492         }
01493 
01494         vdev_open_children(vd);
01495 
01496         for (c = 0; c < vd->vdev_children; c++) {
01497                 cvd = vd->vdev_child[c];
01498 
01499                 if (cvd->vdev_open_error != 0) {
01500                         lasterror = cvd->vdev_open_error;
01501                         numerrors++;
01502                         continue;
01503                 }
01504 
01505                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
01506                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
01507                 *ashift = MAX(*ashift, cvd->vdev_ashift);
01508         }
01509 
01510         *asize *= vd->vdev_children;
01511         *max_asize *= vd->vdev_children;
01512 
01513         if (numerrors > nparity) {
01514                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
01515                 return (lasterror);
01516         }
01517 
01518         return (0);
01519 }
01520 
01521 static void
01522 vdev_raidz_close(vdev_t *vd)
01523 {
01524         int c;
01525 
01526         for (c = 0; c < vd->vdev_children; c++)
01527                 vdev_close(vd->vdev_child[c]);
01528 }
01529 
01530 static uint64_t
01531 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
01532 {
01533         uint64_t asize;
01534         uint64_t ashift = vd->vdev_top->vdev_ashift;
01535         uint64_t cols = vd->vdev_children;
01536         uint64_t nparity = vd->vdev_nparity;
01537 
01538         asize = ((psize - 1) >> ashift) + 1;
01539         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
01540         asize = roundup(asize, nparity + 1) << ashift;
01541 
01542         return (asize);
01543 }
01544 
01548 static void
01549 vdev_raidz_child_done(zio_t *zio)
01550 {
01551         raidz_col_t *rc = zio->io_private;
01552 
01553         rc->rc_error = zio->io_error;
01554         rc->rc_tried = 1;
01555         rc->rc_skipped = 0;
01556 }
01557 
01574 static int
01575 vdev_raidz_io_start(zio_t *zio)
01576 {
01577         vdev_t *vd = zio->io_vd;
01578         vdev_t *tvd = vd->vdev_top;
01579         vdev_t *cvd;
01580         raidz_map_t *rm;
01581         raidz_col_t *rc;
01582         int c, i;
01583 
01584         rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
01585             vd->vdev_nparity);
01586 
01587         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
01588 
01589         if (zio->io_type == ZIO_TYPE_FREE) {
01590                 for (c = 0; c < rm->rm_cols; c++) {
01591                         rc = &rm->rm_col[c];
01592                         cvd = vd->vdev_child[rc->rc_devidx];
01593                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
01594                             rc->rc_offset, rc->rc_data, rc->rc_size,
01595                             zio->io_type, zio->io_priority, 0,
01596                             vdev_raidz_child_done, rc));
01597                 }
01598                 return (ZIO_PIPELINE_CONTINUE);
01599         }
01600 
01601         if (zio->io_type == ZIO_TYPE_WRITE) {
01602                 vdev_raidz_generate_parity(rm);
01603 
01604                 for (c = 0; c < rm->rm_cols; c++) {
01605                         rc = &rm->rm_col[c];
01606                         cvd = vd->vdev_child[rc->rc_devidx];
01607                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
01608                             rc->rc_offset, rc->rc_data, rc->rc_size,
01609                             zio->io_type, zio->io_priority, 0,
01610                             vdev_raidz_child_done, rc));
01611                 }
01612 
01613                 /*
01614                  * Generate optional I/Os for any skipped sectors to improve
01615                  * aggregation contiguity.
01616                  */
01617                 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
01618                         ASSERT(c <= rm->rm_scols);
01619                         if (c == rm->rm_scols)
01620                                 c = 0;
01621                         rc = &rm->rm_col[c];
01622                         cvd = vd->vdev_child[rc->rc_devidx];
01623                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
01624                             rc->rc_offset + rc->rc_size, NULL,
01625                             1 << tvd->vdev_ashift,
01626                             zio->io_type, zio->io_priority,
01627                             ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
01628                 }
01629 
01630                 return (ZIO_PIPELINE_CONTINUE);
01631         }
01632 
01633         ASSERT(zio->io_type == ZIO_TYPE_READ);
01634 
01635         /*
01636          * Iterate over the columns in reverse order so that we hit the parity
01637          * last -- any errors along the way will force us to read the parity.
01638          */
01639         for (c = rm->rm_cols - 1; c >= 0; c--) {
01640                 rc = &rm->rm_col[c];
01641                 cvd = vd->vdev_child[rc->rc_devidx];
01642                 if (!vdev_readable(cvd)) {
01643                         if (c >= rm->rm_firstdatacol)
01644                                 rm->rm_missingdata++;
01645                         else
01646                                 rm->rm_missingparity++;
01647                         rc->rc_error = ENXIO;
01648                         rc->rc_tried = 1;       /* don't even try */
01649                         rc->rc_skipped = 1;
01650                         continue;
01651                 }
01652                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
01653                         if (c >= rm->rm_firstdatacol)
01654                                 rm->rm_missingdata++;
01655                         else
01656                                 rm->rm_missingparity++;
01657                         rc->rc_error = ESTALE;
01658                         rc->rc_skipped = 1;
01659                         continue;
01660                 }
01661                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
01662                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
01663                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
01664                             rc->rc_offset, rc->rc_data, rc->rc_size,
01665                             zio->io_type, zio->io_priority, 0,
01666                             vdev_raidz_child_done, rc));
01667                 }
01668         }
01669 
01670         return (ZIO_PIPELINE_CONTINUE);
01671 }
01672 
01673 
01677 static void
01678 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
01679 {
01680         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
01681 
01682         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
01683                 zio_bad_cksum_t zbc;
01684                 raidz_map_t *rm = zio->io_vsd;
01685 
01686                 mutex_enter(&vd->vdev_stat_lock);
01687                 vd->vdev_stat.vs_checksum_errors++;
01688                 mutex_exit(&vd->vdev_stat_lock);
01689 
01690                 zbc.zbc_has_cksum = 0;
01691                 zbc.zbc_injected = rm->rm_ecksuminjected;
01692 
01693                 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
01694                     rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
01695                     &zbc);
01696         }
01697 }
01698 
01703 static int
01704 raidz_checksum_verify(zio_t *zio)
01705 {
01706         zio_bad_cksum_t zbc;
01707         raidz_map_t *rm = zio->io_vsd;
01708 
01709         int ret = zio_checksum_error(zio, &zbc);
01710         if (ret != 0 && zbc.zbc_injected != 0)
01711                 rm->rm_ecksuminjected = 1;
01712 
01713         return (ret);
01714 }
01715 
01722 static int
01723 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
01724 {
01725         void *orig[VDEV_RAIDZ_MAXPARITY];
01726         int c, ret = 0;
01727         raidz_col_t *rc;
01728 
01729         for (c = 0; c < rm->rm_firstdatacol; c++) {
01730                 rc = &rm->rm_col[c];
01731                 if (!rc->rc_tried || rc->rc_error != 0)
01732                         continue;
01733                 orig[c] = zio_buf_alloc(rc->rc_size);
01734                 bcopy(rc->rc_data, orig[c], rc->rc_size);
01735         }
01736 
01737         vdev_raidz_generate_parity(rm);
01738 
01739         for (c = 0; c < rm->rm_firstdatacol; c++) {
01740                 rc = &rm->rm_col[c];
01741                 if (!rc->rc_tried || rc->rc_error != 0)
01742                         continue;
01743                 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
01744                         raidz_checksum_error(zio, rc, orig[c]);
01745                         rc->rc_error = ECKSUM;
01746                         ret++;
01747                 }
01748                 zio_buf_free(orig[c], rc->rc_size);
01749         }
01750 
01751         return (ret);
01752 }
01753 
/*
 * Counters of successful reconstructions, indexed by the code returned from
 * vdev_raidz_reconstruct() (a bitmask of the parity columns used). The slot
 * for a code is bumped with atomic_inc_64() each time a reconstruction with
 * that code passes checksum verification.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
01758 
01759 static int
01760 vdev_raidz_worst_error(raidz_map_t *rm)
01761 {
01762         int error = 0;
01763 
01764         for (int c = 0; c < rm->rm_cols; c++)
01765                 error = zio_worst_error(error, rm->rm_col[c].rc_error);
01766 
01767         return (error);
01768 }
01769 
01778 static int
01779 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
01780 {
01781         raidz_map_t *rm = zio->io_vsd;
01782         raidz_col_t *rc;
01783         void *orig[VDEV_RAIDZ_MAXPARITY];
01784         int tstore[VDEV_RAIDZ_MAXPARITY + 2];
01785         int *tgts = &tstore[1];
01786         int current, next, i, c, n;
01787         int code, ret = 0;
01788 
01789         ASSERT(total_errors < rm->rm_firstdatacol);
01790 
01791         /*
01792          * This simplifies one edge condition.
01793          */
01794         tgts[-1] = -1;
01795 
01796         for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
01797                 /*
01798                  * Initialize the targets array by finding the first n columns
01799                  * that contain no error.
01800                  *
01801                  * If there were no data errors, we need to ensure that we're
01802                  * always explicitly attempting to reconstruct at least one
01803                  * data column. To do this, we simply push the highest target
01804                  * up into the data columns.
01805                  */
01806                 for (c = 0, i = 0; i < n; i++) {
01807                         if (i == n - 1 && data_errors == 0 &&
01808                             c < rm->rm_firstdatacol) {
01809                                 c = rm->rm_firstdatacol;
01810                         }
01811 
01812                         while (rm->rm_col[c].rc_error != 0) {
01813                                 c++;
01814                                 ASSERT3S(c, <, rm->rm_cols);
01815                         }
01816 
01817                         tgts[i] = c++;
01818                 }
01819 
01820                 /*
01821                  * Setting tgts[n] simplifies the other edge condition.
01822                  */
01823                 tgts[n] = rm->rm_cols;
01824 
01825                 /*
01826                  * These buffers were allocated in previous iterations.
01827                  */
01828                 for (i = 0; i < n - 1; i++) {
01829                         ASSERT(orig[i] != NULL);
01830                 }
01831 
01832                 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
01833 
01834                 current = 0;
01835                 next = tgts[current];
01836 
01837                 while (current != n) {
01838                         tgts[current] = next;
01839                         current = 0;
01840 
01841                         /*
01842                          * Save off the original data that we're going to
01843                          * attempt to reconstruct.
01844                          */
01845                         for (i = 0; i < n; i++) {
01846                                 ASSERT(orig[i] != NULL);
01847                                 c = tgts[i];
01848                                 ASSERT3S(c, >=, 0);
01849                                 ASSERT3S(c, <, rm->rm_cols);
01850                                 rc = &rm->rm_col[c];
01851                                 bcopy(rc->rc_data, orig[i], rc->rc_size);
01852                         }
01853 
01854                         /*
01855                          * Attempt a reconstruction and exit the outer loop on
01856                          * success.
01857                          */
01858                         code = vdev_raidz_reconstruct(rm, tgts, n);
01859                         if (raidz_checksum_verify(zio) == 0) {
01860                                 atomic_inc_64(&raidz_corrected[code]);
01861 
01862                                 for (i = 0; i < n; i++) {
01863                                         c = tgts[i];
01864                                         rc = &rm->rm_col[c];
01865                                         ASSERT(rc->rc_error == 0);
01866                                         if (rc->rc_tried)
01867                                                 raidz_checksum_error(zio, rc,
01868                                                     orig[i]);
01869                                         rc->rc_error = ECKSUM;
01870                                 }
01871 
01872                                 ret = code;
01873                                 goto done;
01874                         }
01875 
01876                         /*
01877                          * Restore the original data.
01878                          */
01879                         for (i = 0; i < n; i++) {
01880                                 c = tgts[i];
01881                                 rc = &rm->rm_col[c];
01882                                 bcopy(orig[i], rc->rc_data, rc->rc_size);
01883                         }
01884 
01885                         do {
01886                                 /*
01887                                  * Find the next valid column after the current
01888                                  * position..
01889                                  */
01890                                 for (next = tgts[current] + 1;
01891                                     next < rm->rm_cols &&
01892                                     rm->rm_col[next].rc_error != 0; next++)
01893                                         continue;
01894 
01895                                 ASSERT(next <= tgts[current + 1]);
01896 
01897                                 /*
01898                                  * If that spot is available, we're done here.
01899                                  */
01900                                 if (next != tgts[current + 1])
01901                                         break;
01902 
01903                                 /*
01904                                  * Otherwise, find the next valid column after
01905                                  * the previous position.
01906                                  */
01907                                 for (c = tgts[current - 1] + 1;
01908                                     rm->rm_col[c].rc_error != 0; c++)
01909                                         continue;
01910 
01911                                 tgts[current] = c;
01912                                 current++;
01913 
01914                         } while (current != n);
01915                 }
01916         }
01917         n--;
01918 done:
01919         for (i = 0; i < n; i++) {
01920                 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
01921         }
01922 
01923         return (ret);
01924 }
01925 
/*
 * Completion callback for a RAID-Z zio (installed in vdev_raidz_ops).
 *
 * Tallies the per-column errors recorded in the raidz_map_t found in
 * zio->io_vsd, then acts on the I/O type:
 *
 *   WRITE  Sets zio->io_error to vdev_raidz_worst_error() only when more
 *          columns failed than rm_firstdatacol; otherwise io_error is
 *          left untouched.
 *   FREE   Returns without doing anything further.
 *   READ   Tries, in order:
 *            1. verify the checksum of the columns already read,
 *               reconstructing failed data columns first if needed;
 *            2. re-issue child reads for every column not yet marked
 *               rc_tried and return (the in-body comment indicates this
 *               function is entered again once they complete);
 *            3. combinatorial reconstruction via vdev_raidz_combrec().
 *          If total_errors exceeds rm_firstdatacol, io_error becomes
 *          vdev_raidz_worst_error(). If reconstruction is impossible or
 *          fails, io_error becomes ECKSUM and, unless the zio is
 *          speculative, a checksum ereport is started for every column
 *          whose rc_error is 0.
 *
 * On the way out of a read (label "done"), if io_error is 0 and the pool is
 * writeable, every column with rc_error set is rewritten from the data in
 * hand, provided unexpected errors were seen or the zio is a resilver.
 */
01946 static void
01947 vdev_raidz_io_done(zio_t *zio)
01948 {
01949         vdev_t *vd = zio->io_vd;
01950         vdev_t *cvd;
01951         raidz_map_t *rm = zio->io_vsd;
01952         raidz_col_t *rc;
01953         int unexpected_errors = 0;
01954         int parity_errors = 0;
01955         int parity_untried = 0;
01956         int data_errors = 0;
01957         int total_errors = 0;
01958         int n, c;
01959         int tgts[VDEV_RAIDZ_MAXPARITY];
01960         int code;
01961 
01962         ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
01963 
01964         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
01965         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
01966 
              /*
               * Classify every column. Columns below rm_firstdatacol are
               * counted as parity, the rest as data. An error on a column
               * that is not marked rc_skipped counts as unexpected. A parity
               * column with no error that was never tried is counted
               * separately in parity_untried.
               */
01967         for (c = 0; c < rm->rm_cols; c++) {
01968                 rc = &rm->rm_col[c];
01969 
01970                 if (rc->rc_error) {
01971                         ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
01972 
01973                         if (c < rm->rm_firstdatacol)
01974                                 parity_errors++;
01975                         else
01976                                 data_errors++;
01977 
01978                         if (!rc->rc_skipped)
01979                                 unexpected_errors++;
01980 
01981                         total_errors++;
01982                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
01983                         parity_untried++;
01984                 }
01985         }
01986 
              /*
               * Writes tolerate up to rm_firstdatacol failed columns; only
               * beyond that is the worst child error surfaced. Frees need no
               * further processing. Neither path reaches the repair code at
               * "done".
               */
01987         if (zio->io_type == ZIO_TYPE_WRITE) {
01998                 /* XXPOLICY */
01999                 if (total_errors > rm->rm_firstdatacol)
02000                         zio->io_error = vdev_raidz_worst_error(rm);
02001 
02002                 return;
02003         } else if (zio->io_type == ZIO_TYPE_FREE) {
02004                 return;
02005         }
02006 
02007         ASSERT(zio->io_type == ZIO_TYPE_READ);
02008         /*
02009          * There are three potential phases for a read:
02010          *      1. produce valid data from the columns read
02011          *      2. read all disks and try again
02012          *      3. perform combinatorial reconstruction
02013          *
02014          * Each phase is progressively both more expensive and less likely to
02015          * occur. If we encounter more errors than we can repair or all phases
02016          * fail, we have no choice but to return an error.
02017          */
02018 
02019         /*
02020          * If the number of errors we saw was correctable -- less than or equal
02021          * to the number of parity disks read -- attempt to produce data that
02022          * has a valid checksum. Naturally, this case applies in the absence of
02023          * any errors.
02024          */
02025         if (total_errors <= rm->rm_firstdatacol - parity_untried) {
02026                 if (data_errors == 0) {
02027                         if (raidz_checksum_verify(zio) == 0) {
02028                                 /*
02029                                  * If we read parity information (unnecessarily
02030                                  * as it happens since no reconstruction was
02031                                  * needed) regenerate and verify the parity.
02032                                  * We also regenerate parity when resilvering
02033                                  * so we can write it out to the failed device
02034                                  * later.
02035                                  */
02036                                 if (parity_errors + parity_untried <
02037                                     rm->rm_firstdatacol ||
02038                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                              /*
                                               * The value returned by
                                               * raidz_parity_verify() is added
                                               * to unexpected_errors, so a
                                               * non-zero result triggers the
                                               * repair pass at "done".
                                               */
02039                                         n = raidz_parity_verify(zio, rm);
02040                                         unexpected_errors += n;
02041                                         ASSERT(parity_errors + n <=
02042                                             rm->rm_firstdatacol);
02043                                 }
02044                                 goto done;
02045                         }
02046                 } else {
02047                         /*
02048                          * We either attempt to read all the parity columns or
02049                          * none of them. If we didn't try to read parity, we
02050                          * wouldn't be here in the correctable case. There must
02051                          * also have been fewer parity errors than parity
02052                          * columns or, again, we wouldn't be in this code path.
02053                          */
02054                         ASSERT(parity_untried == 0);
02055                         ASSERT(parity_errors < rm->rm_firstdatacol);
02056 
02057                         /*
02058                          * Identify the data columns that reported an error.
02059                          */
02060                         n = 0;
02061                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
02062                                 rc = &rm->rm_col[c];
02063                                 if (rc->rc_error != 0) {
02064                                         ASSERT(n < VDEV_RAIDZ_MAXPARITY);
02065                                         tgts[n++] = c;
02066                                 }
02067                         }
02068 
02069                         ASSERT(rm->rm_firstdatacol >= n);
02070 
02071                         code = vdev_raidz_reconstruct(rm, tgts, n);
02072 
02073                         if (raidz_checksum_verify(zio) == 0) {
                                      /* code indexes the raidz_corrected[] counters. */
02074                                 atomic_inc_64(&raidz_corrected[code]);
02075 
02076                                 /*
02077                                  * If we read more parity disks than were used
02078                                  * for reconstruction, confirm that the other
02079                                  * parity disks produced correct data. This
02080                                  * routine is suboptimal in that it regenerates
02081                                  * the parity that we already used in addition
02082                                  * to the parity that we're attempting to
02083                                  * verify, but this should be a relatively
02084                                  * uncommon case, and can be optimized if it
02085                                  * becomes a problem. Note that we regenerate
02086                                  * parity when resilvering so we can write it
02087                                  * out to failed devices later.
02088                                  */
02089                                 if (parity_errors < rm->rm_firstdatacol - n ||
02090                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                              /*
                                               * n is reused here: it stops
                                               * being the number of
                                               * reconstruction targets and
                                               * becomes the result of
                                               * raidz_parity_verify().
                                               */
02091                                         n = raidz_parity_verify(zio, rm);
02092                                         unexpected_errors += n;
02093                                         ASSERT(parity_errors + n <=
02094                                             rm->rm_firstdatacol);
02095                                 }
02096 
02097                                 goto done;
02098                         }
02099                 }
02100         }
02101 
02102         /*
02103          * This isn't a typical situation -- either we got a read error or
02104          * a child silently returned bad data. Read every block so we can
02105          * try again with as much data and parity as we can track down. If
02106          * we've already been through once before, all children will be marked
02107          * as tried so we'll proceed to combinatorial reconstruction.
02108          */
              /*
               * From here on, any path that reaches "done" with io_error == 0
               * must run the repair loop, so unexpected_errors is forced
               * non-zero regardless of the tally above.
               */
02109         unexpected_errors = 1;
02110         rm->rm_missingdata = 0;
02111         rm->rm_missingparity = 0;
02112 
02113         for (c = 0; c < rm->rm_cols; c++) {
02114                 if (rm->rm_col[c].rc_tried)
02115                         continue;
02116 
                      /*
                       * First untried column found: call zio_vdev_io_redone()
                       * once, issue a child read for this and every later
                       * untried column, then return.
                       */
02117                 zio_vdev_io_redone(zio);
02118                 do {
02119                         rc = &rm->rm_col[c];
                              /*
                               * "continue" in a do/while jumps to the loop
                               * condition, so ++c still advances past a
                               * column that was already tried.
                               */
02120                         if (rc->rc_tried)
02121                                 continue;
02122                         zio_nowait(zio_vdev_child_io(zio, NULL,
02123                             vd->vdev_child[rc->rc_devidx],
02124                             rc->rc_offset, rc->rc_data, rc->rc_size,
02125                             zio->io_type, zio->io_priority, 0,
02126                             vdev_raidz_child_done, rc));
02127                 } while (++c < rm->rm_cols);
02128 
02129                 return;
02130         }
02131 
02132         /*
02133          * At this point we've attempted to reconstruct the data given the
02134          * errors we detected, and we've attempted to read all columns. There
02135          * must, therefore, be one or more additional problems -- silent errors
02136          * resulting in invalid data rather than explicit I/O errors resulting
02137          * in absent data. We check if there is enough additional data to
02138          * possibly reconstruct the data and then perform combinatorial
02139          * reconstruction over all possible combinations. If that fails,
02140          * we're cooked.
02141          */
02142         if (total_errors > rm->rm_firstdatacol) {
02143                 zio->io_error = vdev_raidz_worst_error(rm);
02144 
02145         } else if (total_errors < rm->rm_firstdatacol &&
02146             (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
02147                 /*
02148                  * If we didn't use all the available parity for the
02149                  * combinatorial reconstruction, verify that the remaining
02150                  * parity is correct.
02151                  */
                      /*
                       * (1 << rm_firstdatacol) - 1 has one bit set per parity
                       * column. NOTE(review): code looks like a bitmask of
                       * the parity columns used -- confirm against
                       * vdev_raidz_reconstruct().
                       */
02152                 if (code != (1 << rm->rm_firstdatacol) - 1)
02153                         (void) raidz_parity_verify(zio, rm);
02154         } else {
02155                 /*
02156                  * We're here because either:
02157                  *
02158                  *      total_errors == rm_firstdatacol, or
02159                  *      vdev_raidz_combrec() failed
02160                  *
02161                  * In either case, there is enough bad data to prevent
02162                  * reconstruction.
02163                  *
02164                  * Start checksum ereports for all children which haven't
02165                  * failed, and the IO wasn't speculative.
02166                  */
02167                 zio->io_error = ECKSUM;
02168 
02169                 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
02170                         for (c = 0; c < rm->rm_cols; c++) {
02171                                 rc = &rm->rm_col[c];
02172                                 if (rc->rc_error == 0) {
02173                                         zio_bad_cksum_t zbc;
02174                                         zbc.zbc_has_cksum = 0;
02175                                         zbc.zbc_injected =
02176                                             rm->rm_ecksuminjected;
02177 
                                              /*
                                               * The column index is passed
                                               * as the opaque pointer
                                               * argument, cast through
                                               * uintptr_t.
                                               */
02178                                         zfs_ereport_start_checksum(
02179                                             zio->io_spa,
02180                                             vd->vdev_child[rc->rc_devidx],
02181                                             zio, rc->rc_offset, rc->rc_size,
02182                                             (void *)(uintptr_t)c, &zbc);
02183                                 }
02184                         }
02185                 }
02186         }
02187 
02188 done:
02189         zio_checksum_verified(zio);
02190 
02191         if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
02192             (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
02193                 /*
02194                  * Use the good data we have in hand to repair damaged children.
02195                  */
02196                 for (c = 0; c < rm->rm_cols; c++) {
02197                         rc = &rm->rm_col[c];
02198                         cvd = vd->vdev_child[rc->rc_devidx];
02199 
02200                         if (rc->rc_error == 0)
02201                                 continue;
02202 
                              /*
                               * Every repair write carries ZIO_FLAG_IO_REPAIR;
                               * ZIO_FLAG_SELF_HEAL is added only when
                               * unexpected errors were seen, so a
                               * resilver-only pass goes out without it.
                               */
02203                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
02204                             rc->rc_offset, rc->rc_data, rc->rc_size,
02205                             ZIO_TYPE_WRITE, zio->io_priority,
02206                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
02207                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
02208                 }
02209         }
02210 }
02211 
02212 static void
02213 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
02214 {
02215         if (faulted > vd->vdev_nparity)
02216                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
02217                     VDEV_AUX_NO_REPLICAS);
02218         else if (degraded + faulted != 0)
02219                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
02220         else
02221                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
02222 }
02223 
/*
 * Operations vector for the RAID-Z vdev type.
 *
 * The initializer is positional, so the order of the entries must match the
 * member order of vdev_ops_t, which is declared outside this file.
 */
02224 vdev_ops_t vdev_raidz_ops = {
02225         vdev_raidz_open,
02226         vdev_raidz_close,
02227         vdev_raidz_asize,
02228         vdev_raidz_io_start,
02229         vdev_raidz_io_done,
02230         vdev_raidz_state_change,
              /*
               * The next two slots are left NULL for RAID-Z.
               * NOTE(review): confirm which vdev_ops_t members they
               * correspond to and that callers tolerate NULL there.
               */
02231         NULL,
02232         NULL,
02233         VDEV_TYPE_RAIDZ,        /* name of this vdev type */
02234         B_FALSE                 /* not a leaf vdev */
02235 };
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines