FreeBSD ZFS — The Zettabyte File System
RAID-Z virtual device implementation (vdev_raidz.c)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * One column (i.e. one child vdev's share) of a RAID-Z I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * The geometry of one logical I/O mapped onto a RAID-Z vdev: parity
 * columns first (rm_firstdatacol of them), then data columns.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-sized copy of user data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* variable-length array of columns */
} raidz_map_t;

/* Indexes of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a byte by 2 in GF(2^8): shift left and, if the high bit was
 * set, reduce by the field's generator polynomial (0x11d, applied as 0x1d).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
/* Multiply a byte by 4 in GF(2^8): two doublings. */
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * Perform the GF(2^8) multiply-by-2 on eight packed bytes at once.
 * The mask is built from each byte's top bit ((m << 1) - (m >> 7) turns
 * 0x80 into 0xff per byte) and selects which bytes get the 0x1d reduction.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

/* Multiply eight packed bytes by 4: two packed doublings. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

/*
 * Force reconstruction to use the general-purpose (matrix) method
 * instead of the specialized P/Q/PQ routines.  Tunable.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the GF(2^8) field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};

/* Logs of 2 in the GF(2^8) field defined above (inverse of pow2). */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power, in GF(2^8),
 * using the log/pow lookup tables.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	/* reduce the exponent modulo 255 (the field's multiplicative order) */
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

/*
 * Free a raidz map: the parity buffers we allocated, the data copy (if
 * one was taken for checksum reporting), and the map itself.
 */
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	/* parity columns own their buffers; data columns alias the zio */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		if (rm->rm_col[c].rc_data != NULL)
			zio_buf_free(rm->rm_col[c].rc_data,
			    rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	/* the map was allocated with rm_scols columns, not rm_cols */
	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

/*
 * vsd (virtual-device-specific data) destructor: called when the owning
 * zio goes away.  Defer the actual free while checksum reports still
 * reference the map.
 */
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
/*
 * Drop one checksum-report reference; free the map if the zio has
 * already been freed and this was the last reference.
 */
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

/*
 * Finish a deferred checksum ereport: given the known-good data for the
 * whole logical block, compute what column c *should* contain and hand
 * the good/bad pair to zfs_ereport_finish_checksum().
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/* data columns point back into the saved copy */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}

/*
 * Start a deferred checksum ereport for column c (passed in arg).
 * Registers the finish/free callbacks, takes a reference on the map,
 * and — the first time only — copies the data columns aside, since
 * there is no guarantee the zio's buffer won't be reused before the
 * report is finished.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Map a logical zio onto the RAID-Z geometry: divide the I/O across the
 * dcols child vdevs, laying out parity columns first.  The map is hung
 * off the zio as vsd and freed through vdev_raidz_vsd_ops.
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's preferred sector size */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* full data sectors per data column */
	q = s / (dcols - nparity);
	/* leftover data sectors that don't fill a whole row */
	r = s - q * (dcols - nparity);
	/* number of "big" columns, which hold one extra sector each */
	bc = (r == 0 ? 0 : r + nparity);
	/* total data + parity sectors for this I/O */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		/* the I/O doesn't span all child vdevs */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* wrap around to the first child, one row further down */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* FREE zios carry no data, so no buffers to wire up */
	if (zio->io_type != ZIO_TYPE_FREE) {
		/* parity columns get their own buffers */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		/* data columns carve up the zio's buffer in order */
		rm->rm_col[c].rc_data = zio->io_data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}
/*
 * Generate single (P) parity: the XOR of all data columns.
 */
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* first column initializes P rather than XORing */
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

/*
 * Generate P (XOR) and Q (GF(2^8) weighted sum) parity, processing
 * eight bytes at a time with the 64-bit packed multiply macros.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

/*
 * Generate triple parity: P (XOR), Q (powers of 2), R (powers of 4).
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the
 * number of parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/*
 * Reconstruct a single missing data column (tgts[0]) from P parity:
 * XOR of P and every other data column.
 */
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	/* seed the target with the P parity ... */
	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	/* ... then XOR in every surviving data column */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	/* report which parity was used, as a bitmask */
	return (1 << VDEV_RAIDZ_P);
}
/*
 * Reconstruct a single missing data column (tgts[0]) from Q parity by
 * recomputing the weighted sum of the surviving columns and dividing
 * the residue by the missing column's coefficient.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/* accumulate Q' over the surviving data columns into the target */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		/* the missing column contributes zeros */
		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/* divide (Q ^ Q') by the column's coefficient, byte by byte */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

/*
 * Reconstruct two missing data columns (tgts[0] < tgts[1]) from the
 * P and Q parity columns by solving the two GF(2^8) equations.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	/* tmp is log of the inverse of (2^(x - y) + 1) */
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* y may be a shorter column than x */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/*
 * Fill in the rows of the parity-coefficient matrix that correspond to
 * the parity columns listed in map[] (0 = P, 1 = Q, 2 = R).
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

/*
 * Gauss-Jordan elimination over GF(2^8): invert the sub-matrix of
 * coefficient rows selected for reconstruction, leaving the inverse
 * in invrows[].
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
/*
 * Apply the inverted coefficient matrix: for each surviving column
 * used[i], scatter its contribution (scaled by invrows) into every
 * missing data column.  Works byte-wise using log/exp tables.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* pre-compute the logs of the inverse matrix entries */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* short destination column */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* multiply via log addition mod 255 */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* first pass initializes, later passes XOR */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

/*
 * General-purpose reconstruction for any combination of up to
 * VDEV_RAIDZ_MAXPARITY missing columns: builds, inverts, and applies
 * the coefficient matrix.  Returns a bitmask of the parity columns used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/* single allocation holds rows, inverse rows, and the used[] list */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* the remaining used[] entries are the surviving data columns */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

/*
 * Reconstruct the columns listed in t[] (sorted ascending), plus any
 * columns already marked in error, preferring the fast specialized
 * routines when applicable.  Returns a bitmask of the parity used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* tgts[] is sorted, so the data targets follow the parity targets */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
01406 */ 01407 for (i = 1; i < nt; i++) { 01408 ASSERT(t[i] > t[i - 1]); 01409 } 01410 01411 nbadparity = rm->rm_firstdatacol; 01412 nbaddata = rm->rm_cols - nbadparity; 01413 ntgts = 0; 01414 for (i = 0, c = 0; c < rm->rm_cols; c++) { 01415 if (c < rm->rm_firstdatacol) 01416 parity_valid[c] = B_FALSE; 01417 01418 if (i < nt && c == t[i]) { 01419 tgts[ntgts++] = c; 01420 i++; 01421 } else if (rm->rm_col[c].rc_error != 0) { 01422 tgts[ntgts++] = c; 01423 } else if (c >= rm->rm_firstdatacol) { 01424 nbaddata--; 01425 } else { 01426 parity_valid[c] = B_TRUE; 01427 nbadparity--; 01428 } 01429 } 01430 01431 ASSERT(ntgts >= nt); 01432 ASSERT(nbaddata >= 0); 01433 ASSERT(nbaddata + nbadparity == ntgts); 01434 01435 dt = &tgts[nbadparity]; 01436 01437 /* 01438 * See if we can use any of our optimized reconstruction routines. 01439 */ 01440 if (!vdev_raidz_default_to_general) { 01441 switch (nbaddata) { 01442 case 1: 01443 if (parity_valid[VDEV_RAIDZ_P]) 01444 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 01445 01446 ASSERT(rm->rm_firstdatacol > 1); 01447 01448 if (parity_valid[VDEV_RAIDZ_Q]) 01449 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 01450 01451 ASSERT(rm->rm_firstdatacol > 2); 01452 break; 01453 01454 case 2: 01455 ASSERT(rm->rm_firstdatacol > 1); 01456 01457 if (parity_valid[VDEV_RAIDZ_P] && 01458 parity_valid[VDEV_RAIDZ_Q]) 01459 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 01460 01461 ASSERT(rm->rm_firstdatacol > 2); 01462 01463 break; 01464 } 01465 } 01466 01467 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 01468 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 01469 ASSERT(code > 0); 01470 return (code); 01471 } 01472 01476 static int 01477 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 01478 uint64_t *ashift) 01479 { 01480 vdev_t *cvd; 01481 uint64_t nparity = vd->vdev_nparity; 01482 int c; 01483 int lasterror = 0; 01484 int numerrors = 0; 01485 01486 ASSERT(nparity > 0); 01487 01488 if (nparity > VDEV_RAIDZ_MAXPARITY || 01489 
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		/*
		 * Per-child sizes are clamped to the smallest child; the
		 * -1/+1 dance keeps the MIN well-defined for the initial
		 * caller-supplied value.
		 */
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	/* More failed children than parity columns: cannot open. */
	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

/*
 * Close all children of the raidz vdev.
 */
static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Convert a logical (psize) size to the allocated (asize) size for this
 * raidz geometry: round psize up to sectors, add the parity sectors for
 * each data stripe, then round up to a multiple of nparity + 1 sectors
 * (skip-sector padding).
 */
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}

/*
 * Completion callback for per-column child I/Os: record the child's
 * error status in the raidz column.
 */
static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start an I/O on a raidz vdev: map the zio onto columns, then issue
 * child I/Os.  Writes generate parity first and also issue optional
 * I/Os over skip sectors to improve aggregation; reads skip data
 * columns that are unreadable or missing per the DTL, forcing parity
 * reads instead.
 */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_FREE) {
		/* Pass the free through to every column's child vdev. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read parity columns only when some data is already known
		 * missing or when scrubbing/resilvering; data columns are
		 * always read.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}


/*
 * Report a checksum error for a child of a RAID-Z device: bump the
 * child's checksum-error count and post an ereport with the bad data
 * (unless the read was speculative).
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}

/*
 * Verify the zio's block checksum, remembering whether any failure was
 * due to injected fault (so later ereports can say so).  Returns the
 * zio_checksum_error() result (0 on success).
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Verify that the parity columns read from disk match freshly generated
 * parity; see raidz_parity_verify() below.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/* Save a copy of each parity column that was actually read. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	/* Regenerate parity from the (known-good) data columns. */
	vdev_raidz_generate_parity(rm);

	/* Any mismatch against what was read is a silent parity error. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = ECKSUM;
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	/* Number of parity columns found to be bad. */
	return (ret);
}

/*
 * Counters of successful reconstructions, indexed by the parity-column
 * bitmask ("code") returned from vdev_raidz_reconstruct().
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

/*
 * Return the most severe error among all columns of the map, per
 * zio_worst_error() ordering.
 */
static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Combinatorial reconstruction: when we don't know which columns hold
 * silently corrupt data, iterate over every combination of n candidate
 * columns (for n = 1 up to the available redundancy), reconstruct them,
 * and test the result against the block checksum.  Returns the parity
 * code of the successful reconstruction, or 0 if none succeeded.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* Room for the tgts[-1] and tgts[n] sentinels used below. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* Loop exhausted: n overshot by one, so only n-1 buffers exist. */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Complete a raidz I/O.  For reads this runs the three recovery phases
 * described in the body: validate data as read, re-read all columns, and
 * finally attempt combinatorial reconstruction.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally per-column results from the child I/Os. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM); /* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c <
		    rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * NOTE(review): a comment block was lost here in extraction;
		 * presumably it explained that partial-stripe write failures
		 * are tolerated up to the parity count -- confirm against
		 * upstream.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	} else if (zio->io_type == ZIO_TYPE_FREE) {
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		/* Re-enter the pipeline and issue reads for every untried
		 * column; io_done will be called again afterwards. */
		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = ECKSUM;

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

/*
 * Propagate child state changes up to the raidz vdev: faulted beyond
 * parity means can't open; any fault/degradation means degraded;
 * otherwise healthy.
 */
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/* Operations table for the raidz vdev type. */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};