/*
 * FreeBSD ZFS
 * The Zettabyte File System
 */
00001 /* 00002 * CDDL HEADER START 00003 * 00004 * The contents of this file are subject to the terms of the 00005 * Common Development and Distribution License (the "License"). 00006 * You may not use this file except in compliance with the License. 00007 * 00008 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 00009 * or http://www.opensolaris.org/os/licensing. 00010 * See the License for the specific language governing permissions 00011 * and limitations under the License. 00012 * 00013 * When distributing Covered Code, include this CDDL HEADER in each 00014 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 00015 * If applicable, add the following below this CDDL HEADER, with the 00016 * fields enclosed by brackets "[]" replaced with your own identifying 00017 * information: Portions Copyright [yyyy] [name of copyright owner] 00018 * 00019 * CDDL HEADER END 00020 */ 00021 /* 00022 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 00023 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 00024 * Copyright (c) 2012 by Delphix. All rights reserved. 00025 * Copyright (c) 2012, Joyent, Inc. All rights reserved. 00026 * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. 
00027 */ 00028 00029 #include <sys/dmu.h> 00030 #include <sys/dmu_impl.h> 00031 #include <sys/dmu_tx.h> 00032 #include <sys/dbuf.h> 00033 #include <sys/dnode.h> 00034 #include <sys/zfs_context.h> 00035 #include <sys/dmu_objset.h> 00036 #include <sys/dmu_traverse.h> 00037 #include <sys/dsl_dataset.h> 00038 #include <sys/dsl_dir.h> 00039 #include <sys/dsl_prop.h> 00040 #include <sys/dsl_pool.h> 00041 #include <sys/dsl_synctask.h> 00042 #include <sys/zfs_ioctl.h> 00043 #include <sys/zap.h> 00044 #include <sys/zio_checksum.h> 00045 #include <sys/zfs_znode.h> 00046 #include <zfs_fletcher.h> 00047 #include <sys/avl.h> 00048 #include <sys/ddt.h> 00049 #include <sys/zfs_onexit.h> 00050 00051 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 00052 int zfs_send_corrupt_data = B_FALSE; 00053 00054 static char *dmu_recv_tag = "dmu_recv_tag"; 00055 00056 static int 00057 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) 00058 { 00059 dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; 00060 struct uio auio; 00061 struct iovec aiov; 00062 ASSERT0(len % 8); 00063 00064 fletcher_4_incremental_native(buf, len, &dsp->dsa_zc); 00065 aiov.iov_base = buf; 00066 aiov.iov_len = len; 00067 auio.uio_iov = &aiov; 00068 auio.uio_iovcnt = 1; 00069 auio.uio_resid = len; 00070 auio.uio_segflg = UIO_SYSSPACE; 00071 auio.uio_rw = UIO_WRITE; 00072 auio.uio_offset = (off_t)-1; 00073 auio.uio_td = dsp->dsa_td; 00074 #ifdef _KERNEL 00075 if (dsp->dsa_fp->f_type == DTYPE_VNODE) 00076 bwillwrite(); 00077 dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, 00078 dsp->dsa_td); 00079 #else 00080 fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 00081 dsp->dsa_err = EOPNOTSUPP; 00082 #endif 00083 mutex_enter(&ds->ds_sendstream_lock); 00084 *dsp->dsa_off += len; 00085 mutex_exit(&ds->ds_sendstream_lock); 00086 00087 return (dsp->dsa_err); 00088 } 00089 00090 static int 00091 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 00092 uint64_t length) 
00093 { 00094 struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); 00095 00096 if (length != -1ULL && offset + length < offset) 00097 length = -1ULL; 00098 00099 /* 00100 * If there is a pending op, but it's not PENDING_FREE, push it out, 00101 * since free block aggregation can only be done for blocks of the 00102 * same type (i.e., DRR_FREE records can only be aggregated with 00103 * other DRR_FREE records. DRR_FREEOBJECTS records can only be 00104 * aggregated with other DRR_FREEOBJECTS records. 00105 */ 00106 if (dsp->dsa_pending_op != PENDING_NONE && 00107 dsp->dsa_pending_op != PENDING_FREE) { 00108 if (dump_bytes(dsp, dsp->dsa_drr, 00109 sizeof (dmu_replay_record_t)) != 0) 00110 return (EINTR); 00111 dsp->dsa_pending_op = PENDING_NONE; 00112 } 00113 00114 if (dsp->dsa_pending_op == PENDING_FREE) { 00115 /* 00116 * There should never be a PENDING_FREE if length is -1 00117 * (because dump_dnode is the only place where this 00118 * function is called with a -1, and only after flushing 00119 * any pending record). 00120 */ 00121 ASSERT(length != -1ULL); 00122 /* 00123 * Check to see whether this free block can be aggregated 00124 * with pending one. 00125 */ 00126 if (drrf->drr_object == object && drrf->drr_offset + 00127 drrf->drr_length == offset) { 00128 drrf->drr_length += length; 00129 return (0); 00130 } else { 00131 /* not a continuation. 
Push out pending record */ 00132 if (dump_bytes(dsp, dsp->dsa_drr, 00133 sizeof (dmu_replay_record_t)) != 0) 00134 return (EINTR); 00135 dsp->dsa_pending_op = PENDING_NONE; 00136 } 00137 } 00138 /* create a FREE record and make it pending */ 00139 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 00140 dsp->dsa_drr->drr_type = DRR_FREE; 00141 drrf->drr_object = object; 00142 drrf->drr_offset = offset; 00143 drrf->drr_length = length; 00144 drrf->drr_toguid = dsp->dsa_toguid; 00145 if (length == -1ULL) { 00146 if (dump_bytes(dsp, dsp->dsa_drr, 00147 sizeof (dmu_replay_record_t)) != 0) 00148 return (EINTR); 00149 } else { 00150 dsp->dsa_pending_op = PENDING_FREE; 00151 } 00152 00153 return (0); 00154 } 00155 00156 static int 00157 dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, 00158 uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) 00159 { 00160 struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); 00161 00162 00163 /* 00164 * If there is any kind of pending aggregation (currently either 00165 * a grouping of free objects or free blocks), push it out to 00166 * the stream, since aggregation can't be done across operations 00167 * of different types. 
00168 */ 00169 if (dsp->dsa_pending_op != PENDING_NONE) { 00170 if (dump_bytes(dsp, dsp->dsa_drr, 00171 sizeof (dmu_replay_record_t)) != 0) 00172 return (EINTR); 00173 dsp->dsa_pending_op = PENDING_NONE; 00174 } 00175 /* write a DATA record */ 00176 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 00177 dsp->dsa_drr->drr_type = DRR_WRITE; 00178 drrw->drr_object = object; 00179 drrw->drr_type = type; 00180 drrw->drr_offset = offset; 00181 drrw->drr_length = blksz; 00182 drrw->drr_toguid = dsp->dsa_toguid; 00183 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 00184 if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) 00185 drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 00186 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 00187 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 00188 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 00189 drrw->drr_key.ddk_cksum = bp->blk_cksum; 00190 00191 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 00192 return (EINTR); 00193 if (dump_bytes(dsp, data, blksz) != 0) 00194 return (EINTR); 00195 return (0); 00196 } 00197 00198 static int 00199 dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) 00200 { 00201 struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); 00202 00203 if (dsp->dsa_pending_op != PENDING_NONE) { 00204 if (dump_bytes(dsp, dsp->dsa_drr, 00205 sizeof (dmu_replay_record_t)) != 0) 00206 return (EINTR); 00207 dsp->dsa_pending_op = PENDING_NONE; 00208 } 00209 00210 /* write a SPILL record */ 00211 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 00212 dsp->dsa_drr->drr_type = DRR_SPILL; 00213 drrs->drr_object = object; 00214 drrs->drr_length = blksz; 00215 drrs->drr_toguid = dsp->dsa_toguid; 00216 00217 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) 00218 return (EINTR); 00219 if (dump_bytes(dsp, data, blksz)) 00220 return (EINTR); 00221 return (0); 00222 } 00223 00224 static int 00225 dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, 
uint64_t numobjs) 00226 { 00227 struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); 00228 00229 /* 00230 * If there is a pending op, but it's not PENDING_FREEOBJECTS, 00231 * push it out, since free block aggregation can only be done for 00232 * blocks of the same type (i.e., DRR_FREE records can only be 00233 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 00234 * can only be aggregated with other DRR_FREEOBJECTS records. 00235 */ 00236 if (dsp->dsa_pending_op != PENDING_NONE && 00237 dsp->dsa_pending_op != PENDING_FREEOBJECTS) { 00238 if (dump_bytes(dsp, dsp->dsa_drr, 00239 sizeof (dmu_replay_record_t)) != 0) 00240 return (EINTR); 00241 dsp->dsa_pending_op = PENDING_NONE; 00242 } 00243 if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { 00244 /* 00245 * See whether this free object array can be aggregated 00246 * with pending one 00247 */ 00248 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 00249 drrfo->drr_numobjs += numobjs; 00250 return (0); 00251 } else { 00252 /* can't be aggregated. 
Push out pending record */ 00253 if (dump_bytes(dsp, dsp->dsa_drr, 00254 sizeof (dmu_replay_record_t)) != 0) 00255 return (EINTR); 00256 dsp->dsa_pending_op = PENDING_NONE; 00257 } 00258 } 00259 00260 /* write a FREEOBJECTS record */ 00261 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 00262 dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; 00263 drrfo->drr_firstobj = firstobj; 00264 drrfo->drr_numobjs = numobjs; 00265 drrfo->drr_toguid = dsp->dsa_toguid; 00266 00267 dsp->dsa_pending_op = PENDING_FREEOBJECTS; 00268 00269 return (0); 00270 } 00271 00272 static int 00273 dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) 00274 { 00275 struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); 00276 00277 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 00278 return (dump_freeobjects(dsp, object, 1)); 00279 00280 if (dsp->dsa_pending_op != PENDING_NONE) { 00281 if (dump_bytes(dsp, dsp->dsa_drr, 00282 sizeof (dmu_replay_record_t)) != 0) 00283 return (EINTR); 00284 dsp->dsa_pending_op = PENDING_NONE; 00285 } 00286 00287 /* write an OBJECT record */ 00288 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 00289 dsp->dsa_drr->drr_type = DRR_OBJECT; 00290 drro->drr_object = object; 00291 drro->drr_type = dnp->dn_type; 00292 drro->drr_bonustype = dnp->dn_bonustype; 00293 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 00294 drro->drr_bonuslen = dnp->dn_bonuslen; 00295 drro->drr_checksumtype = dnp->dn_checksum; 00296 drro->drr_compress = dnp->dn_compress; 00297 drro->drr_toguid = dsp->dsa_toguid; 00298 00299 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 00300 return (EINTR); 00301 00302 if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) 00303 return (EINTR); 00304 00305 /* free anything past the end of the file */ 00306 if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * 00307 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) 00308 return (EINTR); 00309 if (dsp->dsa_err) 00310 return (EINTR); 00311 
return (0); 00312 } 00313 00314 #define BP_SPAN(dnp, level) \ 00315 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 00316 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 00317 00318 /* ARGSUSED */ 00319 static int 00320 backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, 00321 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 00322 { 00323 dmu_sendarg_t *dsp = arg; 00324 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 00325 int err = 0; 00326 00327 if (issig(JUSTLOOKING) && issig(FORREAL)) 00328 return (EINTR); 00329 00330 if (zb->zb_object != DMU_META_DNODE_OBJECT && 00331 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 00332 return (0); 00333 } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { 00334 uint64_t span = BP_SPAN(dnp, zb->zb_level); 00335 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 00336 err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); 00337 } else if (bp == NULL) { 00338 uint64_t span = BP_SPAN(dnp, zb->zb_level); 00339 err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); 00340 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 00341 return (0); 00342 } else if (type == DMU_OT_DNODE) { 00343 dnode_phys_t *blk; 00344 int i; 00345 int blksz = BP_GET_LSIZE(bp); 00346 uint32_t aflags = ARC_WAIT; 00347 arc_buf_t *abuf; 00348 00349 if (dsl_read(NULL, spa, bp, pbuf, 00350 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 00351 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) 00352 return (EIO); 00353 00354 blk = abuf->b_data; 00355 for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 00356 uint64_t dnobj = (zb->zb_blkid << 00357 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 00358 err = dump_dnode(dsp, dnobj, blk+i); 00359 if (err) 00360 break; 00361 } 00362 (void) arc_buf_remove_ref(abuf, &abuf); 00363 } else if (type == DMU_OT_SA) { 00364 uint32_t aflags = ARC_WAIT; 00365 arc_buf_t *abuf; 00366 int blksz = BP_GET_LSIZE(bp); 00367 00368 if (arc_read_nolock(NULL, spa, bp, 00369 
arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 00370 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) 00371 return (EIO); 00372 00373 err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); 00374 (void) arc_buf_remove_ref(abuf, &abuf); 00375 } else { /* it's a level-0 block of a regular object */ 00376 uint32_t aflags = ARC_WAIT; 00377 arc_buf_t *abuf; 00378 int blksz = BP_GET_LSIZE(bp); 00379 00380 if (dsl_read(NULL, spa, bp, pbuf, 00381 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 00382 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { 00383 if (zfs_send_corrupt_data) { 00384 /* Send a block filled with 0x"zfs badd bloc" */ 00385 abuf = arc_buf_alloc(spa, blksz, &abuf, 00386 ARC_BUFC_DATA); 00387 uint64_t *ptr; 00388 for (ptr = abuf->b_data; 00389 (char *)ptr < (char *)abuf->b_data + blksz; 00390 ptr++) 00391 *ptr = 0x2f5baddb10c; 00392 } else { 00393 return (EIO); 00394 } 00395 } 00396 00397 err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz, 00398 blksz, bp, abuf->b_data); 00399 (void) arc_buf_remove_ref(abuf, &abuf); 00400 } 00401 00402 ASSERT(err == 0 || err == EINTR); 00403 return (err); 00404 } 00405 00406 int 00407 dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, 00408 int outfd, struct file *fp, offset_t *off) 00409 { 00410 dsl_dataset_t *ds = tosnap->os_dsl_dataset; 00411 dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; 00412 dmu_replay_record_t *drr; 00413 dmu_sendarg_t *dsp; 00414 int err; 00415 uint64_t fromtxg = 0; 00416 00417 /* tosnap must be a snapshot */ 00418 if (ds->ds_phys->ds_next_snap_obj == 0) 00419 return (EINVAL); 00420 00421 /* fromsnap must be an earlier snapshot from the same fs as tosnap */ 00422 if (fromds && (ds->ds_dir != fromds->ds_dir || 00423 fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) 00424 return (EXDEV); 00425 00426 if (fromorigin) { 00427 dsl_pool_t *dp = ds->ds_dir->dd_pool; 00428 00429 if (fromsnap) 00430 return (EINVAL); 00431 00432 if (dsl_dir_is_clone(ds->ds_dir)) { 00433 rw_enter(&dp->dp_config_rwlock, RW_READER); 00434 err = dsl_dataset_hold_obj(dp, 00435 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); 00436 rw_exit(&dp->dp_config_rwlock); 00437 if (err) 00438 return (err); 00439 } else { 00440 fromorigin = B_FALSE; 00441 } 00442 } 00443 00444 00445 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 00446 drr->drr_type = DRR_BEGIN; 00447 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 00448 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 00449 DMU_SUBSTREAM); 00450 00451 #ifdef _KERNEL 00452 if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { 00453 uint64_t version; 00454 if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) { 00455 kmem_free(drr, sizeof (dmu_replay_record_t)); 00456 return (EINVAL); 00457 } 00458 if (version == ZPL_VERSION_SA) { 00459 DMU_SET_FEATUREFLAGS( 00460 drr->drr_u.drr_begin.drr_versioninfo, 00461 DMU_BACKUP_FEATURE_SA_SPILL); 00462 } 00463 } 00464 #endif 00465 00466 drr->drr_u.drr_begin.drr_creation_time = 00467 ds->ds_phys->ds_creation_time; 00468 drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; 00469 if (fromorigin) 00470 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 00471 drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; 00472 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 00473 drr->drr_u.drr_begin.drr_flags 
|= DRR_FLAG_CI_DATA; 00474 00475 if (fromds) 00476 drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; 00477 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 00478 00479 if (fromds) 00480 fromtxg = fromds->ds_phys->ds_creation_txg; 00481 if (fromorigin) 00482 dsl_dataset_rele(fromds, FTAG); 00483 00484 dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); 00485 00486 dsp->dsa_drr = drr; 00487 dsp->dsa_outfd = outfd; 00488 dsp->dsa_proc = curproc; 00489 dsp->dsa_td = curthread; 00490 dsp->dsa_fp = fp; 00491 dsp->dsa_os = tosnap; 00492 dsp->dsa_off = off; 00493 dsp->dsa_toguid = ds->ds_phys->ds_guid; 00494 ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); 00495 dsp->dsa_pending_op = PENDING_NONE; 00496 00497 mutex_enter(&ds->ds_sendstream_lock); 00498 list_insert_head(&ds->ds_sendstreams, dsp); 00499 mutex_exit(&ds->ds_sendstream_lock); 00500 00501 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 00502 err = dsp->dsa_err; 00503 goto out; 00504 } 00505 00506 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 00507 backup_cb, dsp); 00508 00509 if (dsp->dsa_pending_op != PENDING_NONE) 00510 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) 00511 err = EINTR; 00512 00513 if (err) { 00514 if (err == EINTR && dsp->dsa_err) 00515 err = dsp->dsa_err; 00516 goto out; 00517 } 00518 00519 bzero(drr, sizeof (dmu_replay_record_t)); 00520 drr->drr_type = DRR_END; 00521 drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; 00522 drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; 00523 00524 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 00525 err = dsp->dsa_err; 00526 goto out; 00527 } 00528 00529 out: 00530 mutex_enter(&ds->ds_sendstream_lock); 00531 list_remove(&ds->ds_sendstreams, dsp); 00532 mutex_exit(&ds->ds_sendstream_lock); 00533 00534 kmem_free(drr, sizeof (dmu_replay_record_t)); 00535 kmem_free(dsp, sizeof (dmu_sendarg_t)); 00536 00537 return (err); 00538 } 00539 00540 int 00541 dmu_send_estimate(objset_t 
*tosnap, objset_t *fromsnap, boolean_t fromorigin, 00542 uint64_t *sizep) 00543 { 00544 dsl_dataset_t *ds = tosnap->os_dsl_dataset; 00545 dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; 00546 dsl_pool_t *dp = ds->ds_dir->dd_pool; 00547 int err; 00548 uint64_t size; 00549 00550 /* tosnap must be a snapshot */ 00551 if (ds->ds_phys->ds_next_snap_obj == 0) 00552 return (EINVAL); 00553 00554 /* fromsnap must be an earlier snapshot from the same fs as tosnap */ 00555 if (fromds && (ds->ds_dir != fromds->ds_dir || 00556 fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) 00557 return (EXDEV); 00558 00559 if (fromorigin) { 00560 if (fromsnap) 00561 return (EINVAL); 00562 00563 if (dsl_dir_is_clone(ds->ds_dir)) { 00564 rw_enter(&dp->dp_config_rwlock, RW_READER); 00565 err = dsl_dataset_hold_obj(dp, 00566 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); 00567 rw_exit(&dp->dp_config_rwlock); 00568 if (err) 00569 return (err); 00570 } else { 00571 fromorigin = B_FALSE; 00572 } 00573 } 00574 00575 /* Get uncompressed size estimate of changed data. */ 00576 if (fromds == NULL) { 00577 size = ds->ds_phys->ds_uncompressed_bytes; 00578 } else { 00579 uint64_t used, comp; 00580 err = dsl_dataset_space_written(fromds, ds, 00581 &used, &comp, &size); 00582 if (fromorigin) 00583 dsl_dataset_rele(fromds, FTAG); 00584 if (err) 00585 return (err); 00586 } 00587 00588 /* 00589 * Assume that space (both on-disk and in-stream) is dominated by 00590 * data. We will adjust for indirect blocks and the copies property, 00591 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). 00592 */ 00593 00594 /* 00595 * Subtract out approximate space used by indirect blocks. 00596 * Assume most space is used by data blocks (non-indirect, non-dnode). 00597 * Assume all blocks are recordsize. Assume ditto blocks and 00598 * internal fragmentation counter out compression. 
00599 * 00600 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per 00601 * block, which we observe in practice. 00602 */ 00603 uint64_t recordsize; 00604 rw_enter(&dp->dp_config_rwlock, RW_READER); 00605 err = dsl_prop_get_ds(ds, "recordsize", 00606 sizeof (recordsize), 1, &recordsize, NULL); 00607 rw_exit(&dp->dp_config_rwlock); 00608 if (err) 00609 return (err); 00610 size -= size / recordsize * sizeof (blkptr_t); 00611 00612 /* Add in the space for the record associated with each block. */ 00613 size += size / recordsize * sizeof (dmu_replay_record_t); 00614 00615 *sizep = size; 00616 00617 return (0); 00618 } 00619 00620 struct recvbeginsyncarg { 00621 const char *tofs; 00622 const char *tosnap; 00623 dsl_dataset_t *origin; 00624 uint64_t fromguid; 00625 dmu_objset_type_t type; 00626 void *tag; 00627 boolean_t force; 00628 uint64_t dsflags; 00629 char clonelastname[MAXNAMELEN]; 00630 dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ 00631 cred_t *cr; 00632 }; 00633 00634 /* ARGSUSED */ 00635 static int 00636 recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) 00637 { 00638 dsl_dir_t *dd = arg1; 00639 struct recvbeginsyncarg *rbsa = arg2; 00640 objset_t *mos = dd->dd_pool->dp_meta_objset; 00641 uint64_t val; 00642 int err; 00643 00644 err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, 00645 strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); 00646 00647 if (err != ENOENT) 00648 return (err ? 
err : EEXIST); 00649 00650 if (rbsa->origin) { 00651 /* make sure it's a snap in the same pool */ 00652 if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) 00653 return (EXDEV); 00654 if (!dsl_dataset_is_snapshot(rbsa->origin)) 00655 return (EINVAL); 00656 if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) 00657 return (ENODEV); 00658 } 00659 00660 return (0); 00661 } 00662 00663 static void 00664 recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) 00665 { 00666 dsl_dir_t *dd = arg1; 00667 struct recvbeginsyncarg *rbsa = arg2; 00668 uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; 00669 uint64_t dsobj; 00670 00671 /* Create and open new dataset. */ 00672 dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, 00673 rbsa->origin, flags, rbsa->cr, tx); 00674 VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, 00675 B_TRUE, dmu_recv_tag, &rbsa->ds)); 00676 00677 if (rbsa->origin == NULL) { 00678 (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, 00679 rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); 00680 } 00681 00682 spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, 00683 dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); 00684 } 00685 00686 /* ARGSUSED */ 00687 static int 00688 recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) 00689 { 00690 dsl_dataset_t *ds = arg1; 00691 struct recvbeginsyncarg *rbsa = arg2; 00692 int err; 00693 uint64_t val; 00694 00695 /* must not have any changes since most recent snapshot */ 00696 if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) 00697 return (ETXTBSY); 00698 00699 /* new snapshot name must not exist */ 00700 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, 00701 ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); 00702 if (err == 0) 00703 return (EEXIST); 00704 if (err != ENOENT) 00705 return (err); 00706 00707 if (rbsa->fromguid) { 00708 /* if incremental, most recent snapshot must match fromguid */ 00709 if (ds->ds_prev == NULL) 00710 return (ENODEV); 00711 00712 /* 
00713 * most recent snapshot must match fromguid, or there are no 00714 * changes since the fromguid one 00715 */ 00716 if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { 00717 uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; 00718 uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; 00719 while (obj != 0) { 00720 dsl_dataset_t *snap; 00721 err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 00722 obj, FTAG, &snap); 00723 if (err) 00724 return (ENODEV); 00725 if (snap->ds_phys->ds_creation_txg < birth) { 00726 dsl_dataset_rele(snap, FTAG); 00727 return (ENODEV); 00728 } 00729 if (snap->ds_phys->ds_guid == rbsa->fromguid) { 00730 dsl_dataset_rele(snap, FTAG); 00731 break; /* it's ok */ 00732 } 00733 obj = snap->ds_phys->ds_prev_snap_obj; 00734 dsl_dataset_rele(snap, FTAG); 00735 } 00736 if (obj == 0) 00737 return (ENODEV); 00738 } 00739 } else { 00740 /* if full, most recent snapshot must be $ORIGIN */ 00741 if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) 00742 return (ENODEV); 00743 } 00744 00745 /* temporary clone name must not exist */ 00746 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, 00747 ds->ds_dir->dd_phys->dd_child_dir_zapobj, 00748 rbsa->clonelastname, 8, 1, &val); 00749 if (err == 0) 00750 return (EEXIST); 00751 if (err != ENOENT) 00752 return (err); 00753 00754 return (0); 00755 } 00756 00757 /* ARGSUSED */ 00758 static void 00759 recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) 00760 { 00761 dsl_dataset_t *ohds = arg1; 00762 struct recvbeginsyncarg *rbsa = arg2; 00763 dsl_pool_t *dp = ohds->ds_dir->dd_pool; 00764 dsl_dataset_t *cds; 00765 uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; 00766 uint64_t dsobj; 00767 00768 /* create and open the temporary clone */ 00769 dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, 00770 ohds->ds_prev, flags, rbsa->cr, tx); 00771 VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); 00772 00773 /* 00774 * If we actually created a non-clone, we need 
to create the 00775 * objset in our new dataset. 00776 */ 00777 if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { 00778 (void) dmu_objset_create_impl(dp->dp_spa, 00779 cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); 00780 } 00781 00782 rbsa->ds = cds; 00783 00784 spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, 00785 dp->dp_spa, tx, "dataset = %lld", dsobj); 00786 } 00787 00788 static boolean_t 00789 dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) 00790 { 00791 int featureflags; 00792 00793 featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 00794 00795 /* Verify pool version supports SA if SA_SPILL feature set */ 00796 return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && 00797 (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); 00798 } 00799 00804 int 00805 dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, 00806 boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) 00807 { 00808 int err = 0; 00809 boolean_t byteswap; 00810 struct recvbeginsyncarg rbsa = { 0 }; 00811 uint64_t versioninfo; 00812 int flags; 00813 dsl_dataset_t *ds; 00814 00815 if (drrb->drr_magic == DMU_BACKUP_MAGIC) 00816 byteswap = FALSE; 00817 else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 00818 byteswap = TRUE; 00819 else 00820 return (EINVAL); 00821 00822 rbsa.tofs = tofs; 00823 rbsa.tosnap = tosnap; 00824 rbsa.origin = origin ? 
origin->os_dsl_dataset : NULL; 00825 rbsa.fromguid = drrb->drr_fromguid; 00826 rbsa.type = drrb->drr_type; 00827 rbsa.tag = FTAG; 00828 rbsa.dsflags = 0; 00829 rbsa.cr = CRED(); 00830 versioninfo = drrb->drr_versioninfo; 00831 flags = drrb->drr_flags; 00832 00833 if (byteswap) { 00834 rbsa.type = BSWAP_32(rbsa.type); 00835 rbsa.fromguid = BSWAP_64(rbsa.fromguid); 00836 versioninfo = BSWAP_64(versioninfo); 00837 flags = BSWAP_32(flags); 00838 } 00839 00840 if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || 00841 rbsa.type >= DMU_OST_NUMTYPES || 00842 ((flags & DRR_FLAG_CLONE) && origin == NULL)) 00843 return (EINVAL); 00844 00845 if (flags & DRR_FLAG_CI_DATA) 00846 rbsa.dsflags = DS_FLAG_CI_DATASET; 00847 00848 bzero(drc, sizeof (dmu_recv_cookie_t)); 00849 drc->drc_drrb = drrb; 00850 drc->drc_tosnap = tosnap; 00851 drc->drc_top_ds = top_ds; 00852 drc->drc_force = force; 00853 00854 /* 00855 * Process the begin in syncing context. 00856 */ 00857 00858 /* open the dataset we are logically receiving into */ 00859 err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); 00860 if (err == 0) { 00861 if (dmu_recv_verify_features(ds, drrb)) { 00862 dsl_dataset_rele(ds, dmu_recv_tag); 00863 return (ENOTSUP); 00864 } 00865 /* target fs already exists; recv into temp clone */ 00866 00867 /* Can't recv a clone into an existing fs */ 00868 if (flags & DRR_FLAG_CLONE) { 00869 dsl_dataset_rele(ds, dmu_recv_tag); 00870 return (EINVAL); 00871 } 00872 00873 /* must not have an incremental recv already in progress */ 00874 if (!mutex_tryenter(&ds->ds_recvlock)) { 00875 dsl_dataset_rele(ds, dmu_recv_tag); 00876 return (EBUSY); 00877 } 00878 00879 /* tmp clone name is: tofs/%tosnap" */ 00880 (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), 00881 "%%%s", tosnap); 00882 rbsa.force = force; 00883 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 00884 recv_existing_check, recv_existing_sync, ds, &rbsa, 5); 00885 if (err) { 00886 mutex_exit(&ds->ds_recvlock); 00887 
dsl_dataset_rele(ds, dmu_recv_tag); 00888 return (err); 00889 } 00890 drc->drc_logical_ds = ds; 00891 drc->drc_real_ds = rbsa.ds; 00892 } else if (err == ENOENT) { 00893 /* target fs does not exist; must be a full backup or clone */ 00894 char *cp; 00895 00896 /* 00897 * If it's a non-clone incremental, we are missing the 00898 * target fs, so fail the recv. 00899 */ 00900 if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) 00901 return (ENOENT); 00902 00903 /* Open the parent of tofs */ 00904 cp = strrchr(tofs, '/'); 00905 *cp = '\0'; 00906 err = dsl_dataset_hold(tofs, FTAG, &ds); 00907 *cp = '/'; 00908 if (err) 00909 return (err); 00910 00911 if (dmu_recv_verify_features(ds, drrb)) { 00912 dsl_dataset_rele(ds, FTAG); 00913 return (ENOTSUP); 00914 } 00915 00916 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 00917 recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); 00918 dsl_dataset_rele(ds, FTAG); 00919 if (err) 00920 return (err); 00921 drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; 00922 drc->drc_newfs = B_TRUE; 00923 } 00924 00925 return (err); 00926 } 00927 00928 struct restorearg { 00929 int err; 00930 int byteswap; 00931 kthread_t *td; 00932 struct file *fp; 00933 char *buf; 00934 uint64_t voff; 00935 int bufsize; /* amount of memory allocated for buf */ 00936 zio_cksum_t cksum; 00937 avl_tree_t *guid_to_ds_map; 00938 }; 00939 00940 typedef struct guid_map_entry { 00941 uint64_t guid; 00942 dsl_dataset_t *gme_ds; 00943 avl_node_t avlnode; 00944 } guid_map_entry_t; 00945 00946 static int 00947 guid_compare(const void *arg1, const void *arg2) 00948 { 00949 const guid_map_entry_t *gmep1 = arg1; 00950 const guid_map_entry_t *gmep2 = arg2; 00951 00952 if (gmep1->guid < gmep2->guid) 00953 return (-1); 00954 else if (gmep1->guid > gmep2->guid) 00955 return (1); 00956 return (0); 00957 } 00958 00959 static void 00960 free_guid_map_onexit(void *arg) 00961 { 00962 avl_tree_t *ca = arg; 00963 void *cookie = NULL; 00964 guid_map_entry_t *gmep; 00965 00966 while ((gmep = 
avl_destroy_nodes(ca, &cookie)) != NULL) { 00967 dsl_dataset_rele(gmep->gme_ds, ca); 00968 kmem_free(gmep, sizeof (guid_map_entry_t)); 00969 } 00970 avl_destroy(ca); 00971 kmem_free(ca, sizeof (avl_tree_t)); 00972 } 00973 00974 static int 00975 restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) 00976 { 00977 struct uio auio; 00978 struct iovec aiov; 00979 int error; 00980 00981 aiov.iov_base = buf; 00982 aiov.iov_len = len; 00983 auio.uio_iov = &aiov; 00984 auio.uio_iovcnt = 1; 00985 auio.uio_resid = len; 00986 auio.uio_segflg = UIO_SYSSPACE; 00987 auio.uio_rw = UIO_READ; 00988 auio.uio_offset = off; 00989 auio.uio_td = ra->td; 00990 #ifdef _KERNEL 00991 error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); 00992 #else 00993 fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 00994 error = EOPNOTSUPP; 00995 #endif 00996 *resid = auio.uio_resid; 00997 return (error); 00998 } 00999 01000 static void * 01001 restore_read(struct restorearg *ra, int len) 01002 { 01003 void *rv; 01004 int done = 0; 01005 01006 /* some things will require 8-byte alignment, so everything must */ 01007 ASSERT0(len % 8); 01008 01009 while (done < len) { 01010 ssize_t resid; 01011 01012 ra->err = restore_bytes(ra, (caddr_t)ra->buf + done, 01013 len - done, ra->voff, &resid); 01014 01015 if (resid == len - done) 01016 ra->err = EINVAL; 01017 ra->voff += len - done - resid; 01018 done = len - resid; 01019 if (ra->err) 01020 return (NULL); 01021 } 01022 01023 ASSERT3U(done, ==, len); 01024 rv = ra->buf; 01025 if (ra->byteswap) 01026 fletcher_4_incremental_byteswap(rv, len, &ra->cksum); 01027 else 01028 fletcher_4_incremental_native(rv, len, &ra->cksum); 01029 return (rv); 01030 } 01031 01032 static void 01033 backup_byteswap(dmu_replay_record_t *drr) 01034 { 01035 #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 01036 #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 01037 drr->drr_type = BSWAP_32(drr->drr_type); 01038 
drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 01039 switch (drr->drr_type) { 01040 case DRR_BEGIN: 01041 DO64(drr_begin.drr_magic); 01042 DO64(drr_begin.drr_versioninfo); 01043 DO64(drr_begin.drr_creation_time); 01044 DO32(drr_begin.drr_type); 01045 DO32(drr_begin.drr_flags); 01046 DO64(drr_begin.drr_toguid); 01047 DO64(drr_begin.drr_fromguid); 01048 break; 01049 case DRR_OBJECT: 01050 DO64(drr_object.drr_object); 01051 /* DO64(drr_object.drr_allocation_txg); */ 01052 DO32(drr_object.drr_type); 01053 DO32(drr_object.drr_bonustype); 01054 DO32(drr_object.drr_blksz); 01055 DO32(drr_object.drr_bonuslen); 01056 DO64(drr_object.drr_toguid); 01057 break; 01058 case DRR_FREEOBJECTS: 01059 DO64(drr_freeobjects.drr_firstobj); 01060 DO64(drr_freeobjects.drr_numobjs); 01061 DO64(drr_freeobjects.drr_toguid); 01062 break; 01063 case DRR_WRITE: 01064 DO64(drr_write.drr_object); 01065 DO32(drr_write.drr_type); 01066 DO64(drr_write.drr_offset); 01067 DO64(drr_write.drr_length); 01068 DO64(drr_write.drr_toguid); 01069 DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); 01070 DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); 01071 DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); 01072 DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); 01073 DO64(drr_write.drr_key.ddk_prop); 01074 break; 01075 case DRR_WRITE_BYREF: 01076 DO64(drr_write_byref.drr_object); 01077 DO64(drr_write_byref.drr_offset); 01078 DO64(drr_write_byref.drr_length); 01079 DO64(drr_write_byref.drr_toguid); 01080 DO64(drr_write_byref.drr_refguid); 01081 DO64(drr_write_byref.drr_refobject); 01082 DO64(drr_write_byref.drr_refoffset); 01083 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); 01084 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); 01085 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); 01086 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); 01087 DO64(drr_write_byref.drr_key.ddk_prop); 01088 break; 01089 case DRR_FREE: 01090 DO64(drr_free.drr_object); 01091 DO64(drr_free.drr_offset); 01092 
DO64(drr_free.drr_length); 01093 DO64(drr_free.drr_toguid); 01094 break; 01095 case DRR_SPILL: 01096 DO64(drr_spill.drr_object); 01097 DO64(drr_spill.drr_length); 01098 DO64(drr_spill.drr_toguid); 01099 break; 01100 case DRR_END: 01101 DO64(drr_end.drr_checksum.zc_word[0]); 01102 DO64(drr_end.drr_checksum.zc_word[1]); 01103 DO64(drr_end.drr_checksum.zc_word[2]); 01104 DO64(drr_end.drr_checksum.zc_word[3]); 01105 DO64(drr_end.drr_toguid); 01106 break; 01107 } 01108 #undef DO64 01109 #undef DO32 01110 } 01111 01112 static int 01113 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) 01114 { 01115 int err; 01116 dmu_tx_t *tx; 01117 void *data = NULL; 01118 01119 if (drro->drr_type == DMU_OT_NONE || 01120 !DMU_OT_IS_VALID(drro->drr_type) || 01121 !DMU_OT_IS_VALID(drro->drr_bonustype) || 01122 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 01123 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 01124 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 01125 drro->drr_blksz < SPA_MINBLOCKSIZE || 01126 drro->drr_blksz > SPA_MAXBLOCKSIZE || 01127 drro->drr_bonuslen > DN_MAX_BONUSLEN) { 01128 return (EINVAL); 01129 } 01130 01131 err = dmu_object_info(os, drro->drr_object, NULL); 01132 01133 if (err != 0 && err != ENOENT) 01134 return (EINVAL); 01135 01136 if (drro->drr_bonuslen) { 01137 data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); 01138 if (ra->err) 01139 return (ra->err); 01140 } 01141 01142 if (err == ENOENT) { 01143 /* currently free, want to be allocated */ 01144 tx = dmu_tx_create(os); 01145 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 01146 err = dmu_tx_assign(tx, TXG_WAIT); 01147 if (err) { 01148 dmu_tx_abort(tx); 01149 return (err); 01150 } 01151 err = dmu_object_claim(os, drro->drr_object, 01152 drro->drr_type, drro->drr_blksz, 01153 drro->drr_bonustype, drro->drr_bonuslen, tx); 01154 dmu_tx_commit(tx); 01155 } else { 01156 /* currently allocated, want to be allocated */ 01157 err = dmu_object_reclaim(os, drro->drr_object, 01158 
drro->drr_type, drro->drr_blksz, 01159 drro->drr_bonustype, drro->drr_bonuslen); 01160 } 01161 if (err) { 01162 return (EINVAL); 01163 } 01164 01165 tx = dmu_tx_create(os); 01166 dmu_tx_hold_bonus(tx, drro->drr_object); 01167 err = dmu_tx_assign(tx, TXG_WAIT); 01168 if (err) { 01169 dmu_tx_abort(tx); 01170 return (err); 01171 } 01172 01173 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, 01174 tx); 01175 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); 01176 01177 if (data != NULL) { 01178 dmu_buf_t *db; 01179 01180 VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); 01181 dmu_buf_will_dirty(db, tx); 01182 01183 ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 01184 bcopy(data, db->db_data, drro->drr_bonuslen); 01185 if (ra->byteswap) { 01186 dmu_object_byteswap_t byteswap = 01187 DMU_OT_BYTESWAP(drro->drr_bonustype); 01188 dmu_ot_byteswap[byteswap].ob_func(db->db_data, 01189 drro->drr_bonuslen); 01190 } 01191 dmu_buf_rele(db, FTAG); 01192 } 01193 dmu_tx_commit(tx); 01194 return (0); 01195 } 01196 01197 /* ARGSUSED */ 01198 static int 01199 restore_freeobjects(struct restorearg *ra, objset_t *os, 01200 struct drr_freeobjects *drrfo) 01201 { 01202 uint64_t obj; 01203 01204 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 01205 return (EINVAL); 01206 01207 for (obj = drrfo->drr_firstobj; 01208 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 01209 (void) dmu_object_next(os, &obj, FALSE, 0)) { 01210 int err; 01211 01212 if (dmu_object_info(os, obj, NULL) != 0) 01213 continue; 01214 01215 err = dmu_free_object(os, obj); 01216 if (err) 01217 return (err); 01218 } 01219 return (0); 01220 } 01221 01222 static int 01223 restore_write(struct restorearg *ra, objset_t *os, 01224 struct drr_write *drrw) 01225 { 01226 dmu_tx_t *tx; 01227 void *data; 01228 int err; 01229 01230 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 01231 !DMU_OT_IS_VALID(drrw->drr_type)) 01232 return (EINVAL); 01233 01234 
data = restore_read(ra, drrw->drr_length); 01235 if (data == NULL) 01236 return (ra->err); 01237 01238 if (dmu_object_info(os, drrw->drr_object, NULL) != 0) 01239 return (EINVAL); 01240 01241 tx = dmu_tx_create(os); 01242 01243 dmu_tx_hold_write(tx, drrw->drr_object, 01244 drrw->drr_offset, drrw->drr_length); 01245 err = dmu_tx_assign(tx, TXG_WAIT); 01246 if (err) { 01247 dmu_tx_abort(tx); 01248 return (err); 01249 } 01250 if (ra->byteswap) { 01251 dmu_object_byteswap_t byteswap = 01252 DMU_OT_BYTESWAP(drrw->drr_type); 01253 dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); 01254 } 01255 dmu_write(os, drrw->drr_object, 01256 drrw->drr_offset, drrw->drr_length, data, tx); 01257 dmu_tx_commit(tx); 01258 return (0); 01259 } 01260 01268 static int 01269 restore_write_byref(struct restorearg *ra, objset_t *os, 01270 struct drr_write_byref *drrwbr) 01271 { 01272 dmu_tx_t *tx; 01273 int err; 01274 guid_map_entry_t gmesrch; 01275 guid_map_entry_t *gmep; 01276 avl_index_t where; 01277 objset_t *ref_os = NULL; 01278 dmu_buf_t *dbp; 01279 01280 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 01281 return (EINVAL); 01282 01283 /* 01284 * If the GUID of the referenced dataset is different from the 01285 * GUID of the target dataset, find the referenced dataset. 
01286 */ 01287 if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 01288 gmesrch.guid = drrwbr->drr_refguid; 01289 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 01290 &where)) == NULL) { 01291 return (EINVAL); 01292 } 01293 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 01294 return (EINVAL); 01295 } else { 01296 ref_os = os; 01297 } 01298 01299 if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 01300 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH)) 01301 return (err); 01302 01303 tx = dmu_tx_create(os); 01304 01305 dmu_tx_hold_write(tx, drrwbr->drr_object, 01306 drrwbr->drr_offset, drrwbr->drr_length); 01307 err = dmu_tx_assign(tx, TXG_WAIT); 01308 if (err) { 01309 dmu_tx_abort(tx); 01310 return (err); 01311 } 01312 dmu_write(os, drrwbr->drr_object, 01313 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 01314 dmu_buf_rele(dbp, FTAG); 01315 dmu_tx_commit(tx); 01316 return (0); 01317 } 01318 01319 static int 01320 restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) 01321 { 01322 dmu_tx_t *tx; 01323 void *data; 01324 dmu_buf_t *db, *db_spill; 01325 int err; 01326 01327 if (drrs->drr_length < SPA_MINBLOCKSIZE || 01328 drrs->drr_length > SPA_MAXBLOCKSIZE) 01329 return (EINVAL); 01330 01331 data = restore_read(ra, drrs->drr_length); 01332 if (data == NULL) 01333 return (ra->err); 01334 01335 if (dmu_object_info(os, drrs->drr_object, NULL) != 0) 01336 return (EINVAL); 01337 01338 VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); 01339 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 01340 dmu_buf_rele(db, FTAG); 01341 return (err); 01342 } 01343 01344 tx = dmu_tx_create(os); 01345 01346 dmu_tx_hold_spill(tx, db->db_object); 01347 01348 err = dmu_tx_assign(tx, TXG_WAIT); 01349 if (err) { 01350 dmu_buf_rele(db, FTAG); 01351 dmu_buf_rele(db_spill, FTAG); 01352 dmu_tx_abort(tx); 01353 return (err); 01354 } 01355 dmu_buf_will_dirty(db_spill, tx); 01356 01357 if (db_spill->db_size < drrs->drr_length) 01358 
VERIFY(0 == dbuf_spill_set_blksz(db_spill, 01359 drrs->drr_length, tx)); 01360 bcopy(data, db_spill->db_data, drrs->drr_length); 01361 01362 dmu_buf_rele(db, FTAG); 01363 dmu_buf_rele(db_spill, FTAG); 01364 01365 dmu_tx_commit(tx); 01366 return (0); 01367 } 01368 01369 /* ARGSUSED */ 01370 static int 01371 restore_free(struct restorearg *ra, objset_t *os, 01372 struct drr_free *drrf) 01373 { 01374 int err; 01375 01376 if (drrf->drr_length != -1ULL && 01377 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 01378 return (EINVAL); 01379 01380 if (dmu_object_info(os, drrf->drr_object, NULL) != 0) 01381 return (EINVAL); 01382 01383 err = dmu_free_long_range(os, drrf->drr_object, 01384 drrf->drr_offset, drrf->drr_length); 01385 return (err); 01386 } 01387 01391 int 01392 dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, 01393 int cleanup_fd, uint64_t *action_handlep) 01394 { 01395 struct restorearg ra = { 0 }; 01396 dmu_replay_record_t *drr; 01397 objset_t *os; 01398 zio_cksum_t pcksum; 01399 int featureflags; 01400 01401 if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 01402 ra.byteswap = TRUE; 01403 01404 { 01405 /* compute checksum of drr_begin record */ 01406 dmu_replay_record_t *drr; 01407 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 01408 01409 drr->drr_type = DRR_BEGIN; 01410 drr->drr_u.drr_begin = *drc->drc_drrb; 01411 if (ra.byteswap) { 01412 fletcher_4_incremental_byteswap(drr, 01413 sizeof (dmu_replay_record_t), &ra.cksum); 01414 } else { 01415 fletcher_4_incremental_native(drr, 01416 sizeof (dmu_replay_record_t), &ra.cksum); 01417 } 01418 kmem_free(drr, sizeof (dmu_replay_record_t)); 01419 } 01420 01421 if (ra.byteswap) { 01422 struct drr_begin *drrb = drc->drc_drrb; 01423 drrb->drr_magic = BSWAP_64(drrb->drr_magic); 01424 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 01425 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 01426 drrb->drr_type = BSWAP_32(drrb->drr_type); 01427 
drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 01428 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 01429 } 01430 01431 ra.td = curthread; 01432 ra.fp = fp; 01433 ra.voff = *voffp; 01434 ra.bufsize = 1<<20; 01435 ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); 01436 01437 /* these were verified in dmu_recv_begin */ 01438 ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == 01439 DMU_SUBSTREAM); 01440 ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); 01441 01442 /* 01443 * Open the objset we are modifying. 01444 */ 01445 VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); 01446 01447 ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); 01448 01449 featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); 01450 01451 /* if this stream is dedup'ed, set up the avl tree for guid mapping */ 01452 if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { 01453 minor_t minor; 01454 01455 if (cleanup_fd == -1) { 01456 ra.err = EBADF; 01457 goto out; 01458 } 01459 ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); 01460 if (ra.err) { 01461 cleanup_fd = -1; 01462 goto out; 01463 } 01464 01465 if (*action_handlep == 0) { 01466 ra.guid_to_ds_map = 01467 kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 01468 avl_create(ra.guid_to_ds_map, guid_compare, 01469 sizeof (guid_map_entry_t), 01470 offsetof(guid_map_entry_t, avlnode)); 01471 ra.err = zfs_onexit_add_cb(minor, 01472 free_guid_map_onexit, ra.guid_to_ds_map, 01473 action_handlep); 01474 if (ra.err) 01475 goto out; 01476 } else { 01477 ra.err = zfs_onexit_cb_data(minor, *action_handlep, 01478 (void **)&ra.guid_to_ds_map); 01479 if (ra.err) 01480 goto out; 01481 } 01482 01483 drc->drc_guid_to_ds_map = ra.guid_to_ds_map; 01484 } 01485 01486 /* 01487 * Read records and process them. 
01488 */ 01489 pcksum = ra.cksum; 01490 while (ra.err == 0 && 01491 NULL != (drr = restore_read(&ra, sizeof (*drr)))) { 01492 if (issig(JUSTLOOKING) && issig(FORREAL)) { 01493 ra.err = EINTR; 01494 goto out; 01495 } 01496 01497 if (ra.byteswap) 01498 backup_byteswap(drr); 01499 01500 switch (drr->drr_type) { 01501 case DRR_OBJECT: 01502 { 01503 /* 01504 * We need to make a copy of the record header, 01505 * because restore_{object,write} may need to 01506 * restore_read(), which will invalidate drr. 01507 */ 01508 struct drr_object drro = drr->drr_u.drr_object; 01509 ra.err = restore_object(&ra, os, &drro); 01510 break; 01511 } 01512 case DRR_FREEOBJECTS: 01513 { 01514 struct drr_freeobjects drrfo = 01515 drr->drr_u.drr_freeobjects; 01516 ra.err = restore_freeobjects(&ra, os, &drrfo); 01517 break; 01518 } 01519 case DRR_WRITE: 01520 { 01521 struct drr_write drrw = drr->drr_u.drr_write; 01522 ra.err = restore_write(&ra, os, &drrw); 01523 break; 01524 } 01525 case DRR_WRITE_BYREF: 01526 { 01527 struct drr_write_byref drrwbr = 01528 drr->drr_u.drr_write_byref; 01529 ra.err = restore_write_byref(&ra, os, &drrwbr); 01530 break; 01531 } 01532 case DRR_FREE: 01533 { 01534 struct drr_free drrf = drr->drr_u.drr_free; 01535 ra.err = restore_free(&ra, os, &drrf); 01536 break; 01537 } 01538 case DRR_END: 01539 { 01540 struct drr_end drre = drr->drr_u.drr_end; 01541 /* 01542 * We compare against the *previous* checksum 01543 * value, because the stored checksum is of 01544 * everything before the DRR_END record. 
01545 */ 01546 if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) 01547 ra.err = ECKSUM; 01548 goto out; 01549 } 01550 case DRR_SPILL: 01551 { 01552 struct drr_spill drrs = drr->drr_u.drr_spill; 01553 ra.err = restore_spill(&ra, os, &drrs); 01554 break; 01555 } 01556 default: 01557 ra.err = EINVAL; 01558 goto out; 01559 } 01560 pcksum = ra.cksum; 01561 } 01562 ASSERT(ra.err != 0); 01563 01564 out: 01565 if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) 01566 zfs_onexit_fd_rele(cleanup_fd); 01567 01568 if (ra.err != 0) { 01569 /* 01570 * destroy what we created, so we don't leave it in the 01571 * inconsistent restoring state. 01572 */ 01573 txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); 01574 01575 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, 01576 B_FALSE); 01577 if (drc->drc_real_ds != drc->drc_logical_ds) { 01578 mutex_exit(&drc->drc_logical_ds->ds_recvlock); 01579 dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); 01580 } 01581 } 01582 01583 kmem_free(ra.buf, ra.bufsize); 01584 *voffp = ra.voff; 01585 return (ra.err); 01586 } 01587 01588 struct recvendsyncarg { 01589 char *tosnap; 01590 uint64_t creation_time; 01591 uint64_t toguid; 01592 }; 01593 01594 static int 01595 recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) 01596 { 01597 dsl_dataset_t *ds = arg1; 01598 struct recvendsyncarg *resa = arg2; 01599 01600 return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); 01601 } 01602 01603 static void 01604 recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) 01605 { 01606 dsl_dataset_t *ds = arg1; 01607 struct recvendsyncarg *resa = arg2; 01608 01609 dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); 01610 01611 /* set snapshot's creation time and guid */ 01612 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 01613 ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; 01614 ds->ds_prev->ds_phys->ds_guid = resa->toguid; 01615 ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 01616 01617 
dmu_buf_will_dirty(ds->ds_dbuf, tx); 01618 ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 01619 } 01620 01621 static int 01622 add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) 01623 { 01624 dsl_pool_t *dp = ds->ds_dir->dd_pool; 01625 uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj; 01626 dsl_dataset_t *snapds; 01627 guid_map_entry_t *gmep; 01628 int err; 01629 01630 ASSERT(guid_map != NULL); 01631 01632 rw_enter(&dp->dp_config_rwlock, RW_READER); 01633 err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds); 01634 if (err == 0) { 01635 gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); 01636 gmep->guid = snapds->ds_phys->ds_guid; 01637 gmep->gme_ds = snapds; 01638 avl_add(guid_map, gmep); 01639 } 01640 01641 rw_exit(&dp->dp_config_rwlock); 01642 return (err); 01643 } 01644 01645 static int 01646 dmu_recv_existing_end(dmu_recv_cookie_t *drc) 01647 { 01648 struct recvendsyncarg resa; 01649 dsl_dataset_t *ds = drc->drc_logical_ds; 01650 int err, myerr; 01651 01652 if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { 01653 err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, 01654 drc->drc_force); 01655 if (err) 01656 goto out; 01657 } else { 01658 mutex_exit(&ds->ds_recvlock); 01659 dsl_dataset_rele(ds, dmu_recv_tag); 01660 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, 01661 B_FALSE); 01662 return (EBUSY); 01663 } 01664 01665 resa.creation_time = drc->drc_drrb->drr_creation_time; 01666 resa.toguid = drc->drc_drrb->drr_toguid; 01667 resa.tosnap = drc->drc_tosnap; 01668 01669 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 01670 recv_end_check, recv_end_sync, ds, &resa, 3); 01671 if (err) { 01672 /* swap back */ 01673 (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); 01674 } 01675 01676 out: 01677 mutex_exit(&ds->ds_recvlock); 01678 if (err == 0 && drc->drc_guid_to_ds_map != NULL) 01679 (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); 01680 dsl_dataset_disown(ds, dmu_recv_tag); 01681 myerr = dsl_dataset_destroy(drc->drc_real_ds, 
dmu_recv_tag, B_FALSE); 01682 ASSERT0(myerr); 01683 return (err); 01684 } 01685 01686 static int 01687 dmu_recv_new_end(dmu_recv_cookie_t *drc) 01688 { 01689 struct recvendsyncarg resa; 01690 dsl_dataset_t *ds = drc->drc_logical_ds; 01691 int err; 01692 01693 /* 01694 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() 01695 * expects it to have a ds_user_ptr (and zil), but clone_swap() 01696 * can close it. 01697 */ 01698 txg_wait_synced(ds->ds_dir->dd_pool, 0); 01699 01700 resa.creation_time = drc->drc_drrb->drr_creation_time; 01701 resa.toguid = drc->drc_drrb->drr_toguid; 01702 resa.tosnap = drc->drc_tosnap; 01703 01704 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 01705 recv_end_check, recv_end_sync, ds, &resa, 3); 01706 if (err) { 01707 /* clean up the fs we just recv'd into */ 01708 (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); 01709 } else { 01710 if (drc->drc_guid_to_ds_map != NULL) 01711 (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); 01712 /* release the hold from dmu_recv_begin */ 01713 dsl_dataset_disown(ds, dmu_recv_tag); 01714 } 01715 return (err); 01716 } 01717 01718 int 01719 dmu_recv_end(dmu_recv_cookie_t *drc) 01720 { 01721 if (drc->drc_logical_ds != drc->drc_real_ds) 01722 return (dmu_recv_existing_end(drc)); 01723 else 01724 return (dmu_recv_new_end(drc)); 01725 }