/*
 * FreeBSD ZFS
 * The Zettabyte File System
 */
00001 /* 00002 * CDDL HEADER START 00003 * 00004 * The contents of this file are subject to the terms of the 00005 * Common Development and Distribution License (the "License"). 00006 * You may not use this file except in compliance with the License. 00007 * 00008 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 00009 * or http://www.opensolaris.org/os/licensing. 00010 * See the License for the specific language governing permissions 00011 * and limitations under the License. 00012 * 00013 * When distributing Covered Code, include this CDDL HEADER in each 00014 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 00015 * If applicable, add the following below this CDDL HEADER, with the 00016 * fields enclosed by brackets "[]" replaced with your own identifying 00017 * information: Portions Copyright [yyyy] [name of copyright owner] 00018 * 00019 * CDDL HEADER END 00020 */ 00021 00022 /* 00023 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 00024 * Copyright (c) 2011 by Delphix. All rights reserved. 
00025 */ 00026 00027 #include <sys/spa.h> 00028 #include <sys/spa_impl.h> 00029 #include <sys/zap.h> 00030 #include <sys/dsl_synctask.h> 00031 #include <sys/dmu_tx.h> 00032 #include <sys/dmu_objset.h> 00033 #include <sys/utsname.h> 00034 #include <sys/sunddi.h> 00035 #include "zfs_comutil.h" 00036 #ifdef _KERNEL 00037 #include <sys/cmn_err.h> 00038 #include <sys/zone.h> 00039 #endif 00040 00074 static uint64_t 00075 spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) 00076 { 00077 uint64_t phys_len; 00078 00079 phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; 00080 return ((log_off - shpp->sh_pool_create_len) % phys_len 00081 + shpp->sh_pool_create_len); 00082 } 00083 00084 void 00085 spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) 00086 { 00087 dmu_buf_t *dbp; 00088 spa_history_phys_t *shpp; 00089 objset_t *mos = spa->spa_meta_objset; 00090 00091 ASSERT(spa->spa_history == 0); 00092 spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, 00093 SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, 00094 sizeof (spa_history_phys_t), tx); 00095 00096 VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, 00097 DMU_POOL_HISTORY, sizeof (uint64_t), 1, 00098 &spa->spa_history, tx) == 0); 00099 00100 VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); 00101 ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); 00102 00103 shpp = dbp->db_data; 00104 dmu_buf_will_dirty(dbp, tx); 00105 00106 /* 00107 * Figure out maximum size of history log. We set it at 00108 * 0.1% of pool size, with a max of 1G and min of 128KB. 
00109 */ 00110 shpp->sh_phys_max_off = 00111 metaslab_class_get_dspace(spa_normal_class(spa)) / 1000; 00112 shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30); 00113 shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); 00114 00115 dmu_buf_rele(dbp, FTAG); 00116 } 00117 00121 static int 00122 spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) 00123 { 00124 objset_t *mos = spa->spa_meta_objset; 00125 uint64_t firstread, reclen, phys_bof; 00126 char buf[sizeof (reclen)]; 00127 int err; 00128 00129 phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); 00130 firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); 00131 00132 if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, 00133 buf, DMU_READ_PREFETCH)) != 0) 00134 return (err); 00135 if (firstread != sizeof (reclen)) { 00136 if ((err = dmu_read(mos, spa->spa_history, 00137 shpp->sh_pool_create_len, sizeof (reclen) - firstread, 00138 buf + firstread, DMU_READ_PREFETCH)) != 0) 00139 return (err); 00140 } 00141 00142 reclen = LE_64(*((uint64_t *)buf)); 00143 shpp->sh_bof += reclen + sizeof (reclen); 00144 shpp->sh_records_lost++; 00145 return (0); 00146 } 00147 00148 static int 00149 spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, 00150 dmu_tx_t *tx) 00151 { 00152 uint64_t firstwrite, phys_eof; 00153 objset_t *mos = spa->spa_meta_objset; 00154 int err; 00155 00156 ASSERT(MUTEX_HELD(&spa->spa_history_lock)); 00157 00158 /* see if we need to reset logical BOF */ 00159 while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - 00160 (shpp->sh_eof - shpp->sh_bof) <= len) { 00161 if ((err = spa_history_advance_bof(spa, shpp)) != 0) { 00162 return (err); 00163 } 00164 } 00165 00166 phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); 00167 firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); 00168 shpp->sh_eof += len; 00169 dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); 00170 00171 len -= firstwrite; 00172 if (len > 
0) { 00173 /* write out the rest at the beginning of physical file */ 00174 dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, 00175 len, (char *)buf + firstwrite, tx); 00176 } 00177 00178 return (0); 00179 } 00180 00181 static char * 00182 spa_history_zone() 00183 { 00184 #ifdef _KERNEL 00185 /* XXX: pr_hostname can be changed by default from within a jail! */ 00186 if (jailed(curthread->td_ucred)) 00187 return (curthread->td_ucred->cr_prison->pr_hostname); 00188 #endif 00189 return ("global"); 00190 } 00191 00195 /*ARGSUSED*/ 00196 static void 00197 spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) 00198 { 00199 spa_t *spa = arg1; 00200 history_arg_t *hap = arg2; 00201 const char *history_str = hap->ha_history_str; 00202 objset_t *mos = spa->spa_meta_objset; 00203 dmu_buf_t *dbp; 00204 spa_history_phys_t *shpp; 00205 size_t reclen; 00206 uint64_t le_len; 00207 nvlist_t *nvrecord; 00208 char *record_packed = NULL; 00209 int ret; 00210 00211 /* 00212 * If we have an older pool that doesn't have a command 00213 * history object, create it now. 00214 */ 00215 mutex_enter(&spa->spa_history_lock); 00216 if (!spa->spa_history) 00217 spa_history_create_obj(spa, tx); 00218 mutex_exit(&spa->spa_history_lock); 00219 00220 /* 00221 * Get the offset of where we need to write via the bonus buffer. 00222 * Update the offset when the write completes. 
00223 */ 00224 VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); 00225 shpp = dbp->db_data; 00226 00227 dmu_buf_will_dirty(dbp, tx); 00228 00229 #ifdef ZFS_DEBUG 00230 { 00231 dmu_object_info_t doi; 00232 dmu_object_info_from_db(dbp, &doi); 00233 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); 00234 } 00235 #endif 00236 00237 VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0); 00238 VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, 00239 gethrestime_sec()) == 0); 00240 VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0); 00241 if (hap->ha_zone != NULL) 00242 VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE, 00243 hap->ha_zone) == 0); 00244 #ifdef _KERNEL 00245 VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST, 00246 utsname.nodename) == 0); 00247 #endif 00248 if (hap->ha_log_type == LOG_CMD_POOL_CREATE || 00249 hap->ha_log_type == LOG_CMD_NORMAL) { 00250 VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, 00251 history_str) == 0); 00252 00253 zfs_dbgmsg("command: %s", history_str); 00254 } else { 00255 VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, 00256 hap->ha_event) == 0); 00257 VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG, 00258 tx->tx_txg) == 0); 00259 VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, 00260 history_str) == 0); 00261 00262 zfs_dbgmsg("internal %s pool:%s txg:%llu %s", 00263 zfs_history_event_names[hap->ha_event], spa_name(spa), 00264 (longlong_t)tx->tx_txg, history_str); 00265 00266 } 00267 00268 VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); 00269 record_packed = kmem_alloc(reclen, KM_SLEEP); 00270 00271 VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, 00272 NV_ENCODE_XDR, KM_SLEEP) == 0); 00273 00274 mutex_enter(&spa->spa_history_lock); 00275 if (hap->ha_log_type == LOG_CMD_POOL_CREATE) 00276 VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); 00277 00278 /* write out the packed length as little endian */ 00279 le_len = LE_64((uint64_t)reclen); 
00280 ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); 00281 if (!ret) 00282 ret = spa_history_write(spa, record_packed, reclen, shpp, tx); 00283 00284 if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) { 00285 shpp->sh_pool_create_len += sizeof (le_len) + reclen; 00286 shpp->sh_bof = shpp->sh_pool_create_len; 00287 } 00288 00289 mutex_exit(&spa->spa_history_lock); 00290 nvlist_free(nvrecord); 00291 kmem_free(record_packed, reclen); 00292 dmu_buf_rele(dbp, FTAG); 00293 00294 strfree(hap->ha_history_str); 00295 if (hap->ha_zone != NULL) 00296 strfree(hap->ha_zone); 00297 kmem_free(hap, sizeof (history_arg_t)); 00298 } 00299 00303 int 00304 spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) 00305 { 00306 history_arg_t *ha; 00307 int err = 0; 00308 dmu_tx_t *tx; 00309 00310 ASSERT(what != LOG_INTERNAL); 00311 00312 if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) 00313 return (EINVAL); 00314 00315 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 00316 err = dmu_tx_assign(tx, TXG_WAIT); 00317 if (err) { 00318 dmu_tx_abort(tx); 00319 return (err); 00320 } 00321 00322 ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); 00323 ha->ha_history_str = strdup(history_str); 00324 ha->ha_zone = strdup(spa_history_zone()); 00325 ha->ha_log_type = what; 00326 ha->ha_uid = crgetuid(CRED()); 00327 00328 /* Kick this off asynchronously; errors are ignored. 
*/ 00329 dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, 00330 spa_history_log_sync, spa, ha, 0, tx); 00331 dmu_tx_commit(tx); 00332 00333 /* spa_history_log_sync will free ha and strings */ 00334 return (err); 00335 } 00336 00340 int 00341 spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) 00342 { 00343 objset_t *mos = spa->spa_meta_objset; 00344 dmu_buf_t *dbp; 00345 uint64_t read_len, phys_read_off, phys_eof; 00346 uint64_t leftover = 0; 00347 spa_history_phys_t *shpp; 00348 int err; 00349 00350 /* 00351 * If the command history doesn't exist (older pool), 00352 * that's ok, just return ENOENT. 00353 */ 00354 if (!spa->spa_history) 00355 return (ENOENT); 00356 00357 /* 00358 * The history is logged asynchronously, so when they request 00359 * the first chunk of history, make sure everything has been 00360 * synced to disk so that we get it. 00361 */ 00362 if (*offp == 0 && spa_writeable(spa)) 00363 txg_wait_synced(spa_get_dsl(spa), 0); 00364 00365 if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) 00366 return (err); 00367 shpp = dbp->db_data; 00368 00369 #ifdef ZFS_DEBUG 00370 { 00371 dmu_object_info_t doi; 00372 dmu_object_info_from_db(dbp, &doi); 00373 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); 00374 } 00375 #endif 00376 00377 mutex_enter(&spa->spa_history_lock); 00378 phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); 00379 00380 if (*offp < shpp->sh_pool_create_len) { 00381 /* read in just the zpool create history */ 00382 phys_read_off = *offp; 00383 read_len = MIN(*len, shpp->sh_pool_create_len - 00384 phys_read_off); 00385 } else { 00386 /* 00387 * Need to reset passed in offset to BOF if the passed in 00388 * offset has since been overwritten. 00389 */ 00390 *offp = MAX(*offp, shpp->sh_bof); 00391 phys_read_off = spa_history_log_to_phys(*offp, shpp); 00392 00393 /* 00394 * Read up to the minimum of what the user passed down or 00395 * the EOF (physical or logical). 
If we hit physical EOF, 00396 * use 'leftover' to read from the physical BOF. 00397 */ 00398 if (phys_read_off <= phys_eof) { 00399 read_len = MIN(*len, phys_eof - phys_read_off); 00400 } else { 00401 read_len = MIN(*len, 00402 shpp->sh_phys_max_off - phys_read_off); 00403 if (phys_read_off + *len > shpp->sh_phys_max_off) { 00404 leftover = MIN(*len - read_len, 00405 phys_eof - shpp->sh_pool_create_len); 00406 } 00407 } 00408 } 00409 00410 /* offset for consumer to use next */ 00411 *offp += read_len + leftover; 00412 00413 /* tell the consumer how much you actually read */ 00414 *len = read_len + leftover; 00415 00416 if (read_len == 0) { 00417 mutex_exit(&spa->spa_history_lock); 00418 dmu_buf_rele(dbp, FTAG); 00419 return (0); 00420 } 00421 00422 err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, 00423 DMU_READ_PREFETCH); 00424 if (leftover && err == 0) { 00425 err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, 00426 leftover, buf + read_len, DMU_READ_PREFETCH); 00427 } 00428 mutex_exit(&spa->spa_history_lock); 00429 00430 dmu_buf_rele(dbp, FTAG); 00431 return (err); 00432 } 00433 00434 static void 00435 log_internal(history_internal_events_t event, spa_t *spa, 00436 dmu_tx_t *tx, const char *fmt, va_list adx) 00437 { 00438 history_arg_t *ha; 00439 va_list adx2; 00440 00441 /* 00442 * If this is part of creating a pool, not everything is 00443 * initialized yet, so don't bother logging the internal events. 00444 * Likewise if the pool is not writeable. 
00445 */ 00446 if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) 00447 return; 00448 00449 va_copy(adx2, adx); 00450 00451 ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); 00452 ha->ha_history_str = kmem_alloc(vsnprintf(NULL, 0, fmt, adx2) + 1, 00453 KM_SLEEP); 00454 00455 va_end(adx2); 00456 00457 (void) vsprintf(ha->ha_history_str, fmt, adx); 00458 00459 ha->ha_log_type = LOG_INTERNAL; 00460 ha->ha_event = event; 00461 ha->ha_zone = NULL; 00462 ha->ha_uid = 0; 00463 00464 if (dmu_tx_is_syncing(tx)) { 00465 spa_history_log_sync(spa, ha, tx); 00466 } else { 00467 dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, 00468 spa_history_log_sync, spa, ha, 0, tx); 00469 } 00470 /* spa_history_log_sync() will free ha and strings */ 00471 } 00472 00473 void 00474 spa_history_log_internal(history_internal_events_t event, spa_t *spa, 00475 dmu_tx_t *tx, const char *fmt, ...) 00476 { 00477 dmu_tx_t *htx = tx; 00478 va_list adx; 00479 00480 /* create a tx if we didn't get one */ 00481 if (tx == NULL) { 00482 htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 00483 if (dmu_tx_assign(htx, TXG_WAIT) != 0) { 00484 dmu_tx_abort(htx); 00485 return; 00486 } 00487 } 00488 00489 va_start(adx, fmt); 00490 log_internal(event, spa, htx, fmt, adx); 00491 va_end(adx); 00492 00493 /* if we didn't get a tx from the caller, commit the one we made */ 00494 if (tx == NULL) 00495 dmu_tx_commit(htx); 00496 } 00497 00498 void 00499 spa_history_log_version(spa_t *spa, history_internal_events_t event) 00500 { 00501 #ifdef _KERNEL 00502 uint64_t current_vers = spa_version(spa); 00503 00504 if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { 00505 spa_history_log_internal(event, spa, NULL, 00506 "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", 00507 (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, 00508 utsname.nodename, utsname.release, utsname.version, 00509 utsname.machine); 00510 } 00511 #if 0 00512 cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", 00513 event == LOG_POOL_IMPORT 
? "imported" : 00514 event == LOG_POOL_CREATE ? "created" : "accessed", 00515 (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); 00516 #endif 00517 #endif 00518 }