FreeBSD ZFS
The Zettabyte File System

spa_history.c

Go to the documentation of this file.
00001 /*
00002  * CDDL HEADER START
00003  *
00004  * The contents of this file are subject to the terms of the
00005  * Common Development and Distribution License (the "License").
00006  * You may not use this file except in compliance with the License.
00007  *
00008  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
00009  * or http://www.opensolaris.org/os/licensing.
00010  * See the License for the specific language governing permissions
00011  * and limitations under the License.
00012  *
00013  * When distributing Covered Code, include this CDDL HEADER in each
00014  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
00015  * If applicable, add the following below this CDDL HEADER, with the
00016  * fields enclosed by brackets "[]" replaced with your own identifying
00017  * information: Portions Copyright [yyyy] [name of copyright owner]
00018  *
00019  * CDDL HEADER END
00020  */
00021 
00022 /*
00023  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
00024  * Copyright (c) 2011 by Delphix. All rights reserved.
00025  */
00026 
00027 #include <sys/spa.h>
00028 #include <sys/spa_impl.h>
00029 #include <sys/zap.h>
00030 #include <sys/dsl_synctask.h>
00031 #include <sys/dmu_tx.h>
00032 #include <sys/dmu_objset.h>
00033 #include <sys/utsname.h>
00034 #include <sys/sunddi.h>
00035 #include "zfs_comutil.h"
00036 #ifdef _KERNEL
00037 #include <sys/cmn_err.h>
00038 #include <sys/zone.h>
00039 #endif
00040 
00074 static uint64_t
00075 spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
00076 {
00077         uint64_t phys_len;
00078 
00079         phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
00080         return ((log_off - shpp->sh_pool_create_len) % phys_len
00081             + shpp->sh_pool_create_len);
00082 }
00083 
00084 void
00085 spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
00086 {
00087         dmu_buf_t *dbp;
00088         spa_history_phys_t *shpp;
00089         objset_t *mos = spa->spa_meta_objset;
00090 
00091         ASSERT(spa->spa_history == 0);
00092         spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
00093             SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
00094             sizeof (spa_history_phys_t), tx);
00095 
00096         VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
00097             DMU_POOL_HISTORY, sizeof (uint64_t), 1,
00098             &spa->spa_history, tx) == 0);
00099 
00100         VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
00101         ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
00102 
00103         shpp = dbp->db_data;
00104         dmu_buf_will_dirty(dbp, tx);
00105 
00106         /*
00107          * Figure out maximum size of history log.  We set it at
00108          * 0.1% of pool size, with a max of 1G and min of 128KB.
00109          */
00110         shpp->sh_phys_max_off =
00111             metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
00112         shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30);
00113         shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
00114 
00115         dmu_buf_rele(dbp, FTAG);
00116 }
00117 
00121 static int
00122 spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
00123 {
00124         objset_t *mos = spa->spa_meta_objset;
00125         uint64_t firstread, reclen, phys_bof;
00126         char buf[sizeof (reclen)];
00127         int err;
00128 
00129         phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
00130         firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
00131 
00132         if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
00133             buf, DMU_READ_PREFETCH)) != 0)
00134                 return (err);
00135         if (firstread != sizeof (reclen)) {
00136                 if ((err = dmu_read(mos, spa->spa_history,
00137                     shpp->sh_pool_create_len, sizeof (reclen) - firstread,
00138                     buf + firstread, DMU_READ_PREFETCH)) != 0)
00139                         return (err);
00140         }
00141 
00142         reclen = LE_64(*((uint64_t *)buf));
00143         shpp->sh_bof += reclen + sizeof (reclen);
00144         shpp->sh_records_lost++;
00145         return (0);
00146 }
00147 
00148 static int
00149 spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
00150     dmu_tx_t *tx)
00151 {
00152         uint64_t firstwrite, phys_eof;
00153         objset_t *mos = spa->spa_meta_objset;
00154         int err;
00155 
00156         ASSERT(MUTEX_HELD(&spa->spa_history_lock));
00157 
00158         /* see if we need to reset logical BOF */
00159         while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
00160             (shpp->sh_eof - shpp->sh_bof) <= len) {
00161                 if ((err = spa_history_advance_bof(spa, shpp)) != 0) {
00162                         return (err);
00163                 }
00164         }
00165 
00166         phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
00167         firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
00168         shpp->sh_eof += len;
00169         dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
00170 
00171         len -= firstwrite;
00172         if (len > 0) {
00173                 /* write out the rest at the beginning of physical file */
00174                 dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
00175                     len, (char *)buf + firstwrite, tx);
00176         }
00177 
00178         return (0);
00179 }
00180 
/*
 * Return the zone name to record alongside a history entry.
 *
 * In the kernel, a jailed calling thread yields its prison's hostname;
 * every other caller (and any non-kernel build) yields "global".  The
 * returned string is not allocated here and must not be freed by the
 * caller.
 *
 * Declared with an explicit (void) parameter list so the definition also
 * serves as a prototype.
 */
static char *
spa_history_zone(void)
{
#ifdef _KERNEL
        /* XXX: pr_hostname can be changed by default from within a jail! */
        if (jailed(curthread->td_ucred))
                return (curthread->td_ucred->cr_prison->pr_hostname);
#endif
        return ("global");
}
00191 
00195 /*ARGSUSED*/
00196 static void
00197 spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
00198 {
00199         spa_t           *spa = arg1;
00200         history_arg_t   *hap = arg2;
00201         const char      *history_str = hap->ha_history_str;
00202         objset_t        *mos = spa->spa_meta_objset;
00203         dmu_buf_t       *dbp;
00204         spa_history_phys_t *shpp;
00205         size_t          reclen;
00206         uint64_t        le_len;
00207         nvlist_t        *nvrecord;
00208         char            *record_packed = NULL;
00209         int             ret;
00210 
00211         /*
00212          * If we have an older pool that doesn't have a command
00213          * history object, create it now.
00214          */
00215         mutex_enter(&spa->spa_history_lock);
00216         if (!spa->spa_history)
00217                 spa_history_create_obj(spa, tx);
00218         mutex_exit(&spa->spa_history_lock);
00219 
00220         /*
00221          * Get the offset of where we need to write via the bonus buffer.
00222          * Update the offset when the write completes.
00223          */
00224         VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
00225         shpp = dbp->db_data;
00226 
00227         dmu_buf_will_dirty(dbp, tx);
00228 
00229 #ifdef ZFS_DEBUG
00230         {
00231                 dmu_object_info_t doi;
00232                 dmu_object_info_from_db(dbp, &doi);
00233                 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
00234         }
00235 #endif
00236 
00237         VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
00238         VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
00239             gethrestime_sec()) == 0);
00240         VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0);
00241         if (hap->ha_zone != NULL)
00242                 VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE,
00243                     hap->ha_zone) == 0);
00244 #ifdef _KERNEL
00245         VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST,
00246             utsname.nodename) == 0);
00247 #endif
00248         if (hap->ha_log_type == LOG_CMD_POOL_CREATE ||
00249             hap->ha_log_type == LOG_CMD_NORMAL) {
00250                 VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD,
00251                     history_str) == 0);
00252 
00253                 zfs_dbgmsg("command: %s", history_str);
00254         } else {
00255                 VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT,
00256                     hap->ha_event) == 0);
00257                 VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG,
00258                     tx->tx_txg) == 0);
00259                 VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR,
00260                     history_str) == 0);
00261 
00262                 zfs_dbgmsg("internal %s pool:%s txg:%llu %s",
00263                     zfs_history_event_names[hap->ha_event], spa_name(spa),
00264                     (longlong_t)tx->tx_txg, history_str);
00265 
00266         }
00267 
00268         VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0);
00269         record_packed = kmem_alloc(reclen, KM_SLEEP);
00270 
00271         VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen,
00272             NV_ENCODE_XDR, KM_SLEEP) == 0);
00273 
00274         mutex_enter(&spa->spa_history_lock);
00275         if (hap->ha_log_type == LOG_CMD_POOL_CREATE)
00276                 VERIFY(shpp->sh_eof == shpp->sh_pool_create_len);
00277 
00278         /* write out the packed length as little endian */
00279         le_len = LE_64((uint64_t)reclen);
00280         ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
00281         if (!ret)
00282                 ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
00283 
00284         if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) {
00285                 shpp->sh_pool_create_len += sizeof (le_len) + reclen;
00286                 shpp->sh_bof = shpp->sh_pool_create_len;
00287         }
00288 
00289         mutex_exit(&spa->spa_history_lock);
00290         nvlist_free(nvrecord);
00291         kmem_free(record_packed, reclen);
00292         dmu_buf_rele(dbp, FTAG);
00293 
00294         strfree(hap->ha_history_str);
00295         if (hap->ha_zone != NULL)
00296                 strfree(hap->ha_zone);
00297         kmem_free(hap, sizeof (history_arg_t));
00298 }
00299 
00303 int
00304 spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what)
00305 {
00306         history_arg_t *ha;
00307         int err = 0;
00308         dmu_tx_t *tx;
00309 
00310         ASSERT(what != LOG_INTERNAL);
00311 
00312         if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
00313                 return (EINVAL);
00314 
00315         tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
00316         err = dmu_tx_assign(tx, TXG_WAIT);
00317         if (err) {
00318                 dmu_tx_abort(tx);
00319                 return (err);
00320         }
00321 
00322         ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
00323         ha->ha_history_str = strdup(history_str);
00324         ha->ha_zone = strdup(spa_history_zone());
00325         ha->ha_log_type = what;
00326         ha->ha_uid = crgetuid(CRED());
00327 
00328         /* Kick this off asynchronously; errors are ignored. */
00329         dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
00330             spa_history_log_sync, spa, ha, 0, tx);
00331         dmu_tx_commit(tx);
00332 
00333         /* spa_history_log_sync will free ha and strings */
00334         return (err);
00335 }
00336 
00340 int
00341 spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
00342 {
00343         objset_t *mos = spa->spa_meta_objset;
00344         dmu_buf_t *dbp;
00345         uint64_t read_len, phys_read_off, phys_eof;
00346         uint64_t leftover = 0;
00347         spa_history_phys_t *shpp;
00348         int err;
00349 
00350         /*
00351          * If the command history  doesn't exist (older pool),
00352          * that's ok, just return ENOENT.
00353          */
00354         if (!spa->spa_history)
00355                 return (ENOENT);
00356 
00357         /*
00358          * The history is logged asynchronously, so when they request
00359          * the first chunk of history, make sure everything has been
00360          * synced to disk so that we get it.
00361          */
00362         if (*offp == 0 && spa_writeable(spa))
00363                 txg_wait_synced(spa_get_dsl(spa), 0);
00364 
00365         if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
00366                 return (err);
00367         shpp = dbp->db_data;
00368 
00369 #ifdef ZFS_DEBUG
00370         {
00371                 dmu_object_info_t doi;
00372                 dmu_object_info_from_db(dbp, &doi);
00373                 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
00374         }
00375 #endif
00376 
00377         mutex_enter(&spa->spa_history_lock);
00378         phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
00379 
00380         if (*offp < shpp->sh_pool_create_len) {
00381                 /* read in just the zpool create history */
00382                 phys_read_off = *offp;
00383                 read_len = MIN(*len, shpp->sh_pool_create_len -
00384                     phys_read_off);
00385         } else {
00386                 /*
00387                  * Need to reset passed in offset to BOF if the passed in
00388                  * offset has since been overwritten.
00389                  */
00390                 *offp = MAX(*offp, shpp->sh_bof);
00391                 phys_read_off = spa_history_log_to_phys(*offp, shpp);
00392 
00393                 /*
00394                  * Read up to the minimum of what the user passed down or
00395                  * the EOF (physical or logical).  If we hit physical EOF,
00396                  * use 'leftover' to read from the physical BOF.
00397                  */
00398                 if (phys_read_off <= phys_eof) {
00399                         read_len = MIN(*len, phys_eof - phys_read_off);
00400                 } else {
00401                         read_len = MIN(*len,
00402                             shpp->sh_phys_max_off - phys_read_off);
00403                         if (phys_read_off + *len > shpp->sh_phys_max_off) {
00404                                 leftover = MIN(*len - read_len,
00405                                     phys_eof - shpp->sh_pool_create_len);
00406                         }
00407                 }
00408         }
00409 
00410         /* offset for consumer to use next */
00411         *offp += read_len + leftover;
00412 
00413         /* tell the consumer how much you actually read */
00414         *len = read_len + leftover;
00415 
00416         if (read_len == 0) {
00417                 mutex_exit(&spa->spa_history_lock);
00418                 dmu_buf_rele(dbp, FTAG);
00419                 return (0);
00420         }
00421 
00422         err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
00423             DMU_READ_PREFETCH);
00424         if (leftover && err == 0) {
00425                 err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
00426                     leftover, buf + read_len, DMU_READ_PREFETCH);
00427         }
00428         mutex_exit(&spa->spa_history_lock);
00429 
00430         dmu_buf_rele(dbp, FTAG);
00431         return (err);
00432 }
00433 
00434 static void
00435 log_internal(history_internal_events_t event, spa_t *spa,
00436     dmu_tx_t *tx, const char *fmt, va_list adx)
00437 {
00438         history_arg_t *ha;
00439         va_list adx2;
00440 
00441         /*
00442          * If this is part of creating a pool, not everything is
00443          * initialized yet, so don't bother logging the internal events.
00444          * Likewise if the pool is not writeable.
00445          */
00446         if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa))
00447                 return;
00448 
00449         va_copy(adx2, adx);
00450 
00451         ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
00452         ha->ha_history_str = kmem_alloc(vsnprintf(NULL, 0, fmt, adx2) + 1,
00453             KM_SLEEP);
00454 
00455         va_end(adx2);
00456 
00457         (void) vsprintf(ha->ha_history_str, fmt, adx);
00458 
00459         ha->ha_log_type = LOG_INTERNAL;
00460         ha->ha_event = event;
00461         ha->ha_zone = NULL;
00462         ha->ha_uid = 0;
00463 
00464         if (dmu_tx_is_syncing(tx)) {
00465                 spa_history_log_sync(spa, ha, tx);
00466         } else {
00467                 dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
00468                     spa_history_log_sync, spa, ha, 0, tx);
00469         }
00470         /* spa_history_log_sync() will free ha and strings */
00471 }
00472 
00473 void
00474 spa_history_log_internal(history_internal_events_t event, spa_t *spa,
00475     dmu_tx_t *tx, const char *fmt, ...)
00476 {
00477         dmu_tx_t *htx = tx;
00478         va_list adx;
00479 
00480         /* create a tx if we didn't get one */
00481         if (tx == NULL) {
00482                 htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
00483                 if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
00484                         dmu_tx_abort(htx);
00485                         return;
00486                 }
00487         }
00488 
00489         va_start(adx, fmt);
00490         log_internal(event, spa, htx, fmt, adx);
00491         va_end(adx);
00492 
00493         /* if we didn't get a tx from the caller, commit the one we made */
00494         if (tx == NULL)
00495                 dmu_tx_commit(htx);
00496 }
00497 
00498 void
00499 spa_history_log_version(spa_t *spa, history_internal_events_t event)
00500 {
00501 #ifdef _KERNEL
00502         uint64_t current_vers = spa_version(spa);
00503 
00504         if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) {
00505                 spa_history_log_internal(event, spa, NULL,
00506                     "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s",
00507                     (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
00508                     utsname.nodename, utsname.release, utsname.version,
00509                     utsname.machine);
00510         }
00511 #if 0
00512         cmn_err(CE_CONT, "!%s version %llu pool %s using %llu",
00513             event == LOG_POOL_IMPORT ? "imported" :
00514             event == LOG_POOL_CREATE ? "created" : "accessed",
00515             (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION);
00516 #endif
00517 #endif
00518 }
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines