FreeBSD ZFS
The Zettabyte File System
|
Intent log format. More...
#include <sys/types.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu.h>
Go to the source code of this file.
Data Structures | |
struct | zil_header |
Intent log header. More... | |
struct | zil_chain |
Log block chaining. More... | |
struct | lr_t |
Format of log records. More... | |
struct | lr_ooo_t |
Common start of all out-of-order record types (TX_OOO() above). More... | |
struct | lr_attr_t |
Handle optional extended vattr attributes. More... | |
struct | lr_create_t |
Log record for creates without optional ACL. More... | |
struct | lr_acl_create_t |
Log record for creates with optional ACL. This log record is also used for recording any FUID information needed for replaying the create. More... | |
struct | lr_remove_t |
struct | lr_link_t |
struct | lr_rename_t |
struct | lr_write_t |
struct | lr_truncate_t |
struct | lr_setattr_t |
struct | lr_acl_v0_t |
struct | lr_acl_t |
struct | itx |
Defines | |
#define | ZIL_MIN_BLKSZ 4096ULL |
#define | ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE |
#define | ZIL_XVAT_SIZE(mapsize) |
size of xvattr log section. | |
#define | ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t))) |
Size of ACL in log. | |
#define | TX_CI ((uint64_t)0x1 << 63) |
The transactions for mkdir, symlink, remove, rmdir, link, and rename may have the following bit set, indicating the original request specified case-insensitive handling of names. | |
#define | TX_OOO(txtype) |
Transactions for write, truncate, setattr, acl_v0, and acl can be logged out of order. | |
zh_flags bit settings | |
#define | ZIL_REPLAY_NEEDED 0x1 |
replay needed - internal only | |
#define | ZIL_CLAIM_LR_SEQ_VALID 0x2 |
zh_claim_lr_seq field is valid | |
The words of a log block checksum. | |
#define | ZIL_ZC_GUID_0 0 |
#define | ZIL_ZC_GUID_1 1 |
#define | ZIL_ZC_OBJSET 2 |
#define | ZIL_ZC_SEQ 3 |
Intent log transaction types and record structures | |
#define | TX_CREATE 1 |
Create file. | |
#define | TX_MKDIR 2 |
Make directory. | |
#define | TX_MKXATTR 3 |
Make XATTR directory. | |
#define | TX_SYMLINK 4 |
Create symbolic link to a file. | |
#define | TX_REMOVE 5 |
Remove file. | |
#define | TX_RMDIR 6 |
Remove directory. | |
#define | TX_LINK 7 |
Create hard link to a file. | |
#define | TX_RENAME 8 |
Rename a file. | |
#define | TX_WRITE 9 |
File write. | |
#define | TX_TRUNCATE 10 |
Truncate a file. | |
#define | TX_SETATTR 11 |
Set file attributes. | |
#define | TX_ACL_V0 12 |
Set old formatted ACL. | |
#define | TX_ACL 13 |
Set ACL. | |
#define | TX_CREATE_ACL 14 |
create with ACL | |
#define | TX_CREATE_ATTR 15 |
create + attrs | |
#define | TX_CREATE_ACL_ATTR 16 |
create with ACL + attrs | |
#define | TX_MKDIR_ACL 17 |
mkdir with ACL | |
#define | TX_MKDIR_ATTR 18 |
mkdir with attr | |
#define | TX_MKDIR_ACL_ATTR 19 |
mkdir with ACL + attrs | |
#define | TX_WRITE2 20 |
dmu_sync EALREADY write | |
#define | TX_MAX_TYPE 21 |
Max transaction type. | |
Typedefs | |
typedef struct zil_header | zil_header_t |
Intent log header. | |
typedef struct zil_chain | zil_chain_t |
Log block chaining. | |
typedef enum zil_create | zil_create_t |
typedef struct itx | itx_t |
typedef int | zil_parse_blk_func_t (zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg) |
typedef int | zil_parse_lr_func_t (zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg) |
typedef int | zil_replay_func_t () |
typedef int | zil_get_data_t (void *arg, lr_write_t *lr, char *dbuf, zio_t *zio) |
Enumerations | |
enum | zil_create { Z_FILE, Z_DIR, Z_XATTRDIR } |
enum | itx_wr_state_t { WR_INDIRECT, WR_COPIED, WR_NEED_COPY, WR_NUM_STATES } |
Writes are handled in three different ways:
| |
Functions | |
int | zil_parse (zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) |
Parse the intent log, and call parse_func for each valid record within. | |
void | zil_init (void) |
void | zil_fini (void) |
zilog_t * | zil_alloc (objset_t *os, zil_header_t *zh_phys) |
void | zil_free (zilog_t *zilog) |
zilog_t * | zil_open (objset_t *os, zil_get_data_t *get_data) |
Open an intent log. | |
void | zil_close (zilog_t *zilog) |
Close an intent log. | |
void | zil_replay (objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) |
If this dataset has a non-empty intent log, replay it and destroy it. | |
boolean_t | zil_replaying (zilog_t *zilog, dmu_tx_t *tx) |
void | zil_destroy (zilog_t *zilog, boolean_t keep_first) |
In one tx, free all log blocks and clear the log header. | |
void | zil_destroy_sync (zilog_t *zilog, dmu_tx_t *tx) |
void | zil_rollback_destroy (zilog_t *zilog, dmu_tx_t *tx) |
itx_t * | zil_itx_create (uint64_t txtype, size_t lrsize) |
void | zil_itx_destroy (itx_t *itx) |
void | zil_itx_assign (zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) |
void | zil_commit (zilog_t *zilog, uint64_t oid) |
Commit zfs transactions to stable storage. | |
int | zil_vdev_offline (const char *osname, void *txarg) |
int | zil_claim (const char *osname, void *txarg) |
int | zil_check_log_chain (const char *osname, void *txarg) |
Check the log by walking the log chain. | |
void | zil_sync (zilog_t *zilog, dmu_tx_t *tx) |
Called in syncing context to free committed log blocks and update log header. | |
void | zil_clean (zilog_t *zilog, uint64_t synced_txg) |
If there are any in-memory intent log transactions which have now been synced then start up a taskq to free them. | |
int | zil_suspend (zilog_t *zilog) |
Suspend an intent log. | |
void | zil_resume (zilog_t *zilog) |
void | zil_add_block (zilog_t *zilog, const blkptr_t *bp) |
int | zil_bp_tree_add (zilog_t *zilog, const blkptr_t *bp) |
void | zil_set_sync (zilog_t *zilog, uint64_t syncval) |
void | zil_set_logbias (zilog_t *zilog, uint64_t slogval) |
Variables | |
int | zil_replay_disable |
Disable intent logging replay. |
Intent log format.
Each objset has its own intent log. The log header (zil_header_t) for objset N's intent log is kept in the Nth object of the SPA's intent_log objset. The log header points to a chain of log blocks, each of which contains log records (i.e., transactions) followed by a log block trailer (zil_trailer_t). The format of a log record depends on the record (or transaction) type, but all records begin with a common structure that defines the type, length, and txg.
Definition in file zil.h.
#define TX_CI ((uint64_t)0x1 << 63) |
#define TX_OOO | ( | txtype | ) |
((txtype) == TX_WRITE || \ (txtype) == TX_TRUNCATE || \ (txtype) == TX_SETATTR || \ (txtype) == TX_ACL_V0 || \ (txtype) == TX_ACL || \ (txtype) == TX_WRITE2)
Transactions for write, truncate, setattr, acl_v0, and acl can be logged out of order.
For convenience in the code, all such records must have lr_foid at the same offset.
#define ZIL_ACE_LENGTH | ( | x | ) | (roundup(x, sizeof (uint64_t))) |
#define ZIL_CLAIM_LR_SEQ_VALID 0x2 |
#define ZIL_XVAT_SIZE | ( | mapsize | ) |
sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \ (sizeof (uint64_t) * 7)
size of xvattr log section.
It is composed of lr_attr_t + the xvattr bitmap + two 64-bit timestamps for create time, a single 64-bit integer for all of the attributes, and four 64-bit integers (32 bytes) for the scanstamp.
typedef struct zil_chain zil_chain_t |
Log block chaining.
Log blocks are chained together. Originally they were chained at the end of the block. For performance reasons the chain was moved to the beginning of the block, which allows writes for only the data being used. The older position is supported for backwards compatibility.
The zio_eck_t contains a zec_cksum which for the intent log is the sequence number of this log block. A seq of 0 is invalid. The zec_cksum is checked by the SPA against the sequence number passed in the blk_cksum field of the blkptr_t.
typedef enum zil_create zil_create_t |
typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio) |
typedef struct zil_header zil_header_t |
Intent log header.
This on-disk structure holds fields to manage the log. All fields are 64-bit so that the structure is handled easily across architectures.
typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg) |
typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg) |
typedef int zil_replay_func_t() |
enum itx_wr_state_t |
Writes are handled in three different ways:
WR_INDIRECT:
When the txg commits, the block is linked in. This saves additionally writing the data into the log record. There are a few requirements for this to occur:
WR_COPIED:
If we know we'll immediately be committing the transaction (FSYNC or FDSYNC), then we allocate a larger log record here for the data and copy the data in.
WR_INDIRECT |
indirect - a large write (dmu_sync() data and put blkptr in log, rather than actual data) |
WR_COPIED |
immediate - data is copied into lr_write_t |
WR_NEED_COPY |
immediate - data needs to be copied if pushed |
WR_NUM_STATES |
number of states |
enum zil_create |
zilog_t* zil_alloc | ( | objset_t * | os, |
zil_header_t * | zh_phys | ||
) |
int zil_check_log_chain | ( | const char * | osname, |
void * | tx | ||
) |
void zil_clean | ( | zilog_t * | zilog, |
uint64_t | synced_txg | ||
) |
If there are any in-memory intent log transactions which have now been synced then start up a taskq to free them.
We should only do this after we have written out the uberblocks (i.e. txg has been committed) so that we don't inadvertently clean out in-memory log records that would be required by zil_commit().
void zil_commit | ( | zilog_t * | zilog, |
uint64_t | foid | ||
) |
Commit zfs transactions to stable storage.
itxs are committed in batches. In a heavily stressed zil there will be a commit writer thread who is writing out a bunch of itxs to the log for a set of committing threads (cthreads) in the same batch as the writer. Those cthreads are all waiting on the same cv for that batch.
There will also be a different and growing batch of threads that are waiting to commit (qthreads). When the committing batch completes a transition occurs such that the cthreads exit and the qthreads become cthreads. One of the new cthreads becomes the writer thread for the batch. Any new threads arriving become new qthreads.
Only 2 condition variables are needed and there's no transition between the two cvs needed. They just flip-flop between qthreads and cthreads.
Using this scheme we can efficiently wake up only those threads that have been committed.
[in] | foid | if 0, push out all transactions. Otherwise, push only those for that object or those that might reference that object. |
void zil_destroy | ( | zilog_t * | zilog, |
boolean_t | keep_first | ||
) |
In one tx, free all log blocks and clear the log header.
If keep_first is set, then we're replaying a log with no content. We want to keep the first block, however, so that the first synchronous transaction doesn't require a txg_wait_synced() in zil_create(). We don't need to txg_wait_synced() here either when keep_first is set, because both zil_create() and zil_destroy() will wait for any in-progress destroys to complete.
zilog_t* zil_open | ( | objset_t * | os, |
zil_get_data_t * | get_data | ||
) |
int zil_parse | ( | zilog_t * | zilog, |
zil_parse_blk_func_t * | parse_blk_func, | ||
zil_parse_lr_func_t * | parse_lr_func, | ||
void * | arg, | ||
uint64_t | txg | ||
) |
void zil_replay | ( | objset_t * | os, |
void * | arg, | ||
zil_replay_func_t * | replay_func[TX_MAX_TYPE] | ||
) |
int zil_suspend | ( | zilog_t * | zilog | ) |
Suspend an intent log.
While in suspended mode, we still honor synchronous semantics, but we rely on txg_wait_synced() to do it. We suspend the log briefly when taking a snapshot so that the snapshot contains all the data it's supposed to, and has an empty intent log.