Change 513309 by willa@willa_repo on 2011/11/10 18:16:49 Add hooks to record write ranges for buffers that will be dirtied. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#4 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#2 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#2 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#2 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#4 (text) ==== @@ -1131,6 +1131,9 @@ * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + //list_create(&dr->write_ranges, sizeof(dbuf_dirty_range_t), + // offsetof(dbuf_dirty_range_t, write_range_link)); + if (db->db_level == 0) { void *data_old = db->db_buf; @@ -1404,6 +1407,23 @@ (void) dbuf_dirty(db, tx); } +#pragma weak dmu_buf_will_dirty_range = dbuf_will_dirty_range +/** + */ +void +dbuf_will_dirty_range(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size) +{ + + //DB_DNODE_ENTER(db); + //dbuf_read_async(db, tx); + /* + * After issuing the async read (which may return immediately), + * record the range, either + */ + //DB_DNODE_EXIT(db); + dbuf_will_dirty(db, tx); +} + void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#2 (text) ==== @@ -830,7 +830,7 @@ if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else - dmu_buf_will_dirty(db, tx); + dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); bcopy(buf, (char *)db->db_data + bufoff, tocpy); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#2 (text) ==== @@ -130,10 +130,24 @@ blkptr_t dr_overridden_by; override_states_t dr_override_state; uint8_t dr_copies; + + /* + * Record a list of the ranges that dr_data's + * contents are valid for. This allows recording + * multiple writes while waiting for an asynchronous + * read to complete. + */ + list_t *write_ranges; } dl; } dt; } dbuf_dirty_record_t; +typedef struct dbuf_dirty_range { + list_node_t write_range_link; + int offset; + int size; +} dbuf_dirty_range_t; + typedef struct dmu_buf_impl { /* * The following members are immutable, with the exception of @@ -224,6 +238,7 @@ uint8_t db_freed_in_flight; uint8_t db_dirtycnt; + } dmu_buf_impl_t; /* Note: the dbuf hash table is exposed only for the mdb module */ @@ -263,6 +278,8 @@ int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_will_dirty_range(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, + int size); void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#2 (text) ==== @@ -299,6 +299,8 @@ void *dmu_buf_get_user(dmu_buf_t *db); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_dirty_range(dmu_buf_t *db, dmu_tx_t *tx, int offset, + int size); boolean_t dmu_buf_freeable(dmu_buf_t *); Change 514212 by willa@willa_repo on 2011/11/18 14:00:35 Convert dmu_buf_impl_t.db_last_dirty to a list_t, db_dirty_records. 
The old mechanism implements a rudimentary list in which the newest dirty record for a dbuf is at the head. This change reimplements that using the OpenSolaris list_t. Besides being cleaner, the list_t also allows iterating in reverse (oldest dirty record first), which we will need in order to resolve an async read upon completion of the I/O. Update all consumers of db->db_last_dirty, and modify dbuf_create and dbuf_destroy to perform the setup and teardown. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c#4 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#2 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#2 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#3 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#5 (text) ==== @@ -330,12 +330,12 @@ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } - for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) + for (dr = list_head(&db->db_dirty_records); dr != NULL;) { ASSERT(dr->dr_dbuf == db); + dr = list_next(&db->db_dirty_records, dr); + } + ASSERT(db->db_data_pending->dr_dbuf == db); - for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) - ASSERT(dr->dr_dbuf == db); - /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing @@ -712,7 +712,7 @@ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { - dbuf_dirty_record_t *dr = db->db_last_dirty; + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); @@ -797,6 +797,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; + dbuf_dirty_record_t *dr; uint64_t txg = tx->tx_txg; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t first_l1 = start >> epbs; @@ -815,8 +816,8 @@ if (db->db_level == 1 && db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { mutex_enter(&db->db_mtx); - if (db->db_last_dirty && - db->db_last_dirty->dr_txg < txg) { + dr = list_head(&db->db_dirty_records); + if (dr != NULL && dr->dr_txg < txg) { dbuf_add_ref(db, FTAG); mutex_exit(&db->db_mtx); dbuf_will_dirty(db, tx); @@ -857,9 +858,8 @@ } /* The dbuf is referenced */ - if (db->db_last_dirty != NULL) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - + dr = list_head(&db->db_dirty_records); + if (dr != NULL) { if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file @@ -897,16 +897,18 @@ dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; + dbuf_dirty_record_t *dr; uint64_t birth_txg = 0; /* * We don't need any locking to protect db_blkptr: - * If it's syncing, then db_last_dirty will be set - * so we'll ignore db_blkptr. + * If it's syncing, then db_dirty_records will have + * entries, so we'll ignore db_blkptr.
*/ ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_last_dirty) - birth_txg = db->db_last_dirty->dr_txg; + dr = list_head(&db->db_dirty_records); + if (dr != NULL) + birth_txg = dr->dr_txg; else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; @@ -967,8 +969,11 @@ db->db.db_size = size; if (db->db_level == 0) { - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - db->db_last_dirty->dt.dl.dr_data = buf; + dbuf_dirty_record_t *dr; + + dr = list_head(&db->db_dirty_records); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + dr->dt.dl.dr_data = buf; } mutex_exit(&db->db_mtx); @@ -1003,7 +1008,7 @@ { dnode_t *dn; objset_t *os; - dbuf_dirty_record_t **drp, *dr; + dbuf_dirty_record_t *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; @@ -1063,11 +1068,11 @@ /* * If this buffer is already dirty, we're done. */ - drp = &db->db_last_dirty; - ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || + dr = list_head(&db->db_dirty_records); + ASSERT(dr == NULL || dr->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); - while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) - drp = &dr->dr_next; + while (dr != NULL && dr->dr_txg > tx->tx_txg) + dr = list_next(&db->db_dirty_records, dr); if (dr && dr->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); @@ -1131,8 +1136,6 @@ * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); - //list_create(&dr->write_ranges, sizeof(dbuf_dirty_range_t), - // offsetof(dbuf_dirty_range_t, write_range_link)); if (db->db_level == 0) { void *data_old = db->db_buf; @@ -1166,8 +1169,7 @@ } dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; - dr->dr_next = *drp; - *drp = dr; + list_insert_head(&db->db_dirty_records, dr); /* * We could have been freed_in_flight between the dbuf_noread @@ -1248,7 +1250,7 @@ mutex_enter(&db->db_mtx); /* possible race with dbuf_undirty() */ - if (db->db_last_dirty == dr || + if (list_head(&db->db_dirty_records) == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); @@ -1280,7 +1282,7 @@ { dnode_t *dn; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr, **drp; + dbuf_dirty_record_t *dr; ASSERT(txg != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); @@ -1289,9 +1291,11 @@ /* * If this buffer is not dirty, we're done. 
*/ - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + for (dr = list_head(&db->db_dirty_records); dr != NULL;) { if (dr->dr_txg <= txg) break; + dr = list_next(&db->db_dirty_records, dr); + } if (dr == NULL || dr->dr_txg < txg) { mutex_exit(&db->db_mtx); return (0); @@ -1328,7 +1332,7 @@ /* XXX would be nice to fix up dn_towrite_space[] */ - *drp = dr->dr_next; + list_remove(&db->db_dirty_records, dr); /* * Note that there are three places in dbuf_dirty() @@ -1515,7 +1519,7 @@ xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { - dbuf_dirty_record_t *dr = db->db_last_dirty; + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { @@ -1699,11 +1703,13 @@ db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + list_create(&db->db_dirty_records, sizeof(dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, db_dirty_record_link)); + db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; - db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; @@ -1825,6 +1831,7 @@ } db->db_parent = NULL; db->db_buf = NULL; + list_destroy(&db->db_dirty_records); ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); @@ -2380,8 +2387,6 @@ * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_dirty_record_t **drp; - ASSERT(*datap != NULL); ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); @@ -2393,12 +2398,9 @@ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db_data_pending = NULL; - drp = &db->db_last_dirty; - while (*drp != dr) - drp = &(*drp)->dr_next; - ASSERT(dr->dr_next == NULL); + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); ASSERT(dr->dr_dbuf == db); - *drp = dr->dr_next; + list_remove(&db->db_dirty_records, dr); if (dr->dr_dbuf->db_level != 0) { list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); @@ -2585,7 +2587,7 @@ blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint64_t txg = zio->io_txg; - dbuf_dirty_record_t **drp, *dr; + dbuf_dirty_record_t *dr; ASSERT3U(zio->io_error, ==, 0); ASSERT(db->db_blkptr == bp); @@ -2609,14 +2611,12 @@ DBUF_VERIFY(db); - drp = &db->db_last_dirty; - while ((dr = *drp) != db->db_data_pending) - drp = &dr->dr_next; + dr = db->db_data_pending; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); - ASSERT(dr->dr_next == NULL); - *drp = dr->dr_next; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); + list_remove(&db->db_dirty_records, dr); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#3 (text) ==== @@ -1454,9 +1454,9 @@ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } - dr = db->db_last_dirty; - while (dr && dr->dr_txg != txg) - dr = dr->dr_next; + dr = list_head(&db->db_dirty_records); + while (dr != NULL && dr->dr_txg != txg) + dr = list_next(&db->db_dirty_records, dr); if (dr == NULL) { /* ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c#4 (text) ==== @@ -1304,13 +1304,14 @@ static void * dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dbuf_dirty_record_t *dr, **drp; + dbuf_dirty_record_t *dr; void *data; if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ - for (drp = &db->db_last_dirty; (dr 
= *drp) != NULL; drp = &dr->dr_next) + dr = list_head(&db->db_dirty_records); + for (;dr != NULL; dr = list_next(&db->db_dirty_records, dr)) if (dr->dr_txg == tx->tx_txg) break; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#2 (text) ==== @@ -1564,7 +1564,7 @@ caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ - if (db->db_last_dirty || + if (list_head(&db->db_dirty_records) != NULL || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dbuf_will_dirty(db, tx); @@ -1600,7 +1600,7 @@ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), TRUE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || + if (list_head(&db->db_dirty_records) != NULL || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dbuf_will_dirty(db, tx); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#2 (text) ==== @@ -170,9 +170,9 @@ continue; ASSERT(err == 0); ASSERT(child->db_level == 0); - dr = child->db_last_dirty; - while (dr && dr->dr_txg > txg) - dr = dr->dr_next; + dr = list_head(&child->db_dirty_records); + while (dr != NULL && dr->dr_txg > txg) + dr = list_next(&child->db_dirty_records, dr); ASSERT(dr == NULL || dr->dr_txg == txg); /* data_old better be zeroed */ @@ -194,7 +194,7 @@ mutex_enter(&child->db_mtx); buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && - child->db_last_dirty == NULL) { + list_head(&child->db_dirty_records) == NULL) { for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " @@ -262,7 +262,8 @@ FREE_VERIFY(db, start, end, tx); blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); - ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + ASSERT(all || blocks_freed == 0 || + list_empty(&db->db_dirty_records)); DB_DNODE_EXIT(db); return (all ? ALL : blocks_freed); } @@ -295,7 +296,7 @@ ASSERT3U(bp->blk_birth, ==, 0); } #endif - ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + ASSERT(all || blocks_freed == 0 || !list_empty(&db->db_dirty_records)); return (all ? ALL : blocks_freed); } @@ -443,8 +444,8 @@ mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? */ list_remove(list, dr); - ASSERT(db->db_last_dirty == dr); - db->db_last_dirty = NULL; + ASSERT(list_head(&db->db_dirty_records) == dr); + list_remove_head(&db->db_dirty_records); db->db_dirtycnt -= 1; if (db->db_level == 0) { ASSERT(db->db_blkid == DMU_BONUS_BLKID || ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#3 (text) ==== @@ -104,8 +104,8 @@ /* pointer back to our dbuf */ struct dmu_buf_impl *dr_dbuf; - /* pointer to next dirty record */ - struct dbuf_dirty_record *dr_next; + /* list link for dbuf dirty records */ + list_node_t db_dirty_record_link; /* pointer to parent dirty record */ struct dbuf_dirty_record *dr_parent; @@ -219,7 +219,7 @@ dbuf_dirty_record_t *db_data_pending; /* pointer to most recent dirty record for this buffer */ - dbuf_dirty_record_t *db_last_dirty; + list_t db_dirty_records; /* * Our link on the owner dnodes's dn_dbufs list. Change 515441 by willa@willa_repo on 2011/11/28 14:39:59 Record write ranges as part of a dbuf's dirty record. Clean up when a dirty record goes away. 
Checked using: % randomio -c -R -s 16384 -f /pool/pool_12965139711111904475/test/1g -v -p The test jumps to a random offset, writes a 4K-aligned randomized data buffer of up to 16K in size, then reads it back to verify. /pool/pool_12965139711111904475/test/1g is a 1GB file created in advance. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - Initialize an R/O sysctl that reports the number of dirty ranges in flight as an int64_t. It should never be < 0, and should drop to 0 when I/O stops for at least 5 seconds. Mostly for debugging; it may be removed later. - When a new dirty record is created for a level 0 dbuf, initialize the write ranges list. - Add dbuf_dirty_record_add_range(): Add a range to a dirty record's write ranges list. This determines whether the new range can be merged with an existing range and, if not, inserts it; the range list is kept sorted by offset (modeled in the sketch below). - Add dbuf_dirty_record_cleanup_ranges(): Iterate over a dirty record's write ranges, removing every entry. This is intended as a catch-all in case write ranges are left unresolved, but for now it is the only exit point. - Update dbuf_will_dirty_range(): This is now essentially a clone of dbuf_will_dirty(), except that it takes the [offset,size] information and, instead of discarding the dbuf_dirty() return value, uses it to call dbuf_dirty_record_add_range() and record the new range. - Add calls to dbuf_dirty_record_cleanup_ranges() in each place that a dirty record may be reaped. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h: - Break out the dbuf dirty types into fully qualified structs and their union, to enable accessing the write ranges list. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c: - Call dbuf_dirty_record_cleanup_ranges() in one other place that a dirty record may be reaped. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: - Change calls from dmu_buf_will_dirty() to dmu_buf_will_dirty_range(). Rally: US1104 Writes to ZFS backed block storage (via CTL or blockback) do not block waiting for completion of read associated with COW fault Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#6 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#4 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#4 edit Differences ...
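The merge logic described above can be modeled in isolation. A minimal sketch, assuming a single range in place of the sorted kernel list and keeping only the offset/size arithmetic; struct range and try_merge() are illustrative names, not part of the change (the kernel version also walks the sorted list and handles insertion):

    #include <stdio.h>

    /* Toy stand-in for dbuf_dirty_range_t; only offset/size matter here. */
    struct range { int offset; int size; };

    /*
     * Fold the write [offset, offset+size) into *r if the two runs
     * overlap or touch. A range's end is one past its last byte, so
     * an equal end/offset pair means the ranges are adjacent, and
     * adjacent counts as mergeable.
     */
    static int
    try_merge(struct range *r, int offset, int size)
    {
        int new_end = offset + size;
        int old_end = r->offset + r->size;

        if (offset > old_end || new_end < r->offset)
            return (0);     /* disjoint, not even adjacent */
        r->offset = (offset < r->offset) ? offset : r->offset;
        old_end = (new_end > old_end) ? new_end : old_end;
        r->size = old_end - r->offset;
        return (1);
    }

    int
    main(void)
    {
        struct range r = { 4096, 4096 };    /* bytes [4096, 8192) */

        try_merge(&r, 8192, 4096);  /* adjacent: range grows */
        try_merge(&r, 0, 2048);     /* disjoint: range unchanged */
        printf("[%d, %d)\n", r.offset, r.offset + r.size);
        return (0);
    }

A sequential writer thus accumulates a single growing range per dirty record, while discontiguous writes leave separate entries behind.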
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#6 (text) ==== @@ -71,6 +71,10 @@ /** \brief dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; +int64_t dirty_ranges_in_flight; +SYSCTL_DECL(_vfs_zfs); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dirty_ranges_in_flight, CTLFLAG_RD, + &dirty_ranges_in_flight, 0, "number of dirty ranges in flight"); static uint64_t dbuf_hash_count; @@ -274,6 +278,8 @@ for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + + dirty_ranges_in_flight = 0; } void @@ -1139,6 +1145,7 @@ if (db->db_level == 0) { void *data_old = db->db_buf; + dbuf_dirty_leaf_t *dl = &dr->dt.dl; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { @@ -1161,6 +1168,8 @@ ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; + list_create(&dl->write_ranges, sizeof(dbuf_dirty_range_t), + offsetof(dbuf_dirty_range_t, write_range_link)); } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&dr->dt.di.dr_children, @@ -1277,6 +1286,37 @@ return (dr); } +/** + * \brief Cleanup a dirty record's write ranges as necessary. + * + * XXX + * This should be replaced with a larger dbuf_dirty_record_destroy() that + * cleans up an entire dirty record. + */ +void +dbuf_dirty_record_cleanup_ranges(dbuf_dirty_record_t *dr) +{ + dbuf_dirty_leaf_t *dl; + dbuf_dirty_range_t *range; + + /* Write ranges are not recorded for indirect blocks yet */ + if (dr->dr_dbuf->db_level != 0) + return; + + /* Remove any write range entries left behind. */ + dl = &dr->dt.dl; + while ((range = list_remove_head(&dl->write_ranges)) != NULL) { + /* + * XXX This may need to be logged in some way once + * merges occur, which in theory should always + * result in empty lists here. + */ + kmem_free(range, sizeof(dbuf_dirty_range_t)); + dirty_ranges_in_flight--; + } + list_destroy(&dl->write_ranges); +} + static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { @@ -1369,6 +1409,7 @@ mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } + dbuf_dirty_record_cleanup_ranges(dr); kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); @@ -1411,21 +1452,111 @@ (void) dbuf_dirty(db, tx); } +/** + * \brief Record a write range for the associated dirty record. + * + * \param dr The dirty record to record the write range for. + * \param offset The offset of the new write range. + * \param size The size of the new write range. + */ +void +dbuf_dirty_record_add_range(dbuf_dirty_record_t *dr, int offset, int size) +{ + struct dbuf_dirty_leaf *dl; + dbuf_dirty_range_t *old_range, *new_range; + int new_end, old_end; + boolean_t use_existing; + + dl = &dr->dt.dl; + + /* Discover whether an existing entry overlaps. */ + old_range = list_head(&dl->write_ranges); + new_end = offset + size; + old_end = 0; + use_existing = B_FALSE; + while (old_range != NULL) { + old_end = old_range->offset + old_range->size; + + /* + * Adjacent ranges count as "overlapping". A range's + * calculated end comes after the last byte. Thus, an + * adjacent range's end value will equal the other one's + * offset, and vice versa. + */ + + /* + * The existing range starts after the end of the new one, + * meaning the new one must be inserted before this one. + */ + if (old_range->offset > (new_end + 1)) + break; + +#define IN_RANGE(val, start, end) (val) >= (start) && (val) <= (end) + /* + * If the existing range's start or end falls within the new + * one, expand it to include the new one. 
+ */ + if (IN_RANGE(old_range->offset, offset, new_end) + || IN_RANGE(old_end, offset, new_end)) { + use_existing = B_TRUE; + break; + } + + /* Try the next range, since this one didn't fit. */ + old_range = list_next(&dl->write_ranges, old_range); + } + + if (use_existing) { + /* + * Update the overlapping range entry so that it is a + * superset of the old entry and the new one. + */ + old_range->offset = MIN(offset, old_range->offset); + old_range->size = MAX(new_end, old_end) - old_range->offset; + return; + } + + /* No overlapping entry was found, so create a new one. */ + new_range = kmem_zalloc(sizeof(dbuf_dirty_range_t), KM_NOSLEEP); + new_range->offset = offset; + new_range->size = size; + + /* + * Insert the new range: + * - At the end of the list (old_range == NULL): + * - If the list is empty. + * - If no entry with a later offset was found. + * - Before another range (old_range != NULL): + * - If that range starts after the end of the new one. + */ + list_insert_before(&dl->write_ranges, old_range, new_range); + + dirty_ranges_in_flight++; +} + #pragma weak dmu_buf_will_dirty_range = dbuf_will_dirty_range /** */ void dbuf_will_dirty_range(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size) { + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; + dbuf_dirty_record_t *dr; - //DB_DNODE_ENTER(db); - //dbuf_read_async(db, tx); - /* - * After issuing the async read (which may return immediately), - * record the range, either - */ - //DB_DNODE_EXIT(db); - dbuf_will_dirty(db, tx); + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_level == 0); + + DB_DNODE_ENTER(db); + if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + DB_DNODE_EXIT(db); + (void) dbuf_read(db, NULL, rf); + dr = dbuf_dirty(db, tx); + ASSERT(dr != NULL); + mutex_enter(&db->db_mtx); + dbuf_dirty_record_add_range(dr, offset, size); + mutex_exit(&db->db_mtx); } void @@ -2405,6 +2536,7 @@ list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); } + dbuf_dirty_record_cleanup_ranges(dr); kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; @@ -2661,6 +2793,7 @@ mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } + dbuf_dirty_record_cleanup_ranges(dr); kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#4 (text) ==== @@ -1076,7 +1076,7 @@ if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else - dmu_buf_will_dirty(db, tx); + dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); /* * XXX uiomove could block forever (eg. 
nfs-backed @@ -1174,7 +1174,7 @@ if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else - dmu_buf_will_dirty(db, tx); + dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#3 (text) ==== @@ -446,6 +446,7 @@ list_remove(list, dr); ASSERT(list_head(&db->db_dirty_records) == dr); list_remove_head(&db->db_dirty_records); + dbuf_dirty_record_cleanup_ranges(dr); db->db_dirtycnt -= 1; if (db->db_level == 0) { ASSERT(db->db_blkid == DMU_BONUS_BLKID || ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#4 (text) ==== @@ -91,6 +91,34 @@ DR_OVERRIDDEN } override_states_t; +typedef struct dbuf_dirty_indirect { + kmutex_t dr_mtx; /* Protects the children. */ + list_t dr_children; /* List of our dirty children. */ +} dbuf_dirty_indirect_t; + +typedef struct dbuf_dirty_leaf { + /* + * dr_data is set when we dirty the buffer so that we can retain the + * pointer even if it gets COW'd in a subsequent transaction group. + */ + arc_buf_t *dr_data; + blkptr_t dr_overridden_by; + override_states_t dr_override_state; + uint8_t dr_copies; + + /* + * List of the ranges that dr_data's contents are valid for. + * Used when not all of dr_data is valid, as it may be if writes + * only cover part of it, and no read has filled in the gaps yet. + */ + list_t write_ranges; +} dbuf_dirty_leaf_t; + +typedef union dbuf_dirty_types { + struct dbuf_dirty_indirect di; + struct dbuf_dirty_leaf dl; +} dbuf_dirty_types_t; + typedef struct dbuf_dirty_record { /* link on our parents dirty list */ list_node_t dr_dirty_node; @@ -110,36 +138,7 @@ /* pointer to parent dirty record */ struct dbuf_dirty_record *dr_parent; - union dirty_types { - struct dirty_indirect { - - /* protect access to list */ - kmutex_t dr_mtx; - - /* Our list of dirty children */ - list_t dr_children; - } di; - struct dirty_leaf { - - /* - * dr_data is set when we dirty the buffer - * so that we can retain the pointer even if it - * gets COW'd in a subsequent transaction group. - */ - arc_buf_t *dr_data; - blkptr_t dr_overridden_by; - override_states_t dr_override_state; - uint8_t dr_copies; - - /* - * Record a list of the ranges that dr_data's - * contents are valid for. This allows recording - * multiple writes while waiting for an asynchronous - * read to complete. - */ - list_t *write_ranges; - } dl; - } dt; + union dbuf_dirty_types dt; } dbuf_dirty_record_t; typedef struct dbuf_dirty_range { @@ -323,6 +322,7 @@ void dbuf_init(void); void dbuf_fini(void); +void dbuf_dirty_record_cleanup_ranges(dbuf_dirty_record_t *dr); boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); Change 515452 by willa@willa_repo on 2011/11/28 16:15:02 Fix a few asserts. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#7 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#4 edit Differences ... 
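The dbuf_verify() hunk below adds a scan for the pending dirty record, but it advances the cursor before comparing it against db_data_pending, so the head of the list (the newest record) is never compared and the new cross-check has to stay commented out; change 515489 later reshapes every walk into the canonical for-loop idiom and re-enables the assert. Both shapes over a toy singly linked list (node and the find_*() names are illustrative, not from the change):

    #include <stdio.h>
    #include <stddef.h>

    /* Toy stand-in for the dirty record list. */
    struct node { struct node *next; };

    /* Advance-then-test: the comparison never sees the head node. */
    static struct node *
    find_buggy(struct node *head, struct node *pending)
    {
        struct node *found = NULL;
        struct node *n = head;

        while (n != NULL) {
            n = n->next;
            if (n == pending)
                found = n;
        }
        return (found);
    }

    /* The canonical idiom: every node is compared exactly once. */
    static struct node *
    find_fixed(struct node *head, struct node *pending)
    {
        struct node *n;

        for (n = head; n != NULL; n = n->next)
            if (n == pending)
                return (n);
        return (NULL);
    }

    int
    main(void)
    {
        struct node c = { NULL }, b = { &c }, a = { &b };

        printf("buggy: %s\n", find_buggy(&a, &a) ? "found" : "missed");
        printf("fixed: %s\n", find_fixed(&a, &a) ? "found" : "missed");
        return (0);
    }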
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#7 (text) ==== @@ -304,6 +304,7 @@ { dnode_t *dn; dbuf_dirty_record_t *dr; + dbuf_dirty_record_t *pending; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -336,11 +337,20 @@ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } + pending = NULL; for (dr = list_head(&db->db_dirty_records); dr != NULL;) { ASSERT(dr->dr_dbuf == db); dr = list_next(&db->db_dirty_records, dr); + /* This DR happens to be the pending DR. */ + if (dr == db->db_data_pending) + pending = dr; } - ASSERT(db->db_data_pending->dr_dbuf == db); + if (db->db_data_pending != NULL) { + /* The pending DR's dbuf is this dbuf. */ + ASSERT(db->db_data_pending->dr_dbuf == db); + /* XXX The pending DR should be on the list. */ + /*ASSERT(pending == db->db_data_pending);*/ + } /* * We can't assert that db_size matches dn_datablksz because it @@ -1313,6 +1323,7 @@ */ kmem_free(range, sizeof(dbuf_dirty_range_t)); dirty_ranges_in_flight--; + ASSERT(dirty_ranges_in_flight >= 0); } list_destroy(&dl->write_ranges); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#4 (text) ==== @@ -263,7 +263,7 @@ blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); ASSERT(all || blocks_freed == 0 || - list_empty(&db->db_dirty_records)); + !list_is_empty(&db->db_dirty_records)); DB_DNODE_EXIT(db); return (all ? ALL : blocks_freed); } @@ -296,7 +296,8 @@ ASSERT3U(bp->blk_birth, ==, 0); } #endif - ASSERT(all || blocks_freed == 0 || !list_empty(&db->db_dirty_records)); + ASSERT(all || blocks_freed == 0 || + !list_is_empty(&db->db_dirty_records)); return (all ? ALL : blocks_freed); } Change 515454 by willa@willa_repo on 2011/11/28 16:16:52 Add a new knob, WITH_ZFS_DEBUGGING. This knob is needed because ZFS uses -DDEBUG to enable certain asserts and so on, which conflict with other things. If this is enabled, ZFS should build with -DDEBUG=1 (and at least -g, unless DEBUG_FLAGS is already set). Affected files ... ... //depot/branches/redline/projects/cow/sys/modules/zfs/Makefile#2 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/modules/zfs/Makefile#2 (text) ==== @@ -89,8 +89,10 @@ CFLAGS+=-mminimal-toc .endif -#CFLAGS+=-DDEBUG=1 -#DEBUG_FLAGS=-g +.ifdef WITH_ZFS_DEBUGGING +CFLAGS+=-DDEBUG=1 +DEBUG_FLAGS?=-g +.endif .include <bsd.kmod.mk> Change 515489 by justing@justing-ns1 on 2011/11/29 09:47:24 Refine the changes of CS 514212 that converted DBuf's dirty record tracking to use a list_t. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c: o When performing traversals of the db_dirty_records list, always use the same for loop idiom. This fixes a bug in dbuf_verify() where the last element of the list was skipped. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: o Re-enable an assert that was firing due to the bug in dbuf_verify(). sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c: Use the list_is_empty() idiom instead of testing list_head() for NULL. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c: In free_verify(), traverse the db_dirty_records list backward since the record we are looking for is likely to be old.
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h: Now that db_dirty_records is a list, say so in the comment. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#8 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#5 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#8 (text) ==== @@ -338,9 +338,9 @@ } pending = NULL; - for (dr = list_head(&db->db_dirty_records); dr != NULL;) { + for (dr = list_head(&db->db_dirty_records); dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { ASSERT(dr->dr_dbuf == db); - dr = list_next(&db->db_dirty_records, dr); /* This DR happens to be the pending DR. */ if (dr == db->db_data_pending) pending = dr; @@ -348,8 +348,8 @@ if (db->db_data_pending != NULL) { /* The pending DR's dbuf is this dbuf. */ ASSERT(db->db_data_pending->dr_dbuf == db); - /* XXX The pending DR should be on the list. */ - /*ASSERT(pending == db->db_data_pending);*/ + /* The pending DR should be on the list. */ + ASSERT(pending == db->db_data_pending); } /* @@ -1342,10 +1342,10 @@ /* * If this buffer is not dirty, we're done. */ - for (dr = list_head(&db->db_dirty_records); dr != NULL;) { + for (dr = list_head(&db->db_dirty_records); dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { if (dr->dr_txg <= txg) break; - dr = list_next(&db->db_dirty_records, dr); } if (dr == NULL || dr->dr_txg < txg) { mutex_exit(&db->db_mtx); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#5 (text) ==== @@ -1454,9 +1454,11 @@ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } - dr = list_head(&db->db_dirty_records); - while (dr != NULL && dr->dr_txg != txg) - dr = list_next(&db->db_dirty_records, dr); + for (dr = list_head(&db->db_dirty_records); dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { + if (dr->dr_txg == txg) + break; + } if (dr == NULL) { /* ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c#5 (text) ==== @@ -1310,10 +1310,11 @@ if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ - dr = list_head(&db->db_dirty_records); - for (;dr != NULL; dr = list_next(&db->db_dirty_records, dr)) + for (dr = list_head(&db->db_dirty_records); dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { if (dr->dr_txg == tx->tx_txg) break; + } if (dr == NULL) { data = NULL; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#3 (text) ==== @@ -1564,7 +1564,7 @@ caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ - if (list_head(&db->db_dirty_records) != NULL || + if (!list_is_empty(&db->db_dirty_records) || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dbuf_will_dirty(db, tx); @@ -1600,7 +1600,7 @@ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), TRUE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ - if (list_head(&db->db_dirty_records) 
!= NULL || + if (!list_is_empty(&db->db_dirty_records) || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dbuf_will_dirty(db, tx); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#5 (text) ==== @@ -170,9 +170,11 @@ continue; ASSERT(err == 0); ASSERT(child->db_level == 0); - dr = list_head(&child->db_dirty_records); - while (dr != NULL && dr->dr_txg > txg) - dr = list_next(&child->db_dirty_records, dr); + for (dr = list_tail(&child->db_dirty_records); dr != NULL; + dr = list_prev(&child->db_dirty_records, dr)) { + if (dr->dr_txg >= txg) + break; + } ASSERT(dr == NULL || dr->dr_txg == txg); /* data_old better be zeroed */ @@ -194,7 +196,7 @@ mutex_enter(&child->db_mtx); buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && - list_head(&child->db_dirty_records) == NULL) { + list_is_empty(&child->db_dirty_records)) { for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#5 (text) ==== @@ -217,7 +217,7 @@ kcondvar_t db_changed; dbuf_dirty_record_t *db_data_pending; - /* pointer to most recent dirty record for this buffer */ + /* List of dirty records for the buffer sorted newest to oldest. */ list_t db_dirty_records; /* Change 515515 by willa@willa_repo on 2011/11/29 13:18:23 Use atomics when manipulating dirty_ranges_in_flight. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#9 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#9 (text) ==== @@ -1322,7 +1322,7 @@ * result in empty lists here. */ kmem_free(range, sizeof(dbuf_dirty_range_t)); - dirty_ranges_in_flight--; + atomic_subtract_64(&dirty_ranges_in_flight, 1); ASSERT(dirty_ranges_in_flight >= 0); } list_destroy(&dl->write_ranges); @@ -1542,7 +1542,7 @@ */ list_insert_before(&dl->write_ranges, old_range, new_range); - dirty_ranges_in_flight++; + atomic_add_64(&dirty_ranges_in_flight, 1); } #pragma weak dmu_buf_will_dirty_range = dbuf_will_dirty_range Change 515572 by willa@willa_repo on 2011/11/29 15:00:37 Correct this particular loop as modified by change 515489. When free_verify() is called, a dbuf's dirty record list may include transaction groups newer than the one being checked, but not necessarily older. Thus, in the new version, the loop would end with a non-NULL dirty record, where the old one wouldn't. Use the same idiom, but traverse forward as in the original code. Since the dbuf dirty record list can't have more than 3 entries at a time, the impact is negligible. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#6 edit Differences ... 
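The corrected loop leans on the list's invariant: dirty records are kept sorted newest txg first, so a forward walk that stops at the first record with dr_txg <= txg either lands on an exact match or proves that no record exists for that txg, which is what the ASSERT that follows checks. The search in miniature, assuming an array in descending txg order (toy_dr and find_dr() are illustrative names):

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Toy dirty record; the real list holds at most 3 of these. */
    struct toy_dr { uint64_t dr_txg; };

    /*
     * Skip records newer than txg; the first remaining record is
     * either the match or proof that txg has no dirty record.
     */
    static struct toy_dr *
    find_dr(struct toy_dr *drs, size_t ndrs, uint64_t txg)
    {
        size_t i;

        for (i = 0; i < ndrs && drs[i].dr_txg > txg; i++)
            ;
        return (i < ndrs ? &drs[i] : NULL);
    }

    int
    main(void)
    {
        /* Newest first, mirroring db_dirty_records. */
        struct toy_dr drs[] = { { 12 }, { 11 }, { 10 } };
        struct toy_dr *dr = find_dr(drs, 3, 11);

        printf("txg 11 -> %s\n",
            (dr != NULL && dr->dr_txg == 11) ? "match" : "none");
        return (0);
    }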
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#6 (text) ==== @@ -170,11 +170,12 @@ continue; ASSERT(err == 0); ASSERT(child->db_level == 0); - for (dr = list_tail(&child->db_dirty_records); dr != NULL; - dr = list_prev(&child->db_dirty_records, dr)) { - if (dr->dr_txg >= txg) - break; - } + + for (dr = list_head(&child->db_dirty_records); + dr != NULL && dr->dr_txg > txg; + dr = list_next(&child->db_dirty_records, dr)) + ; + ASSERT(dr == NULL || dr->dr_txg == txg); /* data_old better be zeroed */ Change 515576 by justing@justing-ns1 on 2011/11/29 16:16:21 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h: o Add preliminary documentation for all DBuf states. o Add new DBuf states to handle asynchronous COW resolution. o Convert dbuf state values to unique bits to allow simplification of the logic to deal with them. o Update ASCII DBuf state machine. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#6 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#6 (text) ==== @@ -54,24 +54,177 @@ /* * The simplified state transition diagram for dbufs looks like: * - * +----> READ ----+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * | ^ ^ - * | | | - * +----> FILL ----+ | - * | | - * | | - * +--------> NOFILL -------+ + * +-> PARTIAL_FILL <---> PARTIAL-+ + * | | | + * +---------->READ_FILL<----[----+ + * | ^ | + * | | | + * | V | + * +-----------> READ -------+ + * | | + * | V + * (alloc)-->UNCACHED----------------->FILL--->CACHED----> EVICTING-->(free) + * | ^ + * | | + * +--------------------> NOFILL ------------------+ + * + * Reader State Transitions: + * UNCACHED -> READ: Access to a block that does not have an + * active dbuf. A read is issued to media + * upon an ARC or L2ARC miss. + * + * READ -> FILL: Data satisfied from the ARC, L2ARC, or + * a read of the media. + * + * PARTIAL -> READ: Access to a block that has been partially + * written but has yet to have the read + * needed to resolve the COW fault issued. + * The read is issued to media. The ARC and + * L2ARC are not involved since they were + * checked for a hit at the time of the first + * write to this buffer. + * + * Writer State Transitions: + * UNCACHED -> FILL: Access to a block that does not have an + * active dbuf. Writer is filling the entire + * block. + * + * UNCACHED -> PARTIAL_FILL: Access to a block that does not have an + * active dbuf. Writer is filling a portion + * of the block starting at the beginning or + * end. The read needed to resolve the COW + * fault is deferred until we see that the + * writer will not fill this whole buffer. + * + * UNCACHED -> READ_FILL: Access to a block that does not have an + * active dbuf. Writer is filling a portion + * of the block and we have enough information + * to expect that the buffer will not be fully + * written. The read needed to resolve the COW + * fault is issued asynchronously. + * + * READ -> READ_FILL: Access to a block that has an active dbuf + * and a read has already been issued for the + * original buffer contents. A COW fault may + * not have occurred, if the buffer was not + * already dirty. Writer is filling a portion + * of the buffer. + * + * PARTIAL -> PARTIAL_FILL: Access to a block that has an active dbuf + * with an outstanding COW fault. 
Writer is + * filling a portion of the block and we have + * enough information to expect that the buffer + * will eventually be fully written. + * + * PARTIAL -> READ_FILL: Access to a block that has an active dbuf + * with an outstanding COW fault. Writer is + * filling a portion of the block and we have + * enough information to expect that the buffer + * will not be fully written, causing a read + * to be issued. + * + * PARTIAL -> FILL: Access to a block that has an active dbuf + * with an outstanding COW fault. Writer is + * filling enough of the buffer to avoid the + * read for this fault entirely. + * + * I/O Complete Transitions: + * FILL -> CACHED: The thread modifying the buffer has completed + * its work. The buffer can now be accessed by + * other threads. + * + * PARTIAL_FILL -> PARTIAL: The write thread modifying the buffer has + * completed its work. The buffer can now be + * accessed by other threads. No read has been + * issued to resolve the COW fault. + * + * READ_FILL -> READ: The write thread modifying the buffer has + * completed its work. The buffer can now be + * accessed by other threads. A read is + * outstanding to resolve the COW fault. + * + * The READ, PARTIAL_FILL, and READ_FILL states indicate the data associated + * with a dbuf is volatile and a new client must wait for the current consumer + * to exit the dbuf from that state prior to accessing the data. + * + * The PARTIAL_FILL, PARTIAL, READ_FILL, and READ states are used for + * deferring any reads required for resolution of Copy-On-Write faults. + * A PARTIAL dbuf has accumulated write data in its dirty records + * that must be merged into the existing data for the record once the + * record is read. A READ dbuf is a dbuf for which a synchronous or + * async read has been issued. If the dbuf has dirty records, this read + * is required to resolve the COW fault before those dirty records can be + * committed to disk. The FILL variants of these two states indicate that + * either new write data is being added to the dirty records for this dbuf, + * or the read has completed and the write and read data are being merged. + * + * Writers must block on dbufs in any of the FILL states. + * + * Synchronous readers must block on dbufs in the READ state and any + * of the FILL states. Further, a reader must transition a dbuf from the + * UNCACHED or PARTIAL state to the READ state by issuing a read, before + * blocking. + * + * The transition from PARTIAL to READ is also triggered by writers that + * perform a discontiguous write to the buffer, meaning that there is + * little chance for a later writer to completely fill the buffer. + * Since the read cannot be avoided, it is issued immediately. */ typedef enum dbuf_states { - DB_UNCACHED, - DB_FILL, - DB_NOFILL, - DB_READ, - DB_CACHED, - DB_EVICTING + /** + * Dbuf has no valid data. + */ + DB_UNCACHED = 0x01, + + /** + * The Dbuf's contents are being modified by an active thread. + * This state can be combined with PARTIAL or READ. When + * just in the DB_FILL state, the entire buffer's contents are + * being supplied by the writer. When combined with the other + * states, the buffer is only being partially dirtied. + */ + DB_FILL = 0x02, + + /** + * Dbuf has been partially dirtied by writers. No read has been + * issued to resolve the COW fault. + */ + DB_PARTIAL = 0x04, + + /** + * A NULL DBuf associated with swap backing store. + */ + DB_NOFILL = 0x08, + + /** + * A read has been issued for an uncached buffer with no + * outstanding dirty data (i.e.
Not PARTIAL). + */ + DB_READ = 0x10, + + /** + * The entire contents of this dbuf are valid. The buffer + * may still be dirty. + */ + DB_CACHED = 0x20, + + /** + * The Dbuf is in the process of being freed. + */ + DB_EVICTING = 0x40, + + /** + * Dbuf has been partially dirtied by writers and a + * thread is actively modifying the dbuf. + */ + DB_PARTIAL_FILL = DB_PARTIAL|DB_FILL, + + /** + * Dbuf has been partially dirtied by writers, a read + * has been issued to resolve the COW fault, and a + * thread is actively modifying the dbuf. + */ + DB_READ_FILL = DB_READ|DB_FILL } dbuf_states_t; struct dnode; Change 515580 by willa@willa_repo on 2011/11/29 17:24:23 Add comments reflecting dbuf state handling that will need updating. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#10 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#6 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#10 (text) ==== @@ -396,6 +396,12 @@ } } } + /* + * XXX + * We may need to modify the state check here if something may be + * in DB_FILL and have dirty parts, depending on how db_state + * semantics are changed. + */ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && @@ -493,6 +499,12 @@ { dmu_buf_impl_t *db = vdb; + /* + * XXX + * When reads become asynchronous, this needs to change to cv_wait + * for DB_FILL to complete + */ + mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* @@ -537,6 +549,7 @@ /* We need the struct_rwlock to prevent db_blkptr from changing. */ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); + /* XXX Must be changed to allow for PARTIAL */ ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); @@ -633,6 +646,7 @@ DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); + /* XXX Need to add PARTIAL case, or merge with one of these */ if (db->db_state == DB_CACHED) { mutex_exit(&db->db_mtx); if (prefetch) @@ -693,6 +707,24 @@ ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); + /* + * XXX + * This function is called when a caller wants to fill the + * entire buffer. + * + * If the steady state is PARTIAL, then we must issue a READ + * immediately for this buffer, to resolve writes that may be in a + * previous transaction group. + * + * In the future, this can be optimized by not issuing the READ, if + * this request happens to be coming in on the same transaction + * group. The fill that's about to occur can then obsolete the + * previously issued writes. + * + * The write ranges, however, are needed in order for the read_done + * callback to realize that its buffer is no longer needed, so the + * fill should still create a single write range covering the buffer. + */ while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { @@ -861,6 +893,7 @@ mutex_exit(&db->db_mtx); continue; } + /* XXX This should check for PARTIAL also */ if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; @@ -1060,6 +1093,10 @@ * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. 
*/ + /* XXX + * This should be valid for PARTIAL and READ too, since writes will + * still be able to occur in those states. + */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); @@ -1087,9 +1124,18 @@ dr = list_head(&db->db_dirty_records); ASSERT(dr == NULL || dr->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); + /* + * Find the newest dirty record that is not newer than the + * transaction's group. If there isn't one, dr == NULL. If it is + * older, it will be ignored. + */ while (dr != NULL && dr->dr_txg > tx->tx_txg) dr = list_next(&db->db_dirty_records, dr); if (dr && dr->dr_txg == tx->tx_txg) { + /* + * This transaction happens to be occurring in the same + * transaction group as the dirty record found above. + */ DB_DNODE_EXIT(db); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { @@ -1309,7 +1355,7 @@ dbuf_dirty_leaf_t *dl; dbuf_dirty_range_t *range; - /* Write ranges are not recorded for indirect blocks yet */ + /* Write ranges do not apply to indirect blocks */ if (dr->dr_dbuf->db_level != 0) return; @@ -1328,6 +1374,9 @@ list_destroy(&dl->write_ranges); } +/* + * This function appears to only be called in the syncer state. + */ static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { @@ -1478,6 +1527,9 @@ int new_end, old_end; boolean_t use_existing; + /* Write ranges do not apply to indirect blocks. */ + ASSERT(dr->dr_dbuf->db_level == 0); + dl = &dr->dt.dl; /* Discover whether an existing entry overlaps. */ @@ -1556,7 +1608,6 @@ ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); - ASSERT(db->db_level == 0); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) @@ -1613,6 +1664,12 @@ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } + /* XXX + * This function should be called when a FILL is done while + * the steady state is READ or PARTIAL, in which cases the + * state should be reset to those. However, if it's PARTIAL + * and the buffer fully fills, then it can go to CACHED. + */ db->db_state = DB_CACHED; cv_broadcast(&db->db_changed); } @@ -1640,6 +1697,19 @@ mutex_enter(&db->db_mtx); + /* XXX + * The caller has already filled an ARC buffer and wants to assign + * it to this dbuf. + * + * Just like dbuf_noread, except the data is already here, so + * there's no need to yield control of the buffer in a FILL state. + * + * If the steady state is PARTIAL, transition to READ to resolve the + * partial writes, issuing the read in the process. + * + * If the steady state is READ, then perform the dirty record work + * needed to save this ARC buffer. + */ while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#6 (text) ==== @@ -443,6 +443,7 @@ for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); + /* Must wait on PARTIAL too */ while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); Change 515595 by willa@willa_repo on 2011/11/30 11:16:11 Tweak the state transitions for the READ steady state. For a reader, if no writes have come in, it will go directly from READ to CACHED, since the read done callback will have the ARC buffer containing the read data already in memory, and therefore does not need to drop the dbuf mutex to copy the data. 
It will simply set the dbuf's arc buffer to that provided in the callback. For a writer, it will go from READ to FILL if the writer will fill the buffer completely before the read completes. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#7 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#7 (text) ==== @@ -60,9 +60,9 @@ * | ^ | * | | | * | V | - * +-----------> READ -------+ - * | | - * | V + * +-----------> READ ------+[-------+ + * | || | + * | VV V * (alloc)-->UNCACHED----------------->FILL--->CACHED----> EVICTING-->(free) * | ^ * | | @@ -73,8 +73,8 @@ * active dbuf. A read is issued to media * upon an ARC or L2ARC miss. * - * READ -> FILL: Data satisfied from the ARC, L2ARC, or - * a read of the media. + * READ -> CACHED: Data satisfied from the ARC, L2ARC, or + * a read of the media. No writes occurred. * * PARTIAL -> READ: Access to a block that has been partially * written but has yet to have the read @@ -128,6 +128,11 @@ * filling enough of the buffer to avoid the * read for this fault entirely. * + * READ -> FILL: Access to a block that has an active dbuf + * with an outstanding COW fault, and a read + * has been issued. Write is filling enough of + * the buffer to obsolete the read. + * * I/O Complete Transitions: * FILL -> CACHED: The thread modifying the buffer has completed * its work. The buffer can now be accessed by Change 515654 by willa@willa_repo on 2011/11/30 16:57:06 In dbuf_will_dirty_range(), issue the read asynchronously. Then immediately wait on it, replicating the previous synchronous behavior. Add a few more debugging counters to help us see what state the dbuf is in when it enters the function. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#11 edit Differences ...
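The diff below splits what used to be one synchronous dbuf_read() call into an issue step (a private zio_root() dispatched with zio_nowait()) and a separate wait step (a cv_wait() loop until the dbuf leaves DB_READ), so the wait can later be deleted without disturbing the issue path. The same decomposition in miniature, with POSIX threads standing in for the zio and condition-variable machinery (all names here are illustrative):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    enum { TOY_READ, TOY_CACHED };  /* stand-ins for db_state values */

    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t changed = PTHREAD_COND_INITIALIZER;
    static int state = TOY_READ;

    /* Completion path, playing the role of dbuf_read_done(). */
    static void *
    read_done(void *arg)
    {
        (void)arg;
        sleep(1);           /* pretend the media read takes a while */
        pthread_mutex_lock(&mtx);
        state = TOY_CACHED;
        pthread_cond_broadcast(&changed);
        pthread_mutex_unlock(&mtx);
        return (NULL);
    }

    int
    main(void)
    {
        pthread_t io;

        /* Issue step: start the "read" and return immediately. */
        pthread_create(&io, NULL, read_done, NULL);

        /* Wait step: block until the completion path moves the state. */
        pthread_mutex_lock(&mtx);
        while (state == TOY_READ)
            pthread_cond_wait(&changed, &mtx);
        pthread_mutex_unlock(&mtx);

        printf("cached\n");
        pthread_join(io, NULL);
        return (0);
    }

Once the wait step is removed, writers stop blocking on the COW read, which is what the vfs.zfs debug counters are meant to confirm (e.g. dirty_ranges_in_flight should settle back to 0 when I/O quiesces).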
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#11 (text) ==== @@ -72,9 +72,30 @@ */ static dbuf_hash_table_t dbuf_hash_table; int64_t dirty_ranges_in_flight; +uint64_t async_read_wait_loops; +uint64_t will_dirty_uncached; +uint64_t will_dirty_cached; +uint64_t will_dirty_read; +uint64_t will_dirty_nofill; +uint64_t will_dirty_fill; +uint64_t will_dirty_evicting; SYSCTL_DECL(_vfs_zfs); SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dirty_ranges_in_flight, CTLFLAG_RD, &dirty_ranges_in_flight, 0, "number of dirty ranges in flight"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, async_read_wait_loops, CTLFLAG_RD, + &async_read_wait_loops, 0, "number of times async reads waited"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_uncached, CTLFLAG_RD, + &will_dirty_uncached, 0, "will dirty uncached"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_cached, CTLFLAG_RD, + &will_dirty_cached, 0, "will dirty cached"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_read, CTLFLAG_RD, + &will_dirty_read, 0, "will dirty read"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_nofill, CTLFLAG_RD, + &will_dirty_nofill, 0, "will dirty nofill"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_fill, CTLFLAG_RD, + &will_dirty_fill, 0, "will dirty fill"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_evicting, CTLFLAG_RD, + &will_dirty_evicting, 0, "will dirty evicting"); static uint64_t dbuf_hash_count; @@ -280,6 +301,13 @@ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dirty_ranges_in_flight = 0; + async_read_wait_loops = 0; + will_dirty_uncached = 0; + will_dirty_cached = 0; + will_dirty_read = 0; + will_dirty_nofill = 0; + will_dirty_fill = 0; + will_dirty_evicting = 0; } void @@ -401,6 +429,10 @@ * We may need to modify the state check here if something may be * in DB_FILL and have dirty parts, depending on how db_state * semantics are changed. + * + * XXX + * Why does this ignore DB_FILL in the first place? DB_FILL + * still dirties the buffer and must be sunk too. */ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && @@ -1599,23 +1631,75 @@ #pragma weak dmu_buf_will_dirty_range = dbuf_will_dirty_range /** + * \brief Signal intent to dirty a subset of the buffer. + * + * \param db The dbuf that will be dirtied + * \param tx The transaction the dirty will occur in + * \param offset The starting offset of the intended dirty + * \param size The length of the intended dirty + * + * XXX This needs to be merged into dbuf_will_dirty(). 
*/ void dbuf_will_dirty_range(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size) { int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; dbuf_dirty_record_t *dr; + dnode_t *dn; + zio_t *zio = NULL; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); - DB_DNODE_ENTER(db); - if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; - DB_DNODE_EXIT(db); - (void) dbuf_read(db, NULL, rf); + mutex_enter(&db->db_mtx); + switch(db->db_state) { + case DB_UNCACHED: + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED); + DB_DNODE_EXIT(db); + atomic_add_64(&will_dirty_uncached, 1); + break; + case DB_CACHED: + atomic_add_64(&will_dirty_cached, 1); + break; + case DB_FILL: + atomic_add_64(&will_dirty_fill, 1); + break; + case DB_READ: + atomic_add_64(&will_dirty_read, 1); + break; + case DB_NOFILL: + atomic_add_64(&will_dirty_nofill, 1); + break; + case DB_EVICTING: + atomic_add_64(&will_dirty_evicting, 1); + break; + } + mutex_exit(&db->db_mtx); + + /* Issue the asynchronous read using our zio */ + if (zio != NULL) { + (void) dbuf_read(db, zio, rf); + (void) zio_nowait(zio); + } + + /* Wait for the async read to complete */ + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ) { + cv_wait(&db->db_changed, &db->db_mtx); + atomic_add_64(&async_read_wait_loops, 1); + } + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + mutex_exit(&db->db_mtx); + dr = dbuf_dirty(db, tx); ASSERT(dr != NULL); + + /* Add the write range to this dbuf. */ mutex_enter(&db->db_mtx); dbuf_dirty_record_add_range(dr, offset, size); mutex_exit(&db->db_mtx); Change 515655 by willa@willa_repo on 2011/11/30 19:20:09 Add a bit more commenting for dbuf_free_range(). Change dbuf_read_done() to cv_wait on DB_FILL flag set. This is currently a NOP. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#12 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#12 (text) ==== @@ -531,14 +531,10 @@ { dmu_buf_impl_t *db = vdb; - /* - * XXX - * When reads become asynchronous, this needs to change to cv_wait - * for DB_FILL to complete - */ + mutex_enter(&db->db_mtx); + while (db->db_state & DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); - mutex_enter(&db->db_mtx); - ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ @@ -925,7 +921,23 @@ mutex_exit(&db->db_mtx); continue; } - /* XXX This should check for PARTIAL also */ + /* XXX + * This should check for PARTIAL also. + * + * Our goal is to make the data visible in the current + * transaction group all zeros while preserving the data + * as seen in any earlier transaction groups. + * + * If the buffer is currently being filled then we cannot + * directly clear the buffer's contents. Instead, we + * signal the filler by setting db_freed_in_flight and + * have it do this work just before transitioning the buffer + * to the CACHED state. + * + * If a read is outstanding then the dirty record, if any, + * needs to be processed for this transaction group here, + * rather than deferring it to dbuf_read_done. 
+ */ if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; Change 515656 by willa@willa_repo on 2011/11/30 19:52:53 Create a new function, dirty_record_create(). Its job is to set up a new dbuf dirty record, performing the necessary allocations, initializations, and record keeping. Call it from dbuf_dirty(). Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#13 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#13 (text) ==== @@ -1095,6 +1095,91 @@ db->db_blkptr, os->os_spa, &zb); } +/** + * \brief Create a new dbuf dirty record for this transaction. + * + * \param db The dbuf to create the dirty record for + * \param tx The transaction to create the dirty record on + * + * \invariant The dbuf mutex must be held. + * \invariant The dnode must be referenced. + * \invariant A dirty record must not already exist for the transaction's + * transaction group. + */ +dbuf_dirty_record_t * +dirty_record_create(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; + dnode_t *dn; + + /* Check the invariants. */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(DB_DNODE_HELD(db)); + dr = list_head(&db->db_dirty_records); + ASSERT(dr == NULL || dr->dr_txg != tx->tx_txg); + + dn = DB_DNODE(db); + dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + + if (db->db_level == 0) { + void *data_old = db->db_buf; + dbuf_dirty_leaf_t *dl = &dr->dt.dl; + + if (db->db_state != DB_NOFILL) { + if (db->db_blkid == DMU_BONUS_BLKID) { + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db.db_data; + } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { + /* + * Release the data buffer from the cache so + * that we can modify it without impacting + * possible other users of this cached data + * block. Note that indirect blocks and + * private objects are not released until the + * syncing state (since they are only modified + * then). + */ + arc_release(db->db_buf, db); + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db_buf; + } + ASSERT(data_old != NULL); + } + dr->dt.dl.dr_data = data_old; + list_create(&dl->write_ranges, sizeof(dbuf_dirty_range_t), + offsetof(dbuf_dirty_range_t, write_range_link)); + } else { + mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&dr->dt.di.dr_children, + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + dr->dr_dbuf = db; + dr->dr_txg = tx->tx_txg; + list_insert_head(&db->db_dirty_records, dr); + + /* + * Make sure that if this block was marked to be freed in this + * transaction group, that we revert that change. + */ + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + db->db_freed_in_flight = FALSE; + } + + /* + * This buffer is now part of this txg + */ + dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); + db->db_dirtycnt += 1; + ASSERT3U(db->db_dirtycnt, <=, TXG_CONCURRENT_STATES); + + return (dr); +} + /** \brief Mark a dbuf as dirty. */ dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) @@ -1241,64 +1326,7 @@ * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. 
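 	 * For example, a buffer dirtied in txg 10 and dirtied again in
 	 * txg 11 needs two data copies: the txg 10 copy is what the
 	 * syncer writes out, while the txg 11 copy absorbs new writes.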
*/ - dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); - - if (db->db_level == 0) { - void *data_old = db->db_buf; - dbuf_dirty_leaf_t *dl = &dr->dt.dl; - - if (db->db_state != DB_NOFILL) { - if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db.db_data; - } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { - /* - * Release the data buffer from the cache so - * that we can modify it without impacting - * possible other users of this cached data - * block. Note that indirect blocks and - * private objects are not released until the - * syncing state (since they are only modified - * then). - */ - arc_release(db->db_buf, db); - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db_buf; - } - ASSERT(data_old != NULL); - } - dr->dt.dl.dr_data = data_old; - list_create(&dl->write_ranges, sizeof(dbuf_dirty_range_t), - offsetof(dbuf_dirty_range_t, write_range_link)); - } else { - mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&dr->dt.di.dr_children, - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - } - dr->dr_dbuf = db; - dr->dr_txg = tx->tx_txg; - list_insert_head(&db->db_dirty_records, dr); - - /* - * We could have been freed_in_flight between the dbuf_noread - * and dbuf_dirty. We win, as though the dbuf_noread() had - * happened after the free. - */ - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - db->db_blkid != DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, 1, tx); - mutex_exit(&dn->dn_mtx); - db->db_freed_in_flight = FALSE; - } - - /* - * This buffer is now part of this txg - */ - dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); - db->db_dirtycnt += 1; - ASSERT3U(db->db_dirtycnt, <=, 3); + dr = dirty_record_create(db, tx); mutex_exit(&db->db_mtx); Change 515658 by willa@willa_repo on 2011/11/30 20:41:15 Move the large block of verification out of dbuf_dirty(). Rename DMU_TX_DIRTY_BUF() to DMU_TX_VERIFY_DIRTY_BUF() to denote that it is in fact not a potentially destructive call. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#14 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#2 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#4 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h#2 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h#2 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#14 (text) ==== @@ -1180,24 +1180,16 @@ return (dr); } -/** \brief Mark a dbuf as dirty. */ -dbuf_dirty_record_t * -dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +void +dbuf_dirty_verify(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn; - objset_t *os; - dbuf_dirty_record_t *dr; - int drop_struct_lock = FALSE; - boolean_t do_free_accounting = B_FALSE; - int txgoff = tx->tx_txg & TXG_MASK; + dnode_t *dn = DB_DNODE(db); - /* Ensure that this dbuf has no transaction groups or holds */ + /* Ensure that this dbuf has a transaction group and a hold */ ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); - DMU_TX_DIRTY_BUF(tx, db); + DMU_TX_VERIFY_DIRTY_BUF(tx, db); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. 
Private
 * objects may be dirtied in syncing context, but only if they
@@ -1207,16 +1199,9 @@
 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    dn->dn_objset->os_dsl_dataset == NULL);
-	/*
-	 * We make this assert for private objects as well, but after we
-	 * check if we're already dirty.  They are allowed to re-dirty
-	 * in syncing context.
-	 */
-	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
-	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
-	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+	DNODE_VERIFY_DIRTYCTX(dn, tx);
 
-	mutex_enter(&db->db_mtx);
 	/*
 	 * XXX make this true for indirects too?  The problem is that
 	 * transactions created with dmu_tx_create_assigned() from
@@ -1229,20 +1214,26 @@
 	ASSERT(db->db_level != 0 ||
 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
 	    db->db_state == DB_NOFILL);
+}
 
-	mutex_enter(&dn->dn_mtx);
-	/*
-	 * Don't set dirtyctx to SYNC if we're just modifying this as we
-	 * initialize the objset.
-	 */
-	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
-	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
-		dn->dn_dirtyctx =
-		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
-		ASSERT(dn->dn_dirtyctx_firstset == NULL);
-		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
-	}
-	mutex_exit(&dn->dn_mtx);
+/** \brief Mark a dbuf as dirty. */
+dbuf_dirty_record_t *
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	objset_t *os;
+	dbuf_dirty_record_t *dr;
+	int drop_struct_lock = FALSE;
+	boolean_t do_free_accounting = B_FALSE;
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	mutex_enter(&db->db_mtx);
+
+	dbuf_dirty_verify(db, tx);
+	dnode_set_dirtyctx(dn, tx);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		dn->dn_have_spill = B_TRUE;
@@ -1284,9 +1275,7 @@
 	/*
 	 * Only valid if not already dirty.
 	 */
-	ASSERT(dn->dn_object == 0 ||
-	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
-	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+	DNODE_VERIFY_DIRTYCTX(dn, tx);
 
 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#2 (text) ====

@@ -801,7 +801,7 @@
 
 #ifdef ZFS_DEBUG
 void
-dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+dmu_tx_verify_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 {
 	dmu_tx_hold_t *txh;
 	int match_object = FALSE, match_offset = FALSE;

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#4 (text) ====

@@ -1457,6 +1457,32 @@
 	rw_downgrade(&dn->dn_struct_rwlock);
 }
 
+/**
+ * \brief Set the dnode's dirty context, if it is not already set.
+ *
+ * \param dn	Dnode whose dirty context is to be set.
+ * \param tx	Transaction the dnode is being dirtied in.
+ */
+void
+dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx)
+{
+
+	mutex_enter(&dn->dn_mtx);
+	/*
+	 * Don't set dirtyctx to SYNC if we're just modifying this as we
+	 * initialize the objset.
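+	 * (While the objset is being created, its root block pointer is
+	 * still a hole, which is what the BP_IS_HOLE check below detects.)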
+	 */
+	if (dn->dn_dirtyctx == DN_UNDIRTIED) {
+		if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+			if (dmu_tx_is_syncing(tx))
+				dn->dn_dirtyctx = DN_DIRTY_SYNC;
+			else
+				dn->dn_dirtyctx = DN_DIRTY_OPEN;
+		}
+	}
+	mutex_exit(&dn->dn_mtx);
+}
+
 void
 dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
 {

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h#2 (text) ====

@@ -130,14 +130,14 @@
 int dmu_tx_private_ok(dmu_tx_t *tx);
 void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object);
 void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta);
-void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
+void dmu_tx_verify_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
 int dmu_tx_holds(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
 
 #ifdef ZFS_DEBUG
-#define	DMU_TX_DIRTY_BUF(tx, db)	dmu_tx_dirty_buf(tx, db)
+#define	DMU_TX_VERIFY_DIRTY_BUF(tx, db)	dmu_tx_verify_dirty_buf(tx, db)
 #else
-#define	DMU_TX_DIRTY_BUF(tx, db)
+#define	DMU_TX_VERIFY_DIRTY_BUF(tx, db)
 #endif
 
 #ifdef __cplusplus

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h#2 (text) ====

@@ -266,6 +266,7 @@
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
+void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx);
 void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
@@ -290,6 +291,12 @@
     int minlvl, uint64_t blkfill, uint64_t txg);
 void dnode_evict_dbufs(dnode_t *dn);
 
+#define	DNODE_VERIFY_DIRTYCTX(dn, tx)					\
+	ASSERT((dn)->dn_object == 0 ||					\
+	    (dn)->dn_dirtyctx == DN_UNDIRTIED ||			\
+	    (dn)->dn_dirtyctx ==					\
+	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN))
+
 #ifdef ZFS_DEBUG
 
 /*

Change 515681 by willa@willa_repo on 2011/12/01 10:53:04

	Break up dbuf_dirty() a little more.

	Introduce dbuf_dirty_parent(), whose job is to propagate the dirty
	state up the dbuf chain to the dnode itself.  This is a fairly
	self-contained segment of dbuf_dirty() in which the dnode's
	struct_rwlock may be grabbed and dropped.  It requires a fair
	amount of record keeping, especially for indirect blocks.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#16 edit

Differences ...

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#16 (text) ====

@@ -1225,6 +1225,95 @@
 	    db->db_state == DB_NOFILL);
 }
 
+/**
+ * \brief Dirty the dbuf's parent.
+ *
+ * \param db	The dbuf whose parent needs to be dirtied
+ * \param tx	The transaction to dirty the parent for
+ * \param dr	The applicable dirty record
+ *
+ * \invariant	The dbuf's dnode must be referenced by the caller.
+ *
+ * If the dnode's struct_rwlock is not held, it will be grabbed and dropped
+ * within this function.
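+ *
+ * When the parent is an indirect block it is dirtied recursively via
+ * dbuf_dirty(), so the walk can continue all the way up to the
+ * dnode's top-level block.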
+ */ +static void +dbuf_dirty_parent(dmu_buf_impl_t *db, dmu_tx_t *tx, dbuf_dirty_record_t *dr) +{ + dnode_t *dn; + int drop_struct_lock = FALSE; + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(DB_DNODE_HELD(db)); + dn = DB_DNODE(db); + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + + if (db->db_level+1 < dn->dn_nlevels) { + /* The dbuf's parent is an indirect block */ + dmu_buf_impl_t *parent = db->db_parent; + dbuf_dirty_record_t *di; + int parent_held = FALSE; + + /* Get a hold on the parent before dropping struct_rwlock */ + if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + parent = dbuf_hold_level(dn, db->db_level+1, + db->db_blkid >> epbs, FTAG); + ASSERT(parent != NULL); + parent_held = TRUE; + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + + ASSERT3U(db->db_level+1, ==, parent->db_level); + di = dbuf_dirty(parent, tx); + if (parent_held) + dbuf_rele(parent, FTAG); + + /* + * Update the dirty record to add this dbuf to its parent's + * dirty record's list of dirty children. + */ + mutex_enter(&db->db_mtx); + /* possible race with dbuf_undirty() */ + if (list_head(&db->db_dirty_records) == dr || + dn->dn_object == DMU_META_DNODE_OBJECT) { + mutex_enter(&di->dt.di.dr_mtx); + ASSERT3U(di->dr_txg, ==, tx->tx_txg); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&di->dt.di.dr_children, dr); + mutex_exit(&di->dt.di.dr_mtx); + dr->dr_parent = di; + } + mutex_exit(&db->db_mtx); + } else { + /* The dbuf's parent is the dnode */ + ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_blkid < dn->dn_nblkptr); + ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); + /* + * Update the dnode's list of dirty records to include this + * dbuf's dirty record. + */ + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + } +} + /** \brief Mark a dbuf as dirty. 
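 *
 * \returns The dirty record for this transaction's txg.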
*/ dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) @@ -1232,7 +1321,6 @@ dnode_t *dn; objset_t *os; dbuf_dirty_record_t *dr; - int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; @@ -1353,59 +1441,7 @@ dnode_willuse_space(dn, -willfree, tx); } - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - - if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); - ASSERT(dn->dn_maxblkid >= db->db_blkid); - } - - if (db->db_level+1 < dn->dn_nlevels) { - dmu_buf_impl_t *parent = db->db_parent; - dbuf_dirty_record_t *di; - int parent_held = FALSE; - - if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - parent = dbuf_hold_level(dn, db->db_level+1, - db->db_blkid >> epbs, FTAG); - ASSERT(parent != NULL); - parent_held = TRUE; - } - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); - ASSERT3U(db->db_level+1, ==, parent->db_level); - di = dbuf_dirty(parent, tx); - if (parent_held) - dbuf_rele(parent, FTAG); - - mutex_enter(&db->db_mtx); - /* possible race with dbuf_undirty() */ - if (list_head(&db->db_dirty_records) == dr || - dn->dn_object == DMU_META_DNODE_OBJECT) { - mutex_enter(&di->dt.di.dr_mtx); - ASSERT3U(di->dr_txg, ==, tx->tx_txg); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&di->dt.di.dr_children, dr); - mutex_exit(&di->dt.di.dr_mtx); - dr->dr_parent = di; - } - mutex_exit(&db->db_mtx); - } else { - ASSERT(db->db_level+1 == dn->dn_nlevels); - ASSERT(db->db_blkid < dn->dn_nblkptr); - ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); - mutex_enter(&dn->dn_mtx); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&dn->dn_dirty_records[txgoff], dr); - mutex_exit(&dn->dn_mtx); - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); - } + dbuf_dirty_parent(db, tx, dr); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); Change 515758 by willa@willa_repo on 2011/12/01 17:58:07 Clarify a path in dbuf_write_done(). Note that we need to update dmu_sync() to handle the possibility that it may be working with a dbuf that has not been CACHED yet. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#17 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#7 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#17 (text) ==== @@ -3177,6 +3177,10 @@ DB_DNODE_EXIT(db); if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * An immediate write has occurred via dmu_sync, which means + * its block pointer override needs to be handled here. + */ ASSERT(db->db_state != DB_NOFILL); dr->dr_zio = zio_write(pio, os->os_spa, txg, db->db_blkptr, data->b_data, arc_buf_size(data), &zp, ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#7 (text) ==== @@ -1470,6 +1470,14 @@ return (ENOENT); } + /* XXX + * If the dirty record is not CACHED, then the dmu_sync call needs + * to wait for it to reach that state. Thus, it needs to issue the + * READ if necessary. Then, it needs to wait for that read to + * complete, which should cause the dbuf to become CACHED after + * resolving the applicable write ranges. 
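+	 *
+	 * (Roughly: transition the dbuf to READ if it is unresolved, then
+	 * cv_wait on db_changed until the READ bit clears.)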
+ */ + ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { Change 516097 by willa@willa_repo on 2011/12/06 09:20:24 Modify the dbuf state machine in preparation for async reads. Currently, this still performs sync reads inline with writes. Removing the temporary wait that makes the reads synchronous leads to violated asserts. Those asserts imply that this change does not yet properly manage the ARC buffers, so when they're evicted, ARC discovers that some buffers were written to after being frozen. This changeset is intended primarily as a WIP checkpoint, and appears to remain functional without leaks etc in basic testing. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - Implement write range merging: - static void dbuf_resolve_ranges(dmu_buf_impl_t*, arc_buf_t*): - Iterate on all dirty records, oldest first, merging the provided buffer using write ranges, if any. - The initial ARC buffer used for data is provided; then each dirty record's ARC buffer is used for the next dirty record. - static void dbuf_merge_write_ranges(dbuf_dirty_leaf_t*, arc_buf_t*): - Perform an inverse merge on a given dirty leaf record. - Uses a hole iterator to simplify the function, which otherwise required two data copy sections. - static inline dbuf_dirty_record_hole_t * dbuf_dirty_record_hole_itr_next(dbuf_dirty_record_hole_itr_t*): - Update the hole iterator to provide the next available hole using the write ranges provided. - static inline void dbuf_dirty_record_hole_itr_init(dbuf_dirty_record_hole_itr_t*, dbuf_dirty_leaf_t*, arc_buf_t*): - Initialize the hole iterator using the dirty leaf record and ARC buffer containing source data. - Update dbuf_read_done(): - Accept that by the time the read done callback is called, the dbuf may have already reached the CACHED state, by writes that have managed to fill the buffer completely. - Otherwise, if the zio had no I/O errors, either resolve write ranges (if any exist) using the provided ARC buffer, or simply set the dbuf's ARC buffer to that buffer. - Assert that in the else condition, db_state is READ. - Update several places that need to expand their db_state checks. - Update several places that need to wait while another writer is modifying the buffer. - Modify dbuf_noread() to set the FILL state bit. - Modify dbuf_fix_old_data() so that it treats a buffer that has more holds than dirties the same as a buffer that has any outstanding dirty ranges. - Modify dbuf_free_range() so that it only sets freed_in_flight if the FILL state bit is set. For the READ state, fall through and make the buffer zero filled so it behaves as expected. - Add static void dbuf_dirty_record_truncate_ranges(dbuf_dirty_record_t *, int): This function chops off write ranges for a given dirty record using the new size. It was added to support making dbuf_new_size() nonblocking. For now, however, we're leaving that alone. - Change dirty_record_create() to dbuf_dirty_record_create(). - Modify dbuf_dirty_record_create(): - If no ARC buffer has been allocated, allocate one, otherwise, fix up the old ARC buffer data. - Modify dbuf_dirty_record_cleanup_ranges() so that it no longer destroys the range list. That is now the responsibility of the various locations that clean up the dirty record itself. - Modify dbuf_dirty_record_add_range(): If a new write range will cause the buffer to be filled entirely, set the dbuf state to FILL. 
- Add dbuf_transition_to_read(dmu_buf_impl_t*, int): Transition to the READ state if the dbuf's state meets a given state requirement, issuing an async read if it does so. - Modify dbuf_will_dirty_range(): - If the dbuf is a special one (i.e. not a data block), pass it off to the old dbuf_will_dirty(). - Remove most of the debug counters previously added. - Call dbuf_transition_to_read() if UNCACHED. - Add a few disabled conditionals for the PARTIAL state. - Temporarily: wait for the async read. - Set the FILL state bit. - If the dbuf is not CACHED, add a write range. - Modify dbuf_fill_done() so that it will now clear the FILL state bit if any other bit is set. - Modify dbuf_assign_arcbuf() to handle an unresolved dbuf. - Modify dbuf_sync_leaf(): - If the dbuf has not yet transitioned to CACHED, make it so. - Split out the bonus buffer handling to dbuf_sync_bonus(). sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: - Call dbuf_fill_done() unconditionally where dbuf_will_dirty_range() was called. dbuf_fill_done() is now smart enough to understand those scenarios. - Modify dmu_sync() so that if a dbuf is not CACHED, it transitions it to READ if necessary and waits for the resolution. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h: - Add a few new state shorthands: - DB_IN_FLIGHT: Buffer has had writes or reads issued. - DB_UNRESOLVED: Buffer does not have fully valid data. - Add dbuf_transition_to_read() prototype. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#18 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#8 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#9 edit Differences ... 
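	A standalone sketch of the inverse merge at the core of this change
	(hypothetical, simplified types; the real code below walks a list_t
	of dbuf_dirty_range_t through the hole iterator, but the arithmetic
	is the same):

		#include <string.h>

		struct range { int offset; int size; };

		/*
		 * Copy bytes from the old buffer into the dirty buffer
		 * wherever the dirty buffer is NOT covered by a write
		 * range (a "hole").  Ranges must be sorted and
		 * non-overlapping, which the range insertion code
		 * guarantees by merging overlaps.
		 */
		static void
		merge_holes(char *dirty, const char *old, int bufsize,
		    const struct range *r, int nranges)
		{
			int hole_start = 0;
			int i;

			for (i = 0; i < nranges; i++) {
				if (r[i].offset > hole_start)
					memcpy(dirty + hole_start,
					    old + hole_start,
					    r[i].offset - hole_start);
				hole_start = r[i].offset + r[i].size;
			}
			if (hole_start < bufsize)
				memcpy(dirty + hole_start, old + hole_start,
				    bufsize - hole_start);
		}

	Applied oldest dirty record first, with each record's buffer then
	serving as the "old" data for the next record, this reconstructs a
	fully valid buffer once the block's read completes.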
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#18 (text) ==== @@ -72,30 +72,9 @@ */ static dbuf_hash_table_t dbuf_hash_table; int64_t dirty_ranges_in_flight; -uint64_t async_read_wait_loops; -uint64_t will_dirty_uncached; -uint64_t will_dirty_cached; -uint64_t will_dirty_read; -uint64_t will_dirty_nofill; -uint64_t will_dirty_fill; -uint64_t will_dirty_evicting; SYSCTL_DECL(_vfs_zfs); SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dirty_ranges_in_flight, CTLFLAG_RD, &dirty_ranges_in_flight, 0, "number of dirty ranges in flight"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, async_read_wait_loops, CTLFLAG_RD, - &async_read_wait_loops, 0, "number of times async reads waited"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_uncached, CTLFLAG_RD, - &will_dirty_uncached, 0, "will dirty uncached"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_cached, CTLFLAG_RD, - &will_dirty_cached, 0, "will dirty cached"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_read, CTLFLAG_RD, - &will_dirty_read, 0, "will dirty read"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_nofill, CTLFLAG_RD, - &will_dirty_nofill, 0, "will dirty nofill"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_fill, CTLFLAG_RD, - &will_dirty_fill, 0, "will dirty fill"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, will_dirty_evicting, CTLFLAG_RD, - &will_dirty_evicting, 0, "will dirty evicting"); static uint64_t dbuf_hash_count; @@ -301,13 +280,6 @@ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dirty_ranges_in_flight = 0; - async_read_wait_loops = 0; - will_dirty_uncached = 0; - will_dirty_cached = 0; - will_dirty_read = 0; - will_dirty_nofill = 0; - will_dirty_fill = 0; - will_dirty_evicting = 0; } void @@ -526,6 +498,217 @@ } } +typedef struct dbuf_dirty_record_hole { + caddr_t src; + caddr_t dst; + int size; +} dbuf_dirty_record_hole_t; + +typedef struct dbuf_dirty_record_hole_itr { + /* provided data */ + arc_buf_t *src; + dbuf_dirty_leaf_t *dl; + /* calculated data */ + dbuf_dirty_range_t *range; + /* One greater than the last valid offset in the dst buffer */ + int max_offset; + int hole_start; + dbuf_dirty_record_hole_t hole; +} dbuf_dirty_record_hole_itr_t; + +/** + * \brief Initialize a dirty record hole iterator. + * + * \param itr Iterator context to initialize. + * \param dl Dirty leaf to merge. + * \param src_buf ARC buffer containing initial data. + */ +static inline void +dbuf_dirty_record_hole_itr_init(dbuf_dirty_record_hole_itr_t *itr, + dbuf_dirty_leaf_t *dl, arc_buf_t *src_buf) +{ + itr->src = src_buf; + itr->dl = dl; + /* XXX This shouldn't be needed, test just to see if maybe these + * buffers are the ones getting modified after freeze */ + arc_buf_thaw(dl->dr_data); + itr->max_offset = MIN(arc_buf_size(src_buf), arc_buf_size(dl->dr_data)); + itr->range = list_head(&dl->write_ranges); + itr->hole.src = NULL; + itr->hole.dst = NULL; + itr->hole.size = 0; + /* If no ranges exist, the dirty buffer is entirely valid. */ + itr->hole_start = (itr->range == NULL) ? itr->max_offset : 0; +} + +/** + * \brief Iterate a dirty record, providing the next hole. + * + * \param itr Dirty record hole iterator context. + * + * The hole returned provides direct pointers to the source, destination, + * and the target size. A hole is a portion of the dirty record's ARC + * buffer that does not contain valid data and must be filled in using the + * initial ARC buffer, which should be entirely valid. + * + * \return NULL If there are no more holes. 
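+ *
+ * For example, with max_offset 512 and write ranges
+ * { offset 100, size 50 } and { offset 300, size 100 }, successive
+ * calls yield the holes [0, 100), [150, 300), and [400, 512).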
+ */ +static inline dbuf_dirty_record_hole_t * +dbuf_dirty_record_hole_itr_next(dbuf_dirty_record_hole_itr_t *itr) +{ + + if (itr->hole_start >= itr->max_offset) + return (NULL); + + itr->hole.src = (caddr_t)(itr->src->b_data) + itr->hole_start; + itr->hole.dst = (caddr_t)(itr->dl->dr_data->b_data) + itr->hole_start; + if (itr->range != NULL) { + itr->hole.size = MIN(itr->max_offset, itr->range->offset) - + itr->hole_start; + itr->hole_start = itr->range->offset + itr->range->size; + itr->range = list_next(&itr->dl->write_ranges, itr->range); + } else { + itr->hole.size = itr->max_offset - itr->hole_start; + itr->hole_start = itr->max_offset; + } + return (&itr->hole); +} + + +/** + * \brief Merge write ranges for a dirty record. + * + * \param dl Dirty leaf record to merge the old buffer to. + * \param buf The old ARC buffer to use for missing data. + * + * This function performs an inverse merge. The write ranges provided + * indicate valid data in the dirty leaf's buffer, which means the old + * buffer has to be copied over exclusive of those ranges. + */ +static void +dbuf_merge_write_ranges(dbuf_dirty_leaf_t *dl, arc_buf_t *old_buf) +{ + dbuf_dirty_record_hole_itr_t itr; + dbuf_dirty_record_hole_t *hole; + + ASSERT3P(dl, !=, NULL); + /* If there are no write ranges, we're done. */ + if (list_is_empty(&dl->write_ranges)) + return; + /* If there are write ranges, there must be an ARC buffer. */ + ASSERT(dl->dr_data != NULL); + + /* + * We use an iterator here because it simplifies the logic + * considerably for this function. + */ + dbuf_dirty_record_hole_itr_init(&itr, dl, old_buf); + + while ((hole = dbuf_dirty_record_hole_itr_next(&itr)) != NULL) + memcpy(hole->src, hole->dst, hole->size); +} + +/** + * \brief Resolve a dbuf using its ranges and the filled ARC buffer provided. + * + * \param db Dbuf to resolve. + * \param buf ARC buffer to use to resolve. + * + * This routine is called after a read completes. The results of the read + * are stored in the ARC buffer. It will then merge writes in the order + * that they occurred, cleaning up write ranges as it goes. + */ +static void +dbuf_resolve_ranges(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + dbuf_dirty_record_t *dr; + dbuf_dirty_leaf_t *dl; + arc_buf_t *old_buf; + + /* No range data is kept for non data blocks. */ + ASSERT3U(db->db_level, ==, 0); + + /* + * Start with the oldest dirty record, merging backwards. For the + * first dirty record, the provided ARC buffer is the "old" buffer. + * + * In turn, the older buffer is copied to the newer one, using an + * inverse of the newer one's write ranges. + */ + dr = list_tail(&db->db_dirty_records); + old_buf = buf; + while (dr != NULL) { + dl = &dr->dt.dl; + /* No merges actually occur if the range list is empty */ + dbuf_merge_write_ranges(dl, old_buf); + /* The ranges were just merged, so they're now obsolete */ + dbuf_dirty_record_cleanup_ranges(dr); + old_buf = dl->dr_data; + dr = list_prev(&db->db_dirty_records, dr); + } +} + +static void +dbuf_merge_write_ranges(dbuf_dirty_leaf_t *dl, arc_buf_t *old_buf) +{ + dbuf_dirty_record_hole_itr_t itr; + dbuf_dirty_record_hole_t *hole; + + ASSERT3P(dl, !=, NULL); + /* If there are no write ranges, we're done. */ + if (list_is_empty(&dl->write_ranges)) + return; + /* If there are write ranges, there must be an ARC buffer. */ + ASSERT(dl->dr_data != NULL); + + /* + * We use an iterator here because it simplifies the logic + * considerably for this function. 
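+	 *
+	 * Each hole found is copied from the old buffer into the dirty
+	 * record's buffer; bytes covered by a write range are left
+	 * untouched, which is what makes this an inverse merge.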
+ */ + dbuf_dirty_record_hole_itr_init(&itr, dl, old_buf); + + while ((hole = dbuf_dirty_record_hole_itr_next(&itr)) != NULL) + memcpy(hole->src, hole->dst, hole->size); +} + +/** + * \brief Resolve a dbuf using its ranges and the filled ARC buffer provided. + * + * \param db Dbuf to resolve. + * \param buf ARC buffer to use to resolve. + * + * This routine is called after a read completes. The results of the read + * are stored in the ARC buffer. It will then merge writes in the order + * that they occurred, cleaning up write ranges as it goes. + */ +static void +dbuf_resolve_ranges(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + dbuf_dirty_record_t *dr; + dbuf_dirty_leaf_t *dl; + arc_buf_t *old_buf; + + /* No range data is kept for non data blocks. */ + ASSERT3U(db->db_level, ==, 0); + + /* + * Start with the oldest dirty record, merging backwards. For the + * first dirty record, the provided ARC buffer is the "old" buffer. + * + * In turn, the older buffer is copied to the newer one, using an + * inverse of the newer one's write ranges. + */ + dr = list_tail(&db->db_dirty_records); + old_buf = buf; + while (dr != NULL) { + dl = &dr->dt.dl; + dbuf_merge_write_ranges(dl, old_buf); + dbuf_dirty_record_cleanup_ranges(dr); + old_buf = dl->dr_data; + dr = list_prev(&db->db_dirty_records, dr); + } +} + static void dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) { @@ -535,25 +718,29 @@ while (db->db_state & DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); - /* - * All reads are synchronous, so we must have a hold on the dbuf - */ + /* Any reads or writes must have a hold on this dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - if (db->db_level == 0 && db->db_freed_in_flight) { - /* we were freed in flight; disregard any error */ - arc_release(buf, db); - bzero(buf->b_data, db->db.db_size); - arc_buf_freeze(buf); - db->db_freed_in_flight = FALSE; - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; + + if (db->db_state == DB_CACHED) { + /* + * Read data became obsolete when a writer finished filling + * the buffer. + */ + VERIFY(arc_buf_remove_ref(buf, db) == 1); } else if (zio == NULL || zio->io_error == 0) { - dbuf_set_data(db, buf); + if (!list_is_empty(&db->db_dirty_records)) { + dbuf_resolve_ranges(db, buf); + /* The read data buffer is no longer needed */ + VERIFY(arc_buf_remove_ref(buf, db) == 1); + } else { + /* Read issued, no associated writes */ + ASSERT(db->db_state == DB_READ); + dbuf_set_data(db, buf); + } db->db_state = DB_CACHED; } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_state == DB_READ); ASSERT3P(db->db_buf, ==, NULL); VERIFY(arc_buf_remove_ref(buf, db) == 1); db->db_state = DB_UNCACHED; @@ -577,8 +764,7 @@ /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); - /* XXX Must be changed to allow for PARTIAL */ - ASSERT(db->db_state == DB_UNCACHED); + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_PARTIAL); ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { @@ -674,7 +860,9 @@ DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); - /* XXX Need to add PARTIAL case, or merge with one of these */ + while (db->db_state & DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_CACHED) { mutex_exit(&db->db_mtx); if (prefetch) @@ -683,7 +871,7 @@ if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); - } else if (db->db_state == DB_UNCACHED) { + } else if (db->db_state & DB_UNRESOLVED) { spa_t *spa = dn->dn_objset->os_spa; if (zio == NULL) @@ -722,8 +910,8 @@ /* Skip the wait on the caller's request. */ if ((flags & DB_RF_NEVERWAIT) == 0) { mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { + /* Already waited for DB_FILL set above */ + while (db->db_state & DB_IN_FLIGHT) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); cv_wait(&db->db_changed, &db->db_mtx); @@ -762,8 +950,11 @@ * callback to realize that its buffer is no longer needed, so the * fill should still create a single write range covering the buffer. */ - while (db->db_state == DB_READ || db->db_state == DB_FILL) + + /* Wait for another filler to finish. */ + while (db->db_state & DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa; @@ -775,8 +966,10 @@ db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_set_data(db, NULL); + } else if (db->db_state & (DB_READ|DB_PARTIAL)) { + db->db_state |= DB_FILL; } else { - ASSERT3U(db->db_state, ==, DB_CACHED); + ASSERT(db->db_state & (DB_IN_FLIGHT|DB_CACHED)); } mutex_exit(&db->db_mtx); } @@ -813,7 +1006,7 @@ * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, - * or (if there a no active holders) + * or (if there no active holders or unresolved ranges) * just null out the current db_data pointer. */ ASSERT(dr->dr_txg >= txg - 2); @@ -822,7 +1015,8 @@ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); - } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + } else if (refcount_count(&db->db_holds) > db->db_dirtycnt || + !list_is_empty(&dr->dt.dl.write_ranges)) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa; @@ -860,6 +1054,7 @@ zio_free(spa, txg, bp); } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are @@ -936,23 +1131,21 @@ * Our goal is to make the data visible in the current * transaction group all zeros while preserving the data * as seen in any earlier transaction groups. - * - * If the buffer is currently being filled then we cannot - * directly clear the buffer's contents. Instead, we - * signal the filler by setting db_freed_in_flight and - * have it do this work just before transitioning the buffer - * to the CACHED state. 
- * - * If a read is outstanding then the dirty record, if any, - * needs to be processed for this transaction group here, - * rather than deferring it to dbuf_read_done. */ - if (db->db_state == DB_READ || db->db_state == DB_FILL) { - /* will be handled in dbuf_read_done or dbuf_rele */ + if (db->db_state & DB_FILL) { + /* + * If the buffer is currently being filled then we + * cannot directly clear the buffer's contents. + * Instead, we signal the filler by setting + * db_freed_in_flight and having dbuf_fill_done do + * this work just before transitioning the buffer to + * the CACHED state. + */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } + /* All consumers are finished, so evict the buffer */ if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_clear(db); @@ -971,7 +1164,9 @@ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; + /* Handle intermediate dmu_sync() calls. */ dbuf_unoverride(dr); + dbuf_dirty_record_cleanup_ranges(dr); } else { /* * This dbuf is not dirty in the open context. @@ -981,9 +1176,20 @@ */ dbuf_fix_old_data(db, txg); } + } else if (db->db_state != DB_CACHED) { + /* Create buffer and make it become zero filled. */ + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa; + + ASSERT(db->db_state == DB_READ); + DB_GET_SPA(&spa, db); + dbuf_set_data(db, + arc_buf_alloc(spa, db->db.db_size, db, type)); + db->db_state = DB_CACHED; } - /* clear the contents if its cached */ - if (db->db_state == DB_CACHED) { + /* dbuf_fix_old_data above may go to DB_UNCACHED */ + if (db->db_state != DB_UNCACHED) { + /* clear the contents */ ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); bzero(db->db.db_data, db->db.db_size); @@ -1027,10 +1233,40 @@ return (FALSE); } +static void +dbuf_dirty_record_truncate_ranges(dbuf_dirty_record_t *dr, int new_size) +{ + dbuf_dirty_leaf_t *dl; + dbuf_dirty_range_t *range; + + ASSERT(MUTEX_HELD(&dr->dr_dbuf->db_mtx)); + if (dr->dr_dbuf->db_level != 0) + return; + + dl = &dr->dt.dl; + for (;;) { + range = list_tail(&dl->write_ranges); + + if (range->offset >= new_size) { + list_remove(&dl->write_ranges, range); + kmem_free(range, sizeof(dbuf_dirty_range_t)); + continue; + } + + /* + * Update the last range that could be affected by + * this truncation. Its size changes only if it + * extends past the end of the new buffer's size. + */ + range->size = MIN(new_size - range->offset, range->size); + break; + } +} + void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { - arc_buf_t *buf, *obuf; + arc_buf_t *buf, *old_buf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; @@ -1053,21 +1289,22 @@ * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ + /* XXX this needs to be made nonblocking */ dbuf_will_dirty(db, tx); /* create the data buffer for the new block */ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ - obuf = db->db_buf; - bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); + old_buf = db->db_buf; + bcopy(old_buf->b_data, buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db) == 1); + VERIFY(arc_buf_remove_ref(old_buf, db) == 1); db->db.db_size = size; if (db->db_level == 0) { @@ -1116,7 +1353,7 @@ * transaction group. 
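 *
 * \returns The new dirty record, inserted at the head of the dbuf's
 * dirty record list.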
 */
 dbuf_dirty_record_t *
-dirty_record_create(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dbuf_dirty_record_create(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 	dnode_t *dn;
@@ -1148,9 +1385,29 @@
 				 * syncing state (since they are only modified
 				 * then).
 				 */
-				arc_release(db->db_buf, db);
-				dbuf_fix_old_data(db, tx->tx_txg);
-				data_old = db->db_buf;
+				/* XXX
+				 * isn't this wrong?
+				 * in the new world order, just because we
+				 * have an arc buf doesn't mean we don't
+				 * want a separate buffer for new DRs.  This
+				 * could be why we are writing buffers after
+				 * they have been frozen?
+				 */
+				if (db->db_buf != NULL) {
+					arc_release(db->db_buf, db);
+					dbuf_fix_old_data(db, tx->tx_txg);
+					data_old = db->db_buf;
+				} else {
+					int size = db->db.db_size;
+					arc_buf_contents_t type;
+					spa_t *spa;
+
+					DB_GET_SPA(&spa, db);
+					type = DBUF_GET_BUFC_TYPE(db);
+					data_old = arc_buf_alloc(spa, size,
+					    db, type);
+					dbuf_set_data(db, data_old);
+				}
 			}
 			ASSERT(data_old != NULL);
 		}
@@ -1174,7 +1431,7 @@
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_blkid != DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
-		dnode_clear_range(dn, db->db_blkid, 1, tx);
+		dnode_clear_range(dn, db->db_blkid, /*nblks*/1, tx);
 		mutex_exit(&dn->dn_mtx);
 		db->db_freed_in_flight = FALSE;
 	}
@@ -1216,13 +1473,9 @@
 	 * transactions created with dmu_tx_create_assigned() from
 	 * syncing context don't bother holding ahead.
 	 */
-	/* XXX
-	 * This should be valid for PARTIAL and READ too, since writes will
-	 * still be able to occur in those states.
-	 */
 	ASSERT(db->db_level != 0 ||
 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
-	    db->db_state == DB_NOFILL);
+	    db->db_state == DB_NOFILL || db->db_state & DB_IN_FLIGHT);
 }
 
@@ -1356,13 +1609,14 @@
 	DB_DNODE_EXIT(db);
 
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
-		/*
-		 * If this buffer has already been written out,
-		 * we now need to reset its state.
-		 */
+		/* Reset immediate write sync state if needed */
 		dbuf_unoverride(dr);
 		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
 		    db->db_state != DB_NOFILL)
+			/*
+			 * Notify ARC that the buffer will be
+			 * modified, requiring a new checksum.
+			 */
 			arc_buf_thaw(db->db_buf);
 	}
 	mutex_exit(&db->db_mtx);
@@ -1412,7 +1666,7 @@
 	 * to make a copy of it so that the changes we make in this
 	 * transaction group won't leak out when we sync the older txg.
 	 */
-	dr = dirty_record_create(db, tx);
+	dr = dbuf_dirty_record_create(db, tx);
 
 	mutex_exit(&db->db_mtx);
 
@@ -1477,7 +1731,6 @@
 		atomic_subtract_64(&dirty_ranges_in_flight, 1);
 		ASSERT(dirty_ranges_in_flight >= 0);
 	}
-	list_destroy(&dl->write_ranges);
 }
 
@@ -1577,6 +1830,8 @@
 		list_destroy(&dr->dt.di.dr_children);
 	}
 	dbuf_dirty_record_cleanup_ranges(dr);
+	if (db->db_level == 0)
+		list_destroy(&dr->dt.dl.write_ranges);
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
@@ -1629,6 +1884,7 @@
 void
 dbuf_dirty_record_add_range(dbuf_dirty_record_t *dr, int offset, int size)
 {
+	dmu_buf_impl_t *db;
 	struct dbuf_dirty_leaf *dl;
 	dbuf_dirty_range_t *old_range, *new_range;
 	int new_end, old_end;
@@ -1636,8 +1892,10 @@
 	/* Write ranges do not apply to indirect blocks. */
 	ASSERT(dr->dr_dbuf->db_level == 0);
+	db = dr->dr_dbuf;
+	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	dl = &dr->dt.dl;
 
 	/* Discover whether an existing entry overlaps.
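	 * Overlapping entries are merged in place: the existing entry is
	 * widened to the MIN of the offsets and the MAX of the ends, so
	 * adding [0, 100) to an existing [50, 150) leaves a single range
	 * [0, 150).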
*/ old_range = list_head(&dl->write_ranges); @@ -1683,6 +1941,17 @@ */ old_range->offset = MIN(offset, old_range->offset); old_range->size = MAX(new_end, old_end) - old_range->offset; + +#define LIST_HAS_ONE_ENTRY(list) \ + (list_head(list) == list_tail(list) && !list_is_empty(list)) + /* + * If there's a single write range and it now covers the + * entire buffer, the caller will finish filling, so clear + * any READ or PARTIAL bit that may be set. + */ + if (LIST_HAS_ONE_ENTRY(&db->db_dirty_records) && + old_range->offset == 0 && old_range->size == db->db.db_size) + db->db_state = DB_FILL; return; } @@ -1704,6 +1973,44 @@ atomic_add_64(&dirty_ranges_in_flight, 1); } +/** + * \brief Make the dbuf transition to READ if it is in certain states. + * + * \param db Dbuf to transition + * \param state_req States required to perform transition + * + * \invariant The dbuf's mutex must be held. + * + * The dbuf's mutex will be dropped if a read is actually issued. + */ +void +dbuf_transition_to_read(dmu_buf_impl_t *db, int state_req) +{ + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NEVERWAIT; + dnode_t *dn; + zio_t *zio = NULL; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_state & state_req) { + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED); + DB_DNODE_EXIT(db); + } + + /* Issue the asynchronous read, if applicable */ + if (zio != NULL) { + mutex_exit(&db->db_mtx); + (void) dbuf_read(db, zio, rf); + (void) zio_nowait(zio); + mutex_enter(&db->db_mtx); + } +} + #pragma weak dmu_buf_will_dirty_range = dbuf_will_dirty_range /** * \brief Signal intent to dirty a subset of the buffer. @@ -1718,57 +2025,57 @@ void dbuf_will_dirty_range(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size) { - int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; dbuf_dirty_record_t *dr; dnode_t *dn; - zio_t *zio = NULL; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); mutex_enter(&db->db_mtx); - switch(db->db_state) { - case DB_UNCACHED: - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, - ZIO_FLAG_MUSTSUCCEED); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + /* This can't handle special blocks or non level 0 blocks yet */ + if (DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + db->db_blkid == DMU_SPILL_BLKID || + db->db_blkid == DMU_BONUS_BLKID || + db->db_level != 0) { DB_DNODE_EXIT(db); - atomic_add_64(&will_dirty_uncached, 1); - break; - case DB_CACHED: - atomic_add_64(&will_dirty_cached, 1); - break; - case DB_FILL: - atomic_add_64(&will_dirty_fill, 1); - break; - case DB_READ: - atomic_add_64(&will_dirty_read, 1); - break; - case DB_NOFILL: - atomic_add_64(&will_dirty_nofill, 1); - break; - case DB_EVICTING: - atomic_add_64(&will_dirty_evicting, 1); - break; + mutex_exit(&db->db_mtx); + dbuf_will_dirty(db, tx); + return; } - mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); - /* Issue the asynchronous read using our zio */ - if (zio != NULL) { - (void) dbuf_read(db, zio, rf); - (void) zio_nowait(zio); - } + /* + * Only issue a read if we start writing inside the block rather + * than either at the beginning (forward) or end (backward) + */ +#ifdef NOTYET + if (db->db_state == DB_UNCACHED && + ((offset != 0 && offset != db->db.db_size) || + !BP_IS_HOLE(db->db_blkptr))) +#endif + dbuf_transition_to_read(db, DB_UNCACHED); - /* Wait for the async read 
to complete */ - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ) { +#if 0 + /* XXX TEMP Wait for the async read to complete */ + while (db->db_state == DB_READ) cv_wait(&db->db_changed, &db->db_mtx); - atomic_add_64(&async_read_wait_loops, 1); - } - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); +#endif + + /* State transitions not done above, needed for dbuf_dirty */ + /* Transition to PARTIAL if we didn't transition to READ */ +#ifdef NOTYET + if (db->db_state == DB_UNCACHED) + db->db_state = DB_PARTIAL; +#endif + /* + * Now set the FILL bit, since the caller is about to write, but + * only if the dbuf isn't already CACHED. + */ + if (db->db_state & DB_IN_FLIGHT) + db->db_state |= DB_FILL; mutex_exit(&db->db_mtx); dr = dbuf_dirty(db, tx); @@ -1776,7 +2083,8 @@ /* Add the write range to this dbuf. */ mutex_enter(&db->db_mtx); - dbuf_dirty_record_add_range(dr, offset, size); + if (db->db_state != DB_CACHED) + dbuf_dirty_record_add_range(dr, offset, size); mutex_exit(&db->db_mtx); } @@ -1794,6 +2102,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dbuf_dirty_record_t *dr; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); @@ -1804,7 +2113,17 @@ dmu_tx_private_ok(tx)); dbuf_noread(db); - (void) dbuf_dirty(db, tx); + dr = dbuf_dirty(db, tx); + + /* + * Mark the whole buffer as being written to by clearing + * any partial ranges in the dirty record from partial + * writes to this block that occurred earlier in this + * transaction. + */ + mutex_enter(&db->db_mtx); + dbuf_dirty_record_cleanup_ranges(dr); + mutex_exit(&db->db_mtx); } #pragma weak dmu_buf_fill_done = dbuf_fill_done @@ -1812,24 +2131,39 @@ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) { + dbuf_dirty_record_t *dr; + mutex_enter(&db->db_mtx); DBUF_VERIFY(db); + if (db->db_state & DB_FILL) { + /* Find the relevant dirty record to update */ + for (dr = list_head(&db->db_dirty_records); + dr != NULL && dr->dr_txg != tx->tx_txg; + dr = list_next(&db->db_dirty_records, dr)) + ; + ASSERT(dr != NULL); - if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; + db->db_state = DB_CACHED; } - /* XXX - * This function should be called when a FILL is done while - * the steady state is READ or PARTIAL, in which cases the - * state should be reset to those. However, if it's PARTIAL - * and the buffer fully fills, then it can go to CACHED. + /* + * This function can be called with another state bit set, + * but if FILL is the only bit set, then the buffer has been + * fully filled. Otherwise, clear the FILL bit, so it goes + * back to the steady state. */ - db->db_state = DB_CACHED; + if (db->db_state == DB_FILL) + db->db_state = DB_CACHED; + else { + db->db_state &= ~DB_FILL; + ASSERT(db->db_state & DB_IN_FLIGHT); + } + cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); @@ -1843,6 +2177,8 @@ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { + dbuf_dirty_record_t *dr; + ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); @@ -1856,21 +2192,43 @@ mutex_enter(&db->db_mtx); - /* XXX - * The caller has already filled an ARC buffer and wants to assign - * it to this dbuf. 
- * - * Just like dbuf_noread, except the data is already here, so - * there's no need to yield control of the buffer in a FILL state. - * - * If the steady state is PARTIAL, transition to READ to resolve the - * partial writes, issuing the read in the process. - * - * If the steady state is READ, then perform the dirty record work - * needed to save this ARC buffer. + while (db->db_state & DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + + /* If the steady state is PARTIAL, transition to READ. */ + dbuf_transition_to_read(db, DB_PARTIAL); + + /* + * The buffer is waiting for a read, so simply update the associated + * dirty record, using the buffer provided. */ - while (db->db_state == DB_READ || db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_READ) { + for (dr = list_head(&db->db_dirty_records); + dr != NULL && dr->dr_txg != tx->tx_txg; + dr = list_next(&db->db_dirty_records, dr)) + ; + if (dr == NULL) + /* No dirty record for this transaction yet */ + dr = dbuf_dirty_record_create(db, tx); + else { + /* Remove the old dirty data for this transaction */ + dbuf_dirty_record_cleanup_ranges(dr); + if (dr->dt.dl.dr_data != NULL) { + arc_release(buf, db); + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, + db) == 1); + } + } + /* + * Assign the ARC buffer to the dirty record and record its + * write range as covering the whole buffer, so that when + * the READ is later resolved, the new buffer stays. + */ + dr->dt.dl.dr_data = buf; + dbuf_dirty_record_add_range(dr, 0, db->db.db_size); + mutex_exit(&db->db_mtx); + return; + } ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); @@ -1890,7 +2248,7 @@ xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { - dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + dr = list_head(&db->db_dirty_records); ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { @@ -2714,6 +3072,45 @@ } static void +dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx, arc_buf_t **datap) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + uint64_t txg = tx->tx_txg; + dnode_t *dn; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(DB_DNODE_HELD(db)); + ASSERT(db->db_blkid == DMU_BONUS_BLKID); + dn = DB_DNODE(db); + + ASSERT(*datap != NULL); + ASSERT3U(db->db_level, ==, 0); + ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); + bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + DB_DNODE_EXIT(db); + + if (*datap != db->db.db_data) { + zio_buf_free(*datap, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + } + db->db_data_pending = NULL; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); + ASSERT(dr->dr_dbuf == db); + list_remove(&db->db_dirty_records, dr); + if (dr->dr_dbuf->db_level != 0) { + list_destroy(&dr->dt.di.dr_children); + mutex_destroy(&dr->dt.di.dr_mtx); + } + dbuf_dirty_record_cleanup_ranges(dr); + if (db->db_level == 0) + list_destroy(&dr->dt.dl.write_ranges); + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); +} + +static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; @@ -2728,6 +3125,16 @@ mutex_enter(&db->db_mtx); /* + * XXX TEMP + * If the dirty record is not CACHED, make it happen. + */ + if (db->db_state != DB_CACHED) { + dbuf_transition_to_read(db, DB_UNRESOLVED); + while (db->db_state & DB_READ) + cv_wait(&db->db_changed, &db->db_mtx); + } + + /* * To be synced, we must be dirtied. 
But we * might have been freed after the dirty. */ @@ -2758,29 +3165,7 @@ * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { - ASSERT(*datap != NULL); - ASSERT3U(db->db_level, ==, 0); - ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); - bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); - DB_DNODE_EXIT(db); - - if (*datap != db->db.db_data) { - zio_buf_free(*datap, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - } - db->db_data_pending = NULL; - ASSERT(list_next(&db->db_dirty_records, dr) == NULL); - ASSERT(dr->dr_dbuf == db); - list_remove(&db->db_dirty_records, dr); - if (dr->dr_dbuf->db_level != 0) { - list_destroy(&dr->dt.di.dr_children); - mutex_destroy(&dr->dt.di.dr_mtx); - } - dbuf_dirty_record_cleanup_ranges(dr); - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_sync_bonus(dr, tx, datap); return; } @@ -2991,6 +3376,7 @@ ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); + /* There should be no older dirty records. */ ASSERT(list_next(&db->db_dirty_records, dr) == NULL); list_remove(&db->db_dirty_records, dr); @@ -3057,6 +3443,8 @@ list_destroy(&dr->dt.di.dr_children); } dbuf_dirty_record_cleanup_ranges(dr); + if (db->db_level == 0) + list_destroy(&dr->dt.dl.write_ranges); kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#8 (text) ==== @@ -835,8 +835,7 @@ bcopy(buf, (char *)db->db_data + bufoff, tocpy); - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; @@ -1088,8 +1087,7 @@ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx); if (err) break; @@ -1187,8 +1185,7 @@ bufoff += PAGESIZE; } - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; @@ -1470,13 +1467,26 @@ return (ENOENT); } - /* XXX + /* + * XXX TEMP * If the dirty record is not CACHED, then the dmu_sync call needs * to wait for it to reach that state. Thus, it needs to issue the * READ if necessary. Then, it needs to wait for that read to * complete, which should cause the dbuf to become CACHED after - * resolving the applicable write ranges. + * resolving the applicable write ranges. At that point, the sync + * can be completed. */ + if (db->db_state != DB_CACHED) { + dbuf_transition_to_read(db, DB_UNRESOLVED); + while (db->db_state & DB_READ) + cv_wait(&db->db_changed, &db->db_mtx); + /* The dbuf had an I/O error or was freed in flight */ + if (db->db_state == DB_UNCACHED) { + mutex_exit(&db->db_mtx); + return (ENOENT); + } + } + ASSERT(db->db_state == DB_CACHED); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#9 (text) ==== @@ -233,7 +233,17 @@ * has been issued to resolve the COW fault, and a * thread is actively modifying the dbuf. */ - DB_READ_FILL = DB_READ|DB_FILL + DB_READ_FILL = DB_READ|DB_FILL, + + /** + * Dbuf either has an outstanding read or writes. + */ + DB_IN_FLIGHT = DB_PARTIAL|DB_READ, + + /** + * Dbuf does not have fully valid data. 
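+	 * (Either UNCACHED or PARTIAL: a read must still resolve the
+	 * buffer's contents before it can be synced.)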
+ */ + DB_UNRESOLVED = DB_UNCACHED|DB_PARTIAL, } dbuf_states_t; struct dnode; @@ -449,6 +459,7 @@ void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_will_dirty_range(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size); +void dbuf_transition_to_read(dmu_buf_impl_t *db, int state_req); void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); Change 516098 by willa@willa_repo on 2011/12/06 09:28:49 Fix a botched shelf-merge. Left in an extra copy of the dbuf_merge_write_ranges() and dbuf_resolve_ranges() functions. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#19 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#19 (text) ==== @@ -574,7 +574,6 @@ return (&itr->hole); } - /** * \brief Merge write ranges for a dirty record. * @@ -639,69 +638,6 @@ old_buf = buf; while (dr != NULL) { dl = &dr->dt.dl; - /* No merges actually occur if the range list is empty */ - dbuf_merge_write_ranges(dl, old_buf); - /* The ranges were just merged, so they're now obsolete */ - dbuf_dirty_record_cleanup_ranges(dr); - old_buf = dl->dr_data; - dr = list_prev(&db->db_dirty_records, dr); - } -} - -static void -dbuf_merge_write_ranges(dbuf_dirty_leaf_t *dl, arc_buf_t *old_buf) -{ - dbuf_dirty_record_hole_itr_t itr; - dbuf_dirty_record_hole_t *hole; - - ASSERT3P(dl, !=, NULL); - /* If there are no write ranges, we're done. */ - if (list_is_empty(&dl->write_ranges)) - return; - /* If there are write ranges, there must be an ARC buffer. */ - ASSERT(dl->dr_data != NULL); - - /* - * We use an iterator here because it simplifies the logic - * considerably for this function. - */ - dbuf_dirty_record_hole_itr_init(&itr, dl, old_buf); - - while ((hole = dbuf_dirty_record_hole_itr_next(&itr)) != NULL) - memcpy(hole->src, hole->dst, hole->size); -} - -/** - * \brief Resolve a dbuf using its ranges and the filled ARC buffer provided. - * - * \param db Dbuf to resolve. - * \param buf ARC buffer to use to resolve. - * - * This routine is called after a read completes. The results of the read - * are stored in the ARC buffer. It will then merge writes in the order - * that they occurred, cleaning up write ranges as it goes. - */ -static void -dbuf_resolve_ranges(dmu_buf_impl_t *db, arc_buf_t *buf) -{ - dbuf_dirty_record_t *dr; - dbuf_dirty_leaf_t *dl; - arc_buf_t *old_buf; - - /* No range data is kept for non data blocks. */ - ASSERT3U(db->db_level, ==, 0); - - /* - * Start with the oldest dirty record, merging backwards. For the - * first dirty record, the provided ARC buffer is the "old" buffer. - * - * In turn, the older buffer is copied to the newer one, using an - * inverse of the newer one's write ranges. - */ - dr = list_tail(&db->db_dirty_records); - old_buf = buf; - while (dr != NULL) { - dl = &dr->dt.dl; dbuf_merge_write_ranges(dl, old_buf); dbuf_dirty_record_cleanup_ranges(dr); old_buf = dl->dr_data; Change 516110 by willa@willa_repo on 2011/12/06 12:23:21 Fix some ARC buffer record keeping. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - Modify dbuf_rele_and_unlock(): Don't freeze the dbuf's current ARC buffer unless the dbuf is CACHED. Previously, this function could not be called with the dbuf still in flight. - Modify dbuf_merge_write_ranges(): - When using memcpy(), copy in the correct direction. I confused memcpy with bcopy. 
- Freeze a dirty record's buffer when memcpy finishes. This makes sure the ARC buffer is frozen even if dbuf_rele_and_unlock is not called on the buffer again before it reaches the syncer. - Modify dbuf_dirty_record_hole_itr_init() so that it asserts that the dirty record's ARC buffer is not frozen. - Modify dbuf_resolve_ranges() so that it asserts that the input ARC buffer is frozen. - Make the reads asynchronous, at least from dbuf_will_dirty_range. They still appear to be happening, though. - Add some new debug sysctl counters to track the flow of dbufs. - Add SYSCTL_COUNTER_U() and SYSCTL_COUNTER_I() macros to simplify the job of adding these counters. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h: - Add arc_buf_frozen(arc_buf_t*): Returns whether an ARC buffer is frozen. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#2 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h#3 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#2 (text) ==== @@ -1197,6 +1197,20 @@ mutex_exit(&buf->b_hdr->b_freeze_lock); } +boolean_t +arc_buf_frozen(arc_buf_t *buf) +{ + boolean_t frozen = B_TRUE; + + /* + * NB: Does not grab or assert the mutex because the caller more + * than likely cannot use the results in an atomic fashion. + */ + if (buf->b_hdr->b_freeze_cksum == NULL) + frozen = B_FALSE; + return (frozen); +} + void arc_buf_thaw(arc_buf_t *buf) { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h#3 (text) ==== @@ -97,6 +97,7 @@ int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); +boolean_t arc_buf_frozen(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); Change 516111 by willa@willa_repo on 2011/12/06 12:24:27 See change 516110. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#20 edit Differences ... 
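The copy direction fixed in change 516110 is the subtle part of the diff that follows: when a read resolves, bytes flow from the buffer the read returned (the old on-disk data) into the holes between a dirty record's write ranges, never the reverse, or the queued writes would be clobbered by stale data. A minimal stand-alone model of that hole-filling pass, using illustrative names and a flat range array instead of the kernel's list_t-based iterator; this is a sketch, not the kernel code:

/*
 * Stand-alone model of the copy direction fixed in change 516110.
 * The "ranges" are writer-supplied [off, off+size) spans already
 * valid in the dirty buffer; every gap between them is a hole that
 * must be filled FROM the data the read returned INTO the dirty
 * record's buffer.  All names here are illustrative.
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>

struct range { size_t off, size; };

static void
merge_holes(char *dirty, const char *old_buf, size_t bufsize,
    const struct range *r, int nranges)
{
	size_t hole_start = 0;
	int i;

	for (i = 0; i <= nranges; i++) {
		/* A hole ends at the next range, or at end of buffer. */
		size_t hole_end = (i < nranges) ? r[i].off : bufsize;

		assert(hole_end >= hole_start);
		/* dst is the dirty buffer, src is the read data. */
		memcpy(dirty + hole_start, old_buf + hole_start,
		    hole_end - hole_start);
		if (i < nranges)
			hole_start = r[i].off + r[i].size;
	}
}

int
main(void)
{
	char old_buf[16] = "oooooooooooooooo";	/* old on-disk data */
	char dirty[16]   = "....WWWW....WWWW";	/* 'W' = queued writes */
	struct range r[2] = { { 4, 4 }, { 12, 4 } };

	merge_holes(dirty, old_buf, sizeof (dirty), r, 2);
	printf("%.16s\n", dirty);
	return (0);
}

Run as written this prints ooooWWWWooooWWWW: the written spans survive and only the gaps take the old data, which is what memcpy(hole->dst, hole->src, hole->size) achieves in dbuf_merge_write_ranges().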
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#20 (text) ==== @@ -73,8 +73,29 @@ static dbuf_hash_table_t dbuf_hash_table; int64_t dirty_ranges_in_flight; SYSCTL_DECL(_vfs_zfs); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dirty_ranges_in_flight, CTLFLAG_RD, - &dirty_ranges_in_flight, 0, "number of dirty ranges in flight"); +#define SYSCTL_COUNTER_I(name, desc) \ + int64_t name; \ + SYSCTL_QUAD(_vfs_zfs, OID_AUTO, name, CTLFLAG_RD, \ + &name, 0, desc) +#define SYSCTL_COUNTER_U(name, desc) \ + uint64_t name; \ + SYSCTL_QUAD(_vfs_zfs, OID_AUTO, name, CTLFLAG_RD, \ + &name, 0, desc) + +SYSCTL_COUNTER_I(dirty_ranges_in_flight, "number of dirty ranges in flight"); + +SYSCTL_COUNTER_U(will_dirty, "number of times dirty ran"); +SYSCTL_COUNTER_U(will_dirty_range_cached, "number of times dirty range ran CACHED"); +SYSCTL_COUNTER_U(will_dirty_range_uncached, "number of times dirty range ran UNCACHED"); +SYSCTL_COUNTER_U(dirty_ranges_total, "number of total dirty ranges"); + +SYSCTL_COUNTER_U(will_fill_cached, "will_fill CACHED"); +SYSCTL_COUNTER_U(will_fill_uncached, "will_fill UNCACHED"); + +SYSCTL_COUNTER_U(entered_read_uncached, "entered read UNCACHED"); +SYSCTL_COUNTER_U(entered_read_cached, "entered read CACHED"); +SYSCTL_COUNTER_U(exited_read_uncached, "exited read UNCACHED"); +SYSCTL_COUNTER_U(exited_read_cached, "exited read CACHED"); static uint64_t dbuf_hash_count; @@ -280,6 +301,16 @@ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dirty_ranges_in_flight = 0; + will_dirty = 0; + will_dirty_range_cached = 0; + will_dirty_range_uncached = 0; + will_fill_cached = 0; + will_fill_uncached = 0; + dirty_ranges_total = 0; + entered_read_uncached = 0; + entered_read_cached = 0; + exited_read_uncached = 0; + exited_read_cached = 0; } void @@ -529,11 +560,9 @@ { itr->src = src_buf; itr->dl = dl; - /* XXX This shouldn't be needed, test just to see if maybe these - * buffers are the ones getting modified after freeze */ - arc_buf_thaw(dl->dr_data); itr->max_offset = MIN(arc_buf_size(src_buf), arc_buf_size(dl->dr_data)); itr->range = list_head(&dl->write_ranges); + ASSERT(!arc_buf_frozen(dl->dr_data)); itr->hole.src = NULL; itr->hole.dst = NULL; itr->hole.size = 0; @@ -604,7 +633,10 @@ dbuf_dirty_record_hole_itr_init(&itr, dl, old_buf); while ((hole = dbuf_dirty_record_hole_itr_next(&itr)) != NULL) - memcpy(hole->src, hole->dst, hole->size); + memcpy(hole->dst, hole->src, hole->size); + + /* Now that we have updated the buffer, freeze it */ + arc_buf_freeze(dl->dr_data); } /** @@ -636,8 +668,10 @@ */ dr = list_tail(&db->db_dirty_records); old_buf = buf; + ASSERT(arc_buf_frozen(buf)); while (dr != NULL) { dl = &dr->dt.dl; + ASSERT(dl->dr_data); dbuf_merge_write_ranges(dl, old_buf); dbuf_dirty_record_cleanup_ranges(dr); old_buf = dl->dr_data; @@ -1906,6 +1940,7 @@ */ list_insert_before(&dl->write_ranges, old_range, new_range); + atomic_add_64(&dirty_ranges_total, 1); atomic_add_64(&dirty_ranges_in_flight, 1); } @@ -1928,6 +1963,11 @@ ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_state == DB_CACHED) + atomic_add_64(&entered_read_cached, 1); + else + atomic_add_64(&entered_read_uncached, 1); + if (db->db_state & state_req) { DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -1945,6 +1985,11 @@ (void) zio_nowait(zio); mutex_enter(&db->db_mtx); } + + if (db->db_state == DB_CACHED) + atomic_add_64(&exited_read_cached, 1); + else + atomic_add_64(&exited_read_uncached, 1); } #pragma weak dmu_buf_will_dirty_range = dbuf_will_dirty_range @@ -1979,9 +2024,14 @@ 
DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dbuf_will_dirty(db, tx); + atomic_add_64(&will_dirty, 1); return; } DB_DNODE_EXIT(db); + if (db->db_state == DB_CACHED) + atomic_add_64(&will_dirty_range_cached, 1); + else + atomic_add_64(&will_dirty_range_uncached, 1); /* * Only issue a read if we start writing inside the block rather @@ -1994,7 +2044,7 @@ #endif dbuf_transition_to_read(db, DB_UNCACHED); -#if 0 +#ifdef ZFS_DIRTY_SYNC_READS /* XXX TEMP Wait for the async read to complete */ while (db->db_state == DB_READ) cv_wait(&db->db_changed, &db->db_mtx); @@ -2048,6 +2098,11 @@ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + if (db->db_state == DB_CACHED) + atomic_add_64(&will_fill_cached, 1); + else + atomic_add_64(&will_fill_uncached, 1); + dbuf_noread(db); dr = dbuf_dirty(db, tx); @@ -2752,9 +2807,11 @@ /* * We can't freeze indirects if there is a possibility that they - * may be modified in the current syncing context. + * may be modified in the current syncing context, or if there could + * be data in flight. */ - if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + if (db->db_buf && db->db_state == DB_CACHED && + holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) arc_buf_freeze(db->db_buf); if (holds == db->db_dirtycnt && Change 516170 by willa@willa_repo on 2011/12/06 15:27:37 Deserialize read I/O's further from writes. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c: - Modify dmu_tx_count_write(): - Don't issue reads for level 0 blocks in an attempt to fail earlier when I/O errors would occur. This logic assumed that writes would need to read those blocks anyway. By removing these, the asynchronous reads are now actually executing and being resolved by dbuf_resolve_ranges(). Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#4 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#4 (text) ==== @@ -266,6 +266,7 @@ zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); +#ifdef ZFS_DIRTY_SYNC_READS /* first level-0 block */ start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || @@ -283,6 +284,10 @@ if (err) goto out; } +#else + start = off >> dn->dn_datablkshift; + end = (off+len-1) >> dn->dn_datablkshift; +#endif /* level-1 blocks */ if (nlvls > 1) { Change 516173 by willa@willa_repo on 2011/12/06 16:08:57 Enable the PARTIAL state in the dbuf state machine. The dbuf state machine now no longer issues reads if a given dbuf is in flight or if the write starts at the beginning of the dbuf or ends at the end. These bits were checked in previously but were not exercised until now. And as it turns out, only one of the asserts needed to be changed to enable PARTIAL. In dbuf_sync_leaf(), as committed in a previous checkin, reads that are required at that point will be issued if necessary. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#21 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#21 (text) ==== @@ -735,7 +735,12 @@ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_PARTIAL); - ASSERT(db->db_buf == NULL); + /* + * 1. Read without any writes (db_buf == NULL) + * 2. Have dirty records (!list_is_empty(&db->db_dirty_records) + * 3. 
freed_in_flight == TRUE + */ + //ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); @@ -2037,11 +2042,7 @@ * Only issue a read if we start writing inside the block rather * than either at the beginning (forward) or end (backward) */ -#ifdef NOTYET - if (db->db_state == DB_UNCACHED && - ((offset != 0 && offset != db->db.db_size) || - !BP_IS_HOLE(db->db_blkptr))) -#endif + if (offset != 0 && (offset + size != db->db.db_size)) dbuf_transition_to_read(db, DB_UNCACHED); #ifdef ZFS_DIRTY_SYNC_READS /* XXX TEMP Wait for the async read to complete */ while (db->db_state == DB_READ) cv_wait(&db->db_changed, &db->db_mtx); @@ -2052,10 +2053,8 @@ /* State transitions not done above, needed for dbuf_dirty */ /* Transition to PARTIAL if we didn't transition to READ */ -#ifdef NOTYET if (db->db_state == DB_UNCACHED) db->db_state = DB_PARTIAL; -#endif /* * Now set the FILL bit, since the caller is about to write, but * only if the dbuf isn't already CACHED. Change 516252 by willa@willa_repo on 2011/12/07 14:29:22 Fix a bug in dbuf_read_done() that caused a panic (assert fired). - Don't resolve ranges for non-level 0 blocks. They do not, and cannot, apply. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#22 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#22 (text) ==== @@ -698,7 +698,9 @@ */ VERIFY(arc_buf_remove_ref(buf, db) == 1); } else if (zio == NULL || zio->io_error == 0) { - if (!list_is_empty(&db->db_dirty_records)) { + if (!list_is_empty(&db->db_dirty_records) && + db->db_level == 0) { + /* Write ranges only exist for level 0 blocks */ dbuf_resolve_ranges(db, buf); /* The read data buffer is no longer needed */ VERIFY(arc_buf_remove_ref(buf, db) == 1); Change 516291 by willa@willa_repo on 2011/12/08 11:33:12 Remove a few now-obsolete comments and code sections. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#24 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#24 (text) ==== @@ -1366,19 +1366,15 @@ * syncing state (since they are only modified * then). */ - /* XXX * isn't this wrong? * in the new world order, just because we * have an arc buf doesn't mean we don't * want a separate buffer for new DRs. This * could be why we are writing buffers after * they have been frozen? */ if (db->db_buf != NULL) { arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } else { + /* + * Buffer hasn't been created yet - + * new dbuf that's just been dirtied. + */ int size = db->db.db_size; arc_buf_contents_t type; spa_t *spa; @@ -1703,11 +1699,6 @@ /* Remove any write range entries left behind. */ dl = &dr->dt.dl; while ((range = list_remove_head(&dl->write_ranges)) != NULL) { - /* - * XXX This may need to be logged in some way once - * merges occur, which in theory should always - * result in empty lists here.
- */ kmem_free(range, sizeof(dbuf_dirty_range_t)); atomic_subtract_64(&dirty_ranges_in_flight, 1); ASSERT(dirty_ranges_in_flight >= 0); @@ -2127,18 +2118,9 @@ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dbuf_dirty_record_t *dr; - mutex_enter(&db->db_mtx); DBUF_VERIFY(db); if (db->db_state & DB_FILL) { - /* Find the relevant dirty record to update */ - for (dr = list_head(&db->db_dirty_records); - dr != NULL && dr->dr_txg != tx->tx_txg; - dr = list_next(&db->db_dirty_records, dr)) - ; - ASSERT(dr != NULL); - if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ Change 516775 by justing@justing-ns1 on 2011/12/13 11:35:01 cddl/tools/regression/stc/src/suites/fs/zfs/bin/file_trunc.c: When the validation that a block we truncated is zero filled fails, say so explicitly instead of giving a cryptic error. Affected files ... ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/bin/file_trunc.c#2 edit Differences ... ==== //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/bin/file_trunc.c#2 (text) ==== @@ -203,7 +203,9 @@ } if (memcmp(buf, rbuf, bsize) != 0) { - perror("memcmp"); + (void) fprintf(stderr, + "Previously truncated block at offset %ld " + "isn't zeroed", (offset + roffset)); exit(9); } } Change 516872 by justing@justing-ns1 on 2011/12/14 14:16:55 cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c: Checkpoint my efforts to document the RAIDZ layout mapping function. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c#3 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c#3 (text) ==== @@ -637,19 +637,38 @@ uint64_t nparity) { raidz_map_t *rm; + /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> unit_shift; /* The zio's size in units of the vdev's preferred sector size */ uint64_t s = zio->io_size >> unit_shift; + /* The first column for this stripe. */ uint64_t f = b % dcols; + /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + /* + * The number of sectors for this stripe on all but the "remnant" + * child vdev. + */ q = s / (dcols - nparity); + + /* + * The number of "remnant" sectors in this I/O. This will add a + * sector to some, but not all, child vdevs. */ + */ r = s - q * (dcols - nparity); + + /* The number of "bonus columns" - those which contain remnant data. */ bc = (r == 0 ? 0 : r + nparity); + + /* The total number of sectors associated with this I/O. */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); if (q == 0) { + /* + * Our I/O request doesn't span all child vdevs. + */ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { Change 517074 by willa@willa_repo on 2011/12/15 20:53:13 Make dprintf's more accessible. - Make zfs_flags modifiable via sysctl vfs.zfs.debug_flags. - Whenever dprintf() is called, call printf() on its generated string. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#4 edit Differences ... 
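The diff below is mechanically simple: dprintf() already renders its message into a local buffer for the zfs-dprintf DTrace probe, so echoing the same buffer through printf() makes it visible on the console, and exposing zfs_flags as a read-write sysctl lets the flags be flipped at runtime. A hedged user-space sketch of that format-once, emit-twice pattern; the zfs_flags variable here merely stands in for the vfs.zfs.debug_flags tunable, and all names are illustrative:

#include <stdarg.h>
#include <stdio.h>

static int zfs_flags = 1;	/* stand-in for vfs.zfs.debug_flags */

static void
my_dprintf(const char *fmt, ...)
{
	char buf[512];
	va_list adx;

	if (zfs_flags == 0)
		return;

	va_start(adx, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
	va_end(adx);

	/* One formatting pass feeds both consumers. */
	printf("%s", buf);	/* console copy, as this change adds */
	/* ...in the kernel, buf would also feed the dtrace probe. */
}

int
main(void)
{
	my_dprintf("freeing ds=%llu\n", 42ULL);
	return (0);
}

Setting the flag to 0 silences the console copy without recompiling, which is the point of routing it through a sysctl.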
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c#3 (text) ==== @@ -268,6 +268,8 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0, "Try to recover from otherwise-fatal errors."); +SYSCTL_INT(_vfs_zfs, OID_AUTO, debug_flags, CTLFLAG_RW, &zfs_flags, 0, + "Debug flags for ZFS testing."); /* * ========================================================================== ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#4 (text) ==== @@ -141,6 +141,7 @@ va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); + printf("%s", buf); /* * To get this data, use the zfs-dprintf probe as so: Change 517075 by willa@willa_repo on 2011/12/15 20:58:24 Remove DB_UNRESOLVED and DB_IN_FLIGHT states. Their meaning was too hard to remember. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#26 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#10 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#10 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#26 (text) ==== @@ -889,7 +889,7 @@ if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); - } else if (db->db_state & DB_UNRESOLVED) { + } else if (db->db_state & (DB_UNCACHED|DB_PARTIAL)) { spa_t *spa = dn->dn_objset->os_spa; #ifdef _KERNEL @@ -933,7 +933,7 @@ if ((flags & DB_RF_NEVERWAIT) == 0) { mutex_enter(&db->db_mtx); /* Already waited for DB_FILL set above */ - while (db->db_state & DB_IN_FLIGHT) { + while (db->db_state & DB_READ) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); cv_wait(&db->db_changed, &db->db_mtx); @@ -991,7 +991,7 @@ } else if (db->db_state & (DB_READ|DB_PARTIAL)) { db->db_state |= DB_FILL; } else { - ASSERT(db->db_state & (DB_IN_FLIGHT|DB_CACHED)); + ASSERT(db->db_state & (DB_PARTIAL|DB_READ|DB_CACHED)); } mutex_exit(&db->db_mtx); } @@ -1501,7 +1501,7 @@ */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || - db->db_state == DB_NOFILL || db->db_state & DB_IN_FLIGHT); + db->db_state == DB_NOFILL || db->db_state & (DB_PARTIAL|DB_READ)); } /** @@ -2105,7 +2105,7 @@ * Now set the FILL bit, since the caller is about to write, but * only if the dbuf isn't already CACHED. */ - if (db->db_state & DB_IN_FLIGHT) + if (db->db_state & (DB_PARTIAL|DB_READ)) db->db_state |= DB_FILL; mutex_exit(&db->db_mtx); @@ -2188,7 +2188,7 @@ db->db_state = DB_CACHED; else { db->db_state &= ~DB_FILL; - ASSERT(db->db_state & DB_IN_FLIGHT); + ASSERT(db->db_state & (DB_PARTIAL|DB_READ)); } cv_broadcast(&db->db_changed); @@ -3158,7 +3158,7 @@ * If the dirty record is not CACHED, make it happen. */ if (db->db_state != DB_CACHED) { - dbuf_transition_to_read(db, DB_UNRESOLVED); + dbuf_transition_to_read(db, DB_UNCACHED|DB_PARTIAL); while (db->db_state & DB_READ) cv_wait(&db->db_changed, &db->db_mtx); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#10 (text) ==== @@ -1480,7 +1480,7 @@ * can be completed. 
*/ if (db->db_state != DB_CACHED) { - dbuf_transition_to_read(db, DB_UNRESOLVED); + dbuf_transition_to_read(db, DB_UNCACHED|DB_PARTIAL); while (db->db_state & DB_READ) cv_wait(&db->db_changed, &db->db_mtx); /* The dbuf had an I/O error or was freed in flight */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#10 (text) ==== @@ -233,17 +233,7 @@ * has been issued to resolve the COW fault, and a * thread is actively modifying the dbuf. */ - DB_READ_FILL = DB_READ|DB_FILL, - - /** - * Dbuf either has an outstanding read or writes. - */ - DB_IN_FLIGHT = DB_PARTIAL|DB_READ, - - /** - * Dbuf does not have fully valid data. - */ - DB_UNRESOLVED = DB_UNCACHED|DB_PARTIAL, + DB_READ_FILL = DB_READ|DB_FILL } dbuf_states_t; struct dnode; Change 517076 by willa@willa_repo on 2011/12/15 21:08:02 Move dbuf sysctl counters to vfs.zfs.dbuf. Add a few more. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#27 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#27 (text) ==== @@ -73,13 +73,15 @@ static dbuf_hash_table_t dbuf_hash_table; int64_t dirty_ranges_in_flight; SYSCTL_DECL(_vfs_zfs); + +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS DBUF"); #define SYSCTL_COUNTER_I(name, desc) \ int64_t name; \ - SYSCTL_QUAD(_vfs_zfs, OID_AUTO, name, CTLFLAG_RD, \ + SYSCTL_QUAD(_vfs_zfs_dbuf, OID_AUTO, name, CTLFLAG_RD, \ &name, 0, desc) #define SYSCTL_COUNTER_U(name, desc) \ uint64_t name; \ - SYSCTL_QUAD(_vfs_zfs, OID_AUTO, name, CTLFLAG_RD, \ + SYSCTL_QUAD(_vfs_zfs_dbuf, OID_AUTO, name, CTLFLAG_RD, \ &name, 0, desc) SYSCTL_COUNTER_I(dirty_ranges_in_flight, "number of dirty ranges in flight"); @@ -97,6 +99,14 @@ SYSCTL_COUNTER_U(exited_read_uncached, "exited read UNCACHED"); SYSCTL_COUNTER_U(exited_read_cached, "exited read CACHED"); +SYSCTL_COUNTER_U(dirty_writes_lost, "dirty writes list"); +SYSCTL_COUNTER_U(fill_freed_in_flight, "FILL freed in flight"); +SYSCTL_COUNTER_U(threw_away_level0_read, "threw away level-0 READ buffer"); +SYSCTL_COUNTER_U(fix_old_data_null, "fix_old_data NULL'd db_data"); +SYSCTL_COUNTER_U(fix_old_data_more_dirties, "fix_old_data dirty db_data"); +SYSCTL_COUNTER_U(free_range_cleared, "free_range no holds"); +SYSCTL_COUNTER_U(free_range_referenced, "free_range with holds"); + static uint64_t dbuf_hash_count; static uint64_t @@ -299,18 +309,6 @@ for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); - - dirty_ranges_in_flight = 0; - will_dirty = 0; - will_dirty_range_cached = 0; - will_dirty_range_uncached = 0; - will_fill_cached = 0; - will_fill_uncached = 0; - dirty_ranges_total = 0; - entered_read_uncached = 0; - entered_read_cached = 0; - exited_read_uncached = 0; - exited_read_cached = 0; } void @@ -683,6 +681,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; + boolean_t level0_dirty; mutex_enter(&db->db_mtx); while (db->db_state & DB_FILL) @@ -691,11 +690,14 @@ /* Any reads or writes must have a hold on this dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); + level0_dirty = db->db_level == 0 && db->db_dirtycnt > 0; if (db->db_state == DB_CACHED) { /* * Read data became obsolete when a writer finished filling * the buffer. 
*/ + if (db->db_level == 0) + atomic_add_64(&threw_away_level0_read, 1); VERIFY(arc_buf_remove_ref(buf, db) == 1); } else if (zio == NULL || zio->io_error == 0) { if (!list_is_empty(&db->db_dirty_records) && @@ -704,6 +706,7 @@ dbuf_resolve_ranges(db, buf); /* The read data buffer is no longer needed */ VERIFY(arc_buf_remove_ref(buf, db) == 1); + atomic_add_64(&threw_away_level0_read, 1); } else { /* Read issued, no associated writes */ ASSERT(db->db_state == DB_READ); @@ -711,6 +714,11 @@ } db->db_state = DB_CACHED; } else { + /* Read failed. */ + if (level0_dirty && db->db_state != DB_CACHED) { + /* XXX Notify of errors saving write data */ + atomic_add_64(&dirty_writes_lost, 1); + } ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_state == DB_READ); ASSERT3P(db->db_buf, ==, NULL); @@ -1043,10 +1051,12 @@ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa; + atomic_add_64(&fix_old_data_more_dirties, 1); DB_GET_SPA(&spa, db); dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { + atomic_add_64(&fix_old_data_null, 1); dbuf_set_data(db, NULL); } } @@ -1178,9 +1188,11 @@ /* All consumers are finished, so evict the buffer */ if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); + atomic_add_64(&free_range_cleared, 1); dbuf_clear(db); continue; } + atomic_add_64(&free_range_referenced, 1); /* The dbuf is referenced */ dr = list_head(&db->db_dirty_records); @@ -2174,6 +2186,7 @@ ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ + atomic_add_64(&fill_freed_in_flight, 1); bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; db->db_state = DB_CACHED; Change 517077 by willa@willa_repo on 2011/12/15 21:23:27 Fix a few minor nits. - Remove a few comments that no longer apply. - dbuf_fix_old_data(): The holds > dirtycount case doesn't need to be augmented by checking the write range count. - dbuf_fill_done(): Handle the freed_in_flight case separately. - dbuf_undirty(): Wait for readers to complete. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#28 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#28 (text) ==== @@ -1036,7 +1036,7 @@ * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, - * or (if there no active holders or unresolved ranges) + * or (if there no active holders) * just null out the current db_data pointer. */ ASSERT(dr->dr_txg >= txg - 2); @@ -1045,8 +1045,7 @@ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); - } else if (refcount_count(&db->db_holds) > db->db_dirtycnt || - !list_is_empty(&dr->dt.dl.write_ranges)) { + } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa; @@ -1165,9 +1164,7 @@ mutex_exit(&db->db_mtx); continue; } - /* XXX - * This should check for PARTIAL also. - * + /* * Our goal is to make the data visible in the current * transaction group all zeros while preserving the data * as seen in any earlier transaction groups. @@ -1768,7 +1765,6 @@ /** * \brief Undirty a buffer, clearing dirty records. - * This function appears to only be called in the syncer state. 
*/ static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) @@ -1796,6 +1792,9 @@ ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); + while (db->db_state & DB_READ) + cv_wait(&db->db_changed, &db->db_mtx); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -2183,25 +2182,31 @@ DBUF_VERIFY(db); if (db->db_state & DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { + dbuf_dirty_record_t *dr; + + atomic_add_64(&fill_freed_in_flight, 1); + dr = list_head(&db->db_dirty_records); + ASSERT(dr->dr_txg == tx->tx_txg); ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ - atomic_add_64(&fill_freed_in_flight, 1); bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; + dbuf_dirty_record_cleanup_ranges(dr); db->db_state = DB_CACHED; - } - /* - * This function can be called with another state bit set, - * but if FILL is the only bit set, then the buffer has been - * fully filled. Otherwise, clear the FILL bit, so it goes - * back to the steady state. - */ - if (db->db_state == DB_FILL) - db->db_state = DB_CACHED; - else { - db->db_state &= ~DB_FILL; - ASSERT(db->db_state & (DB_PARTIAL|DB_READ)); + } else { + /* + * This function can be called with another state bit + * set, but if FILL is the only bit set, then the + * buffer has been fully filled. Otherwise, clear the + * FILL bit, so it goes back to the steady state. + */ + if (db->db_state == DB_FILL) + db->db_state = DB_CACHED; + else { + db->db_state &= ~DB_FILL; + ASSERT(db->db_state & (DB_PARTIAL|DB_READ)); + } } cv_broadcast(&db->db_changed); Change 517129 by willa@willa_repo on 2011/12/16 14:40:57 Pass on WITH_ZFS_DIRTY_SYNC_READS. Affected files ... ... //depot/branches/redline/projects/cow/sys/modules/zfs/Makefile#4 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/modules/zfs/Makefile#4 (text) ==== @@ -92,6 +92,9 @@ CFLAGS+=-DDEBUG=1 DEBUG_FLAGS?=-g .endif +.ifdef WITH_ZFS_DIRTY_SYNC_READS +CFLAGS+=-DZFS_DIRTY_SYNC_READS +.endif .include Change 517130 by willa@willa_repo on 2011/12/16 14:43:07 If the read-compare-after-write fails, print the offending bytes. Affected files ... ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/bin/file_trunc.c#3 edit Differences ... ==== //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/bin/file_trunc.c#3 (text) ==== @@ -171,6 +171,7 @@ off_t roffset = 0; char *buf = NULL; char *rbuf = NULL; + int i; buf = (char *)calloc(1, bsize); rbuf = (char *)calloc(1, bsize); @@ -204,8 +205,15 @@ if (memcmp(buf, rbuf, bsize) != 0) { (void) fprintf(stderr, - "Previously truncated block at offset %ld " - "isn't zeroed", (offset + roffset)); + "Read back of data written offset %x " + "isn't what we wrote:\n", (offset + roffset)); + for (i = 0;i < bsize; i++) { + if (buf[i] == rbuf[i]) + continue; + fprintf(stderr, "%04x: %02x | %02x\n", + i, buf[i], rbuf[i]); + } + (void) fprintf(stderr, "%s", rbuf); exit(9); } } Change 517131 by willa@willa_repo on 2011/12/16 14:44:39 Minor improvements to dprintf_dbuf() calls. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#11 edit Differences ... 
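The point of widening the dprintf_dbuf() prefix, as the dbuf.h hunk below shows, is correlation: once every message leads with the dbuf pointer and its current ARC buffer pointer alongside object, level, and blkid, interleaved log lines from racing threads can be grouped by buffer. A toy model of that identity-prefix convention; the struct and macro are illustrative stand-ins, not the kernel definitions:

#include <stdio.h>

typedef struct toy_dbuf {
	void		*db_buf;	/* associated ARC buffer, if any */
	unsigned	db_level;
	long long	db_blkid;
} toy_dbuf_t;

/* Prefix every message with the buffer's identity. */
#define	TOY_DPRINTF_DBUF(db, fmt, ...)				\
	printf("ptr=%p arc=%p lvl=%u blkid=%lld " fmt,		\
	    (void *)(db), (db)->db_buf, (db)->db_level,		\
	    (db)->db_blkid, __VA_ARGS__)

int
main(void)
{
	toy_dbuf_t db = { NULL, 0, 7 };

	TOY_DPRINTF_DBUF(&db, "state change: %s\n", "READ -> CACHED");
	return (0);
}

The real dprintf_dbuf() additionally routes through dprintf_ds(), so each line also carries the dataset the dbuf belongs to.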
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c#3 (text) ==== @@ -164,7 +164,7 @@ if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { int64_t delta; - dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); + dprintf_bp(bp, "freeing ds=%llu\n", ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_dir->dd_lock); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#11 (text) ==== @@ -530,8 +530,8 @@ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ (u_longlong_t)__db_obj); \ dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ - "obj=%s lvl=%u blkid=%lld " fmt, \ - __db_buf, (dbuf)->db_level, \ + "ptr=%p arc=%p obj=%s lvl=%u blkid=%lld " fmt, \ + dbuf, (dbuf)->db_buf, __db_buf, (dbuf)->db_level, \ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ } \ _NOTE(CONSTCOND) } while (0) Change 517134 by justing@justing-ns1 on 2011/12/16 15:01:19 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c: Fix comment within a comment. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c#4 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c#4 (text) ==== @@ -655,7 +655,7 @@ /* * The number of "remnant" sectors in this I/O. This will add a - * sector to some, but not all, child vdevs. */ + * sector to some, but not all, child vdevs. */ r = s - q * (dcols - nparity); Change 517139 by willa@willa_repo on 2011/12/16 15:21:27 Fix (at least some) broken truncate(2) handling. This now passes a basic write-read-truncate cycle without getting zeroes back from the read as before. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - Add a new macro, DBUF_STATE_CHANGE, which performs the requested dbuf state change and, if enabled, logs it. The log includes the specific state change made as well as the reason. This macro has proven extremely useful for understanding what's going on. - Update all modifiers of db->db_state to use DBUF_STATE_CHANGE. - Don't check arc_buf_frozen() if the ZFS_DEBUG_MODIFY flag isn't set. - Change dbuf_read_done() a bit: - If there are any dirty records, even if the state is CACHED, make sure the write ranges are processed anyway. Otherwise, the dbuf enters an inconsistent state. - If there were no dirty records and the state is not DB_READ, assume someone else beat us to the punch and simply throw away the now obsolete read buffer. - Fix dbuf_read_on_hole(): Treat this case as if the full read cycle had been executed. The only difference is we can satisfy the request immediately with a new zero-filled ARC buffer. However, this new buffer must still go through dbuf_read_done(), like a normal read would have. - Fix dbuf_free_range(): Merge two parts of the still-referenced case into one to keep the record-keeping straight. If, after processing, the dbuf is not UNCACHED and no longer has an ARC buffer, create one. Either way, zero fill the buffer. - Fix dbuf_dirty_record_add_range(): In the debug build case, the asserts at the top require certain assignments earlier. - Fix dbuf_assign_arcbuf(): Don't call arc_release here on the dirty record's old ARC buffer, since we are not touching it. Don't create a new write range covering the buffer, since the write range merging now assumes no entries means no holes. 
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h: - Add a new debug flag, ZFS_DEBUG_DBUF_STATE, which controls whether dbuf state changes specifically are printed. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#29 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h#2 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#29 (text) ==== @@ -128,6 +128,13 @@ return (crc); } +#define DBUF_STATE_CHANGE(db, op, state, why) do { \ + (db)->db_state op state; \ + if (zfs_flags & ZFS_DEBUG_DBUF_STATE) \ + dprintf_dbuf(db, "%s: state change (" #op " " #state \ + "): %s\n", __func__, why); \ +} while(0) + #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ @@ -485,7 +492,7 @@ dbuf_evict_user(db); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "set data"); } } @@ -560,7 +567,8 @@ itr->dl = dl; itr->max_offset = MIN(arc_buf_size(src_buf), arc_buf_size(dl->dr_data)); itr->range = list_head(&dl->write_ranges); - ASSERT(!arc_buf_frozen(dl->dr_data)); + ASSERT((zfs_flags & ZFS_DEBUG_MODIFY) == 0 || + !arc_buf_frozen(dl->dr_data)); itr->hole.src = NULL; itr->hole.dst = NULL; itr->hole.size = 0; @@ -666,7 +674,7 @@ */ dr = list_tail(&db->db_dirty_records); old_buf = buf; - ASSERT(arc_buf_frozen(buf)); + ASSERT((zfs_flags & ZFS_DEBUG_MODIFY) == 0 || arc_buf_frozen(buf)); while (dr != NULL) { dl = &dr->dt.dl; ASSERT(dl->dr_data); @@ -687,32 +695,48 @@ while (db->db_state & DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); + dprintf_dbuf(db, "%s: zio=%p arc=%p\n", __func__, zio, buf); + /* Any reads or writes must have a hold on this dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); level0_dirty = db->db_level == 0 && db->db_dirtycnt > 0; - if (db->db_state == DB_CACHED) { - /* - * Read data became obsolete when a writer finished filling - * the buffer. - */ - if (db->db_level == 0) - atomic_add_64(&threw_away_level0_read, 1); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - } else if (zio == NULL || zio->io_error == 0) { - if (!list_is_empty(&db->db_dirty_records) && - db->db_level == 0) { - /* Write ranges only exist for level 0 blocks */ + if (zio == NULL || zio->io_error == 0) { + /* Read succeeded. */ + if (!level0_dirty) { + if (db->db_state == DB_READ) { + /* No dirty ranges to resolve */ + dbuf_set_data(db, buf); + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "no dirty ranges to resolve"); + } else { + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_CACHED); + if (db->db_level == 0) + atomic_add_64(&threw_away_level0_read, 1); + /* The provided buffer is no longer needed. */ + arc_release(buf, db); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + } + } else { + dbuf_dirty_record_t *dr; + + /* + * Level 0 buffer with dirty records. If CACHED, at + * least the newest transaction group should have no + * write ranges, as is done in dbuf_free_range(). + */ + dr = list_head(&db->db_dirty_records); + ASSERT(db->db_state != DB_CACHED || + list_is_empty(&dr->dt.dl.write_ranges)); + dbuf_resolve_ranges(db, buf); - /* The read data buffer is no longer needed */ + /* The provided buffer is no longer needed. 
*/ + arc_release(buf, db); VERIFY(arc_buf_remove_ref(buf, db) == 1); - atomic_add_64(&threw_away_level0_read, 1); - } else { - /* Read issued, no associated writes */ - ASSERT(db->db_state == DB_READ); - dbuf_set_data(db, buf); + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "dirty ranges resolved"); } - db->db_state = DB_CACHED; } else { /* Read failed. */ if (level0_dirty && db->db_state != DB_CACHED) { @@ -756,7 +780,7 @@ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); dbuf_update_data(db); - db->db_state = DB_CACHED; + DBUF_STATE_CHANGE(db, =, DB_CACHED, "bonus buffer filled"); mutex_exit(&db->db_mtx); return (TRUE); } @@ -768,6 +792,8 @@ * \param dn Dnode for the dbuf. * \param flags Dbuf read flags pointer. * + * \invariant The dbuf's mutex must be held. + * * \returns whether any action was taken. */ static boolean_t @@ -775,6 +801,8 @@ { int is_hole; + ASSERT(MUTEX_HELD(&db->db_mtx)); + is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr); /* * For level 0 blocks only, if the above check fails: @@ -787,15 +815,19 @@ BP_IS_HOLE(db->db_blkptr); if (is_hole) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + arc_buf_t *buf; - dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, - db->db.db_size, db, type)); + buf = arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, + db, DBUF_GET_BUFC_TYPE(db)); DB_DNODE_EXIT(db); - bzero(db->db.db_data, db->db.db_size); - db->db_state = DB_CACHED; - *flags |= DB_RF_CACHED; + arc_release(buf, db); + bzero(buf->b_data, db->db.db_size); + arc_buf_freeze(buf); + DBUF_STATE_CHANGE(db, =, DB_READ, "hole read satisfied"); + dbuf_add_ref(db, NULL); + //*flags |= DB_RF_CACHED; mutex_exit(&db->db_mtx); + dbuf_read_done(/*zio*/NULL, buf, db); return (TRUE); } return (FALSE); @@ -833,7 +865,7 @@ spa = dn->dn_objset->os_spa; DB_DNODE_EXIT(db); - db->db_state = DB_READ; + DBUF_STATE_CHANGE(db, =, DB_READ, "read issued"); mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) @@ -997,7 +1029,7 @@ } else if (db->db_state == DB_NOFILL) { dbuf_set_data(db, NULL); } else if (db->db_state & (DB_READ|DB_PARTIAL)) { - db->db_state |= DB_FILL; + DBUF_STATE_CHANGE(db, |=, DB_FILL, "notifying of a fill"); } else { ASSERT(db->db_state & (DB_PARTIAL|DB_READ|DB_CACHED)); } @@ -1215,24 +1247,28 @@ */ dbuf_fix_old_data(db, txg); } - } else if (db->db_state != DB_CACHED) { - /* Create buffer and make it become zero filled. */ - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa; - - ASSERT(db->db_state == DB_READ); - DB_GET_SPA(&spa, db); - dbuf_set_data(db, - arc_buf_alloc(spa, db->db.db_size, db, type)); - db->db_state = DB_CACHED; } - /* dbuf_fix_old_data above may go to DB_UNCACHED */ + /* dbuf_fix_old_data above may have gone to DB_UNCACHED. */ if (db->db_state != DB_UNCACHED) { - /* clear the contents */ - ASSERT(db->db.db_data != NULL); - arc_release(db->db_buf, db); + /* If there's only a reader, provide a fresh buffer. + * */ + if (db->db.db_data == NULL) { + arc_buf_t *buf; + spa_t *spa; + + ASSERT(db->db_state == DB_READ); + DB_GET_SPA(&spa, db); + buf = arc_buf_alloc(spa, db->db.db_size, db, + DBUF_GET_BUFC_TYPE(db)); + dbuf_set_data(db, buf); + } else + arc_release(db->db_buf, db); + + /* Now clear the contents. 
*/ bzero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "dbuf has been freed"); } mutex_exit(&db->db_mtx); @@ -1447,6 +1483,8 @@ ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; + dprintf_dbuf(db, "%s: dr_data=%p\n", __func__, + dr->dt.dl.dr_data); list_create(&dl->write_ranges, sizeof(dbuf_dirty_range_t), offsetof(dbuf_dirty_range_t, write_range_link)); } else { @@ -1922,13 +1960,13 @@ int new_end, old_end; boolean_t use_existing; + dl = &dr->dt.dl; + db = dr->dr_dbuf; + /* Write ranges do not apply to indirect blocks. */ - ASSERT(dr->dr_dbuf->db_level == 0); + ASSERT(db->db_level == 0); ASSERT(MUTEX_HELD(&db->db_mtx)); - dl = &dr->dt.dl; - db = dr->dr_dbuf; - /* Discover whether an existing entry overlaps. */ old_range = list_head(&dl->write_ranges); new_end = offset + size; @@ -1973,6 +2011,8 @@ */ old_range->offset = MIN(offset, old_range->offset); old_range->size = MAX(new_end, old_end) - old_range->offset; + dprintf_dbuf(dr->dr_dbuf, "%s: dr %p: modify range offset " + "0x%x size 0x%x\n", __func__, dr, offset, size); #define LIST_HAS_ONE_ENTRY(list) \ (list_head(list) == list_tail(list) && !list_is_empty(list)) @@ -1983,7 +2023,8 @@ */ if (LIST_HAS_ONE_ENTRY(&db->db_dirty_records) && old_range->offset == 0 && old_range->size == db->db.db_size) - db->db_state = DB_FILL; + DBUF_STATE_CHANGE(db, =, DB_FILL, + "caller is about to finish filling buffer"); return; } @@ -1992,6 +2033,9 @@ new_range->offset = offset; new_range->size = size; + dprintf_dbuf(dr->dr_dbuf, "%s: dr %p: new range offset 0x%x size 0x%x\n", + __func__, dr, offset, size); + /* * Insert the new range: * - At the end of the list (old_range == NULL): @@ -2111,13 +2155,15 @@ /* State transitions not done above, needed for dbuf_dirty */ /* Transition to PARTIAL if we didn't transition to READ */ if (db->db_state == DB_UNCACHED) - db->db_state = DB_PARTIAL; + DBUF_STATE_CHANGE(db, =, DB_PARTIAL, + "new buffer about to be written without async read"); /* * Now set the FILL bit, since the caller is about to write, but * only if the dbuf isn't already CACHED. */ if (db->db_state & (DB_PARTIAL|DB_READ)) - db->db_state |= DB_FILL; + DBUF_STATE_CHANGE(db, |=, DB_FILL, + "new writer about to modify non-CACHED buffer"); mutex_exit(&db->db_mtx); dr = dbuf_dirty(db, tx); @@ -2135,7 +2181,7 @@ { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - db->db_state = DB_NOFILL; + DBUF_STATE_CHANGE(db, =, DB_NOFILL, "allocating NOFILL buffer"); dmu_buf_will_fill(db_fake, tx); } @@ -2193,7 +2239,8 @@ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; dbuf_dirty_record_cleanup_ranges(dr); - db->db_state = DB_CACHED; + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "fill done handling freed in flight"); } else { /* * This function can be called with another state bit @@ -2202,9 +2249,11 @@ * FILL bit, so it goes back to the steady state. */ if (db->db_state == DB_FILL) - db->db_state = DB_CACHED; + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "filler finished, complete buffer"); else { - db->db_state &= ~DB_FILL; + DBUF_STATE_CHANGE(db, &=, ~DB_FILL, + "filler finished, incomplete buffer"); ASSERT(db->db_state & (DB_PARTIAL|DB_READ)); } } @@ -2259,7 +2308,6 @@ /* Remove the old dirty data for this transaction */ dbuf_dirty_record_cleanup_ranges(dr); if (dr->dt.dl.dr_data != NULL) { - arc_release(buf, db); VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); } @@ -2270,7 +2318,6 @@ * the READ is later resolved, the new buffer stays. 
*/ dr->dt.dl.dr_data = buf; - dbuf_dirty_record_add_range(dr, 0, db->db.db_size); mutex_exit(&db->db_mtx); return; } @@ -2314,7 +2361,7 @@ ASSERT(db->db_buf == NULL); /* Set db->db_buf = buf */ dbuf_set_data(db, buf); - db->db_state = DB_FILL; + DBUF_STATE_CHANGE(db, =, DB_FILL, "assigning buffer"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); /* clear db->db.db_data and tell waiters it's changed ?? */ @@ -2353,13 +2400,13 @@ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; - db->db_state = DB_UNCACHED; + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "buffer cleared"); } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); - db->db_state = DB_EVICTING; + DBUF_STATE_CHANGE(db, =, DB_EVICTING, "buffer eviction started"); db->db_blkptr = NULL; DB_DNODE_ENTER(db); @@ -2501,7 +2548,7 @@ (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; - db->db_state = DB_UNCACHED; + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "bonus buffer created"); /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); @@ -2524,7 +2571,7 @@ * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); - db->db_state = DB_EVICTING; + db->db_state = DB_EVICTING; /* not worth logging this state change */ if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_cache, db); @@ -2532,7 +2579,7 @@ return (odb); } list_insert_head(&dn->dn_dbufs, db); - db->db_state = DB_UNCACHED; + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "regular buffer created"); mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h#2 (text) ==== @@ -52,6 +52,7 @@ #define ZFS_DEBUG_DNODE_VERIFY 0x0004 #define ZFS_DEBUG_SNAPNAMES 0x0008 #define ZFS_DEBUG_MODIFY 0x0010 +#define ZFS_DEBUG_DBUF_STATE 0x0020 #ifdef ZFS_DEBUG extern void __dprintf(const char *file, const char *func, Change 517201 by willa@willa_repo on 2011/12/16 18:46:33 When a buffer is modified while frozen, log which buffer. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#3 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#3 (text) ==== @@ -1162,7 +1162,7 @@ } fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) - panic("buffer modified while frozen!"); + panic("buffer %p modified while frozen!", buf); mutex_exit(&buf->b_hdr->b_freeze_lock); } Change 517205 by justing@justing-ns1 on 2011/12/16 20:58:35 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c: Fix comment formatting to comply with style(9). Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#4 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#4 (text) ==== @@ -3438,7 +3438,8 @@ return (1); } -/** \brief Release this buffer from the cache. +/** + * \brief Convert to an anonymous buffer. * * This must be done after a read and prior to modifying the buffer contents. 
* If the buffer has more than one reference, we must make a new hdr for the Change 517206 by justing@justing-ns1 on 2011/12/16 21:00:11 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: In dmu_buf_hold_array_by_dnode(), readers must wait on all of the FILL states in addition to an outstanding READ. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#11 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#11 (text) ==== @@ -446,9 +446,7 @@ for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); - /* Must wait on PARTIAL too */ - while (db->db_state == DB_READ || - db->db_state == DB_FILL) + while (db->db_state & (DB_READ|DB_FILL)) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = EIO; Change 517207 by justing@justing-ns1 on 2011/12/16 21:05:14 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: o Split successful read completion handling out of dbuf_read_done() into dbuf_read_complete(). Use this new function to simplify the handling of short-circuit reads of blocks backed by holes. o In dbuf_read_complete(), do not assume that we can transition to the DB_CACHED state. If the block has been freed in a newer transaction group, it may already be in the DB_UNCACHED state and must remain there. o Fix up miscellaneous comments. o In dbuf_clear(), assert that the dbuf is free of dirty records. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#30 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#30 (text) ==== @@ -686,10 +686,63 @@ } static void +dbuf_read_complete(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + if (db->db_dirtycnt > 0) { + + /* + * Fill any holes in the dbuf's dirty records + * with the original block we read. + */ + dbuf_resolve_ranges(db, buf); + + /* + * The read version of this block is superceeded + * by the versions contained within the dirty records. + * Discard it. + */ + arc_release(buf, db); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + atomic_add_64(&threw_away_level0_read, 1); + + ASSERT(db->db_state == DB_CACHED || + db->db_state == DB_UNCACHED || + db->db_state == DB_READ); + if (db->db_state == DB_READ) { + /* + * The most recent version of this block + * was waiting on this read. Transition + * to cached. + */ + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "resolve of records in READ state"); + } + } else if (db->db_state == DB_READ) { + /* + * Read with no dirty data. Use the buffer we + * read and transition to DB_CACHED. + */ + dbuf_set_data(db, buf); + } else { + /* + * The block was free'd or filled before this + * read could complete. + */ + ASSERT(db->db_state == DB_CACHED || + db->db_state == DB_UNCACHED); + arc_release(buf, db); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + atomic_add_64(&threw_away_level0_read, 1); + } +} + +static void dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; - boolean_t level0_dirty; + dbuf_dirty_record_t *dr; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state & DB_FILL) @@ -700,54 +753,38 @@ /* Any reads or writes must have a hold on this dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); - level0_dirty = db->db_level == 0 && db->db_dirtycnt > 0; + /* + * Only level 0 blocks can be dirtied before being read + * (i.e.
entering the DB_CACHED state). + */ + ASSERT(db->db_level == 0 || db->db_dirtycnt == 0); + if (zio == NULL || zio->io_error == 0) { /* Read succeeded. */ - if (!level0_dirty) { - if (db->db_state == DB_READ) { - /* No dirty ranges to resolve */ - dbuf_set_data(db, buf); - DBUF_STATE_CHANGE(db, =, DB_CACHED, - "no dirty ranges to resolve"); - } else { - ASSERT(db->db_state == DB_UNCACHED || - db->db_state == DB_CACHED); - if (db->db_level == 0) - atomic_add_64(&threw_away_level0_read, 1); - /* The provided buffer is no longer needed. */ - arc_release(buf, db); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - } - } else { - dbuf_dirty_record_t *dr; - + dbuf_read_complete(db, buf); + } else { + /* Read failed. */ + if (db->db_dirtycnt > 0) { /* - * Level 0 buffer with dirty records. If CACHED, at - * least the newest transaction group should have no - * write ranges, as is done in dbuf_free_range(). - */ - dr = list_head(&db->db_dirty_records); - ASSERT(db->db_state != DB_CACHED || - list_is_empty(&dr->dt.dl.write_ranges)); - + * The failure of this read has already been + * communicated to the user by the zio pipeline. + * Limit our losses to just the data we can't + * read by filling any holes in our dirty records + * with zeros. + */ + bzero(buf->b_data, arc_buf_size(buf)); + arc_buf_freeze(buf); dbuf_resolve_ranges(db, buf); - /* The provided buffer is no longer needed. */ - arc_release(buf, db); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - DBUF_STATE_CHANGE(db, =, DB_CACHED, - "dirty ranges resolved"); - } - } else { - /* Read failed. */ - if (level0_dirty && db->db_state != DB_CACHED) { - /* XXX Notify of errors saving write data */ atomic_add_64(&dirty_writes_lost, 1); + db->db_state = DB_CACHED; + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, + "read failed with oustanding writes failed"); + } else { + ASSERT3P(db->db_buf, ==, NULL); + db->db_state = DB_UNCACHED; + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "read failed"); } - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(db->db_state == DB_READ); - ASSERT3P(db->db_buf, ==, NULL); VERIFY(arc_buf_remove_ref(buf, db) == 1); - db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL); @@ -820,14 +857,12 @@ buf = arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, DBUF_GET_BUFC_TYPE(db)); DB_DNODE_EXIT(db); - arc_release(buf, db); bzero(buf->b_data, db->db.db_size); - arc_buf_freeze(buf); DBUF_STATE_CHANGE(db, =, DB_READ, "hole read satisfied"); dbuf_add_ref(db, NULL); - //*flags |= DB_RF_CACHED; + *flags |= DB_RF_CACHED; + dbuf_read_complete(db, buf); mutex_exit(&db->db_mtx); - dbuf_read_done(/*zio*/NULL, buf, db); return (TRUE); } return (FALSE); @@ -1036,7 +1071,8 @@ mutex_exit(&db->db_mtx); } -/** \brief This is our just-in-time copy function. +/** + * \brief This is our just-in-time copy function. * * It makes a copy of buffers that have been modified in a previous transaction * group, before we modify them in the current active group. @@ -1065,7 +1101,7 @@ return; /* - * If the last dirty record for this dbuf has not yet synced + * If the most recent dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there no active holders) @@ -1117,7 +1153,7 @@ ASSERT(db->db_data_pending != dr); - /* free this block */ + /* Free this block. 
*/ if (!BP_IS_HOLE(bp)) { spa_t *spa; @@ -1189,6 +1225,7 @@ continue; mutex_enter(&db->db_mtx); + if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { @@ -1221,8 +1258,9 @@ dbuf_clear(db); continue; } + + /* The dbuf is referenced */ atomic_add_64(&free_range_referenced, 1); - /* The dbuf is referenced */ dr = list_head(&db->db_dirty_records); if (dr != NULL) { @@ -1237,6 +1275,12 @@ dn->dn_maxblkid = db->db_blkid; /* Handle intermediate dmu_sync() calls. */ dbuf_unoverride(dr); + + /* + * If this buffer is still waiting on data + * for a RMW merge, that data no longer applies + * to this buffer. + */ dbuf_dirty_record_cleanup_ranges(dr); } else { /* @@ -1250,8 +1294,7 @@ } /* dbuf_fix_old_data above may have gone to DB_UNCACHED. */ if (db->db_state != DB_UNCACHED) { - /* If there's only a reader, provide a fresh buffer. - * */ + /* If there's only a reader, provide a fresh buffer. */ if (db->db.db_data == NULL) { arc_buf_t *buf; spa_t *spa; @@ -1802,7 +1845,14 @@ } /** - * \brief Undirty a buffer, clearing dirty records. + * \brief Undirty a buffer in the transaction group referenced by + * the given transaction. + * + * XXX The extra refcount of doing a resolving read confuses some + * of the hold accounting. Do we do the wrong thing in this + * case? + * + * XXX Need to update comments to reflect the dbuf_dirty() refactoring. */ static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) @@ -1816,7 +1866,8 @@ mutex_enter(&db->db_mtx); /* - * If this buffer is not dirty, we're done. + * If this buffer is not dirty in this transaction + * group, we're done. */ for (dr = list_head(&db->db_dirty_records); dr != NULL; dr = list_next(&db->db_dirty_records, dr)) { @@ -1830,6 +1881,11 @@ ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); + /* + * XXX Wait for the buffer to be resolved. With additional accounting + * we should be able to undirty immediately and disassociate the + * read from this dbuf before it completes. + */ while (db->db_state & DB_READ) cv_wait(&db->db_changed, &db->db_mtx); @@ -1860,7 +1916,7 @@ ASSERT(db->db.db_size != 0); - /* XXX would be nice to fix up dn_towrite_space[] */ + /* XXX would be nice to fix up *_space_towrite[] */ list_remove(&db->db_dirty_records, dr); @@ -2318,6 +2374,7 @@ * the READ is later resolved, the new buffer stays. */ dr->dt.dl.dr_data = buf; + dbuf_set_data(db, buf); mutex_exit(&db->db_mtx); return; } @@ -2368,7 +2425,8 @@ dbuf_fill_done(db, tx); } -/** \brief "Clear" the contents of this dbuf. +/** + * \brief "Clear" the contents of this dbuf. * * This will mark the dbuf EVICTING and clear *most* of its references. * Unfortunately, when we are not holding the dn_dbufs_mtx, we can't clear the @@ -2405,6 +2463,7 @@ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); + ASSERT(list_is_empty(&db->db_dirty_records)); DBUF_STATE_CHANGE(db, =, DB_EVICTING, "buffer eviction started"); db->db_blkptr = NULL; Change 517348 by willa@willa_repo on 2011/12/18 01:47:00 Fix the remaining STF test regressions. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - Replace the algorithm used for dbuf_dirty_record_add_range() with a better and more compact one written by Justin. Instead of looping to find matches and performing post-loop updates, make the loop an accumulator that combines write ranges, dropping obsolete ones in the process. Then the post-loop simplifies to adding the final range back to the list. 
- In lieu of unit tests, dbuf_dirty_record_add_range() now has a companion dbuf_dirty_record_check_ranges() which is responsible for making sure the range list still contains sorted ranges that do not overlap and are not adjacent. - Fix dbuf_dirty_record_hole_itr_init() to handle the case where the first range may start at offset 0, meaning the first hole is at the end of that range. - Fix dbuf_resolve_ranges(): The provided ARC buffer may legitimately be thawed, so do not assert that it is frozen. - Fix dbuf_read_complete(): - Only check the dbuf's dirty count if it is a level 0 block. - If there is a dirty count and we were waiting on the READ, and no ARC buffer is set for the current transaction group, use the provided buffer. - Only discard the ARC buffer if we don't claim it here. - In READ with no dirties, set the state to CACHED. - In the case of a failed read with outstanding unresolved writes, set the state to CACHED. - Fix dbuf_read_on_hole(): Don't add a reference, because this function now directly calls dbuf_read_complete() instead of dbuf_read_done() which removed the reference. - Fix dbuf_sync_leaf(): Only transition to READ if the dbuf is PARTIAL, because if it is UNCACHED, it was freed in flight and therefore needs to be handled differently here. - Assert that a dbuf has no dirty records when evicting. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h: - Change dbuf_dirty_range_t to contain an 'end' value to avoid recalculating it more than once and simplify the logic. - Change the 'offset' value to 'start' to harmonize with 'end'. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - ifdef ZFS_DEBUG a few debugging printfs. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#31 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#12 edit Differences ... 
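An illustrative aside on the accumulator loop described above: the same merge strategy can be modeled in a few lines of standalone C. This is only a sketch, not the depot code; range_t, range_list_merge(), and the bare singly-linked list are hypothetical stand-ins for dbuf_dirty_range_t, dbuf_dirty_record_add_range(), and the kernel's list_t, and the kmem/atomic bookkeeping is omitted.

    #include <assert.h>
    #include <stdlib.h>

    typedef struct range {
            struct range *next;
            int start;
            int end;        /* One past the last byte, as in the patch. */
    } range_t;

    /*
     * Merge [start, end) into a sorted list of disjoint, non-adjacent
     * ranges.  Overlapping or adjacent entries are absorbed into the
     * new range and freed as the loop advances; the surviving range is
     * inserted exactly once afterward.
     */
    static range_t *
    range_list_merge(range_t *head, int start, int end)
    {
            range_t **prevp = &head;
            range_t *cur, *next;
            range_t *r = malloc(sizeof (*r));

            assert(r != NULL);
            r->start = start;
            r->end = end;
            for (cur = head; cur != NULL && cur->start <= r->end;
                cur = next) {
                    next = cur->next;
                    if (r->start <= cur->end) {
                            /* Overlap or adjacency: accumulate and drop. */
                            if (cur->start < r->start)
                                    r->start = cur->start;
                            if (cur->end > r->end)
                                    r->end = cur->end;
                            *prevp = next;
                            free(cur);
                    } else {
                            /* Entirely before the new range; keep it. */
                            prevp = &cur->next;
                    }
            }
            /* prevp now addresses the insertion point. */
            r->next = *prevp;
            *prevp = r;
            return (head);
    }

Merging [0, 512) into a list that already holds [512, 1024), for example, leaves the single range [0, 1024): the adjacency case that the companion check in dbuf_dirty_record_check_ranges() asserts can never survive the merge.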
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#5 (text) ==== @@ -1248,6 +1248,15 @@ hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); +#ifdef ZFS_DEBUG + if (buf->b_hdr->b_freeze_cksum == NULL && buf->b_hdr->b_state != arc_anon) { + printf("%s: invalid state: freeze_cksum=%p, b_state=%p\n", + __func__, buf->b_hdr->b_freeze_cksum, buf->b_hdr->b_state); + printf("arc_anon=%p arc_mru=%p arc_mru_ghost=%p arc_mfu=%p " + "arc_mfu_ghost=%p arc_l2c_only=%p\n", arc_anon, arc_mru, + arc_mru_ghost, arc_mfu, arc_mfu_ghost, arc_l2c_only); + } +#endif ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#31 (text) ==== @@ -128,12 +128,29 @@ return (crc); } +#ifdef ZFS_DEBUG +#define DBUF_STATE_CHANGE(db, op, state, why) do { \ + (db)->db_state op state; \ + if (zfs_flags & ZFS_DEBUG_DBUF_STATE) { \ + uint64_t __db_obj = (db)->db.db_object; \ + char __db_buf[32]; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof(__db_buf), \ + "%lld", (u_longlong_t)__db_obj); \ + __dprintf(__FILE__, __func__, __LINE__, \ + "%s: dbp=%p arc=%p obj=%s, lvl=%u blkid=%lld " \ + "state change (" #op " " #state "): %s\n", \ + __func__, db, (db)->db_buf, __db_buf, \ + (db)->db_level, (u_longlong_t)(db)->db_blkid, why); \ + } \ +} while(0) +#else #define DBUF_STATE_CHANGE(db, op, state, why) do { \ (db)->db_state op state; \ - if (zfs_flags & ZFS_DEBUG_DBUF_STATE) \ - dprintf_dbuf(db, "%s: state change (" #op " " #state \ - "): %s\n", __func__, why); \ } while(0) +#endif #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); @@ -573,7 +590,14 @@ itr->hole.dst = NULL; itr->hole.size = 0; /* If no ranges exist, the dirty buffer is entirely valid. */ - itr->hole_start = (itr->range == NULL) ? itr->max_offset : 0; + if (itr->range == NULL) { + /* Set to the end to return no holes */ + itr->hole_start = itr->max_offset; + } else if (itr->range->start == 0) { + itr->hole_start = itr->range->size; + itr->range = list_next(&itr->dl->write_ranges, itr->range); + } else + itr->hole_start = 0; } /** @@ -598,9 +622,9 @@ itr->hole.src = (caddr_t)(itr->src->b_data) + itr->hole_start; itr->hole.dst = (caddr_t)(itr->dl->dr_data->b_data) + itr->hole_start; if (itr->range != NULL) { - itr->hole.size = MIN(itr->max_offset, itr->range->offset) - + itr->hole.size = MIN(itr->max_offset, itr->range->start) - itr->hole_start; - itr->hole_start = itr->range->offset + itr->range->size; + itr->hole_start = itr->range->end; itr->range = list_next(&itr->dl->write_ranges, itr->range); } else { itr->hole.size = itr->max_offset - itr->hole_start; @@ -674,7 +698,6 @@ */ dr = list_tail(&db->db_dirty_records); old_buf = buf; - ASSERT((zfs_flags & ZFS_DEBUG_MODIFY) == 0 || arc_buf_frozen(buf)); while (dr != NULL) { dl = &dr->dt.dl; ASSERT(dl->dr_data); @@ -688,7 +711,7 @@ static void dbuf_read_complete(dmu_buf_impl_t *db, arc_buf_t *buf) { - if (db->db_dirtycnt > 0) { + if (db->db_level == 0 && db->db_dirtycnt > 0) { /* * Fill any holes in the dbuf's dirty records @@ -696,14 +719,6 @@ */ dbuf_resolve_ranges(db, buf); - /* - * The read version of this block is superceeded - * by the versions contained within the dirty records. - * Discard it. 
- */ - arc_release(buf, db); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - atomic_add_64(&threw_away_level0_read, 1); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED || @@ -714,15 +729,28 @@ * was waiting on this read. Transition * to cached. */ + if (db->db_buf == NULL) + dbuf_set_data(db, buf); DBUF_STATE_CHANGE(db, =, DB_CACHED, "resolve of records in READ state"); } + if (db->db_buf != buf) { + /* + * The provided buffer is no longer relevant to the + * current transaction group. Discard it. + */ + arc_release(buf, db); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + atomic_add_64(&threw_away_level0_read, 1); + } } else if (db->db_state == DB_READ) { /* * Read with no dirty data. Use the buffer we * read and transition to DB_CACHED. */ dbuf_set_data(db, buf); + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "read completed with no dirty records"); } else { /* * The block was free'd or filled before this @@ -776,8 +804,7 @@ arc_buf_freeze(buf); dbuf_resolve_ranges(db, buf); atomic_add_64(&dirty_writes_lost, 1); - db->db_state = DB_CACHED; - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, + DBUF_STATE_CHANGE(db, =, DB_CACHED, "read failed with oustanding writes failed"); } else { ASSERT3P(db->db_buf, ==, NULL); @@ -859,7 +886,6 @@ DB_DNODE_EXIT(db); bzero(buf->b_data, db->db.db_size); DBUF_STATE_CHANGE(db, =, DB_READ, "hole read satisfied"); - dbuf_add_ref(db, NULL); *flags |= DB_RF_CACHED; dbuf_read_complete(db, buf); mutex_exit(&db->db_mtx); @@ -1365,7 +1391,7 @@ for (;;) { range = list_tail(&dl->write_ranges); - if (range->offset >= new_size) { + if (range->start >= new_size) { list_remove(&dl->write_ranges, range); kmem_free(range, sizeof(dbuf_dirty_range_t)); continue; @@ -1374,9 +1400,10 @@ /* * Update the last range that could be affected by * this truncation. Its size changes only if it - * extends past the end of the new buffer's size. + * extends past the end of the buffer's new size. */ - range->size = MIN(new_size - range->offset, range->size); + range->end = MIN(new_size, range->end); + range->size = range->end - range->size; break; } } @@ -2000,6 +2027,28 @@ (void) dbuf_dirty(db, tx); } +static void +dbuf_dirty_record_check_ranges(dbuf_dirty_record_t *dr) +{ +#ifdef ZFS_DEBUG + dbuf_dirty_leaf_t *dl; + dbuf_dirty_range_t *prev, *cur, *next; + + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + return; + + dl = &dr->dt.dl; + + prev = next = NULL; + for (cur = list_head(&dl->write_ranges); cur != NULL; + prev = cur, cur = next) { + next = list_next(&dl->write_ranges, cur); + ASSERT(prev == NULL || cur->start > prev->end); + ASSERT(next == NULL || cur->end < next->start); + } +#endif +} + /** * \brief Record a write range for the associated dirty record. * @@ -2010,100 +2059,48 @@ void dbuf_dirty_record_add_range(dbuf_dirty_record_t *dr, int offset, int size) { - dmu_buf_impl_t *db; - struct dbuf_dirty_leaf *dl; - dbuf_dirty_range_t *old_range, *new_range; - int new_end, old_end; - boolean_t use_existing; + dbuf_dirty_range_t *next_range, *old_range, *range; + dbuf_dirty_leaf_t *dl; dl = &dr->dt.dl; - db = dr->dr_dbuf; /* Write ranges do not apply to indirect blocks. */ - ASSERT(db->db_level == 0); - ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(dr->dr_dbuf->db_level == 0); + ASSERT(MUTEX_HELD(&dr->dr_dbuf->db_mtx)); - /* Discover whether an existing entry overlaps. 
*/ - old_range = list_head(&dl->write_ranges); - new_end = offset + size; - old_end = 0; - use_existing = B_FALSE; - while (old_range != NULL) { - old_end = old_range->offset + old_range->size; + range = kmem_zalloc(sizeof(dbuf_dirty_range_t), KM_SLEEP); + range->start = offset; + range->size = size; + range->end = offset + size; - /* - * Adjacent ranges count as "overlapping". A range's - * calculated end comes after the last byte. Thus, an - * adjacent range's end value will equal the other one's - * offset, and vice versa. - */ - - /* - * The existing range starts after the end of the new one, - * meaning the new one must be inserted before this one. - */ - if (old_range->offset > (new_end + 1)) - break; - -#define IN_RANGE(val, start, end) (val) >= (start) && (val) <= (end) - /* - * If the existing range's start or end falls within the new - * one, expand it to include the new one. - */ - if (IN_RANGE(old_range->offset, offset, new_end) - || IN_RANGE(old_end, offset, new_end)) { - use_existing = B_TRUE; - break; + /* + * This loop acts as an accumulator, merging dirty ranges if they + * overlap or are adjacent, and in so doing leaving behind only one + * range. But if the new range must be inserted separately, it will + * do so using the old range as a marker. + */ + for (old_range = list_head(&dl->write_ranges); + old_range != NULL && old_range->start <= range->end; + old_range = next_range) { + next_range = list_next(&dl->write_ranges, old_range); + if (range->start <= old_range->end && + range->end >= old_range->start) { + old_range->start = MIN(range->start, old_range->start); + old_range->end = MAX(range->end, old_range->end); + old_range->size = old_range->end - old_range->start; + list_remove(&dl->write_ranges, old_range); + atomic_subtract_64(&dirty_ranges_in_flight, 1); + kmem_free(range, sizeof(dbuf_dirty_range_t)); + range = old_range; } - - /* Try the next range, since this one didn't fit. */ - old_range = list_next(&dl->write_ranges, old_range); } - if (use_existing) { - /* - * Update the overlapping range entry so that it is a - * superset of the old entry and the new one. - */ - old_range->offset = MIN(offset, old_range->offset); - old_range->size = MAX(new_end, old_end) - old_range->offset; - dprintf_dbuf(dr->dr_dbuf, "%s: dr %p: modify range offset " - "0x%x size 0x%x\n", __func__, dr, offset, size); + /* If old_range is NULL, this does a list_insert_tail(). */ + list_insert_before(&dl->write_ranges, old_range, range); -#define LIST_HAS_ONE_ENTRY(list) \ - (list_head(list) == list_tail(list) && !list_is_empty(list)) - /* - * If there's a single write range and it now covers the - * entire buffer, the caller will finish filling, so clear - * any READ or PARTIAL bit that may be set. - */ - if (LIST_HAS_ONE_ENTRY(&db->db_dirty_records) && - old_range->offset == 0 && old_range->size == db->db.db_size) - DBUF_STATE_CHANGE(db, =, DB_FILL, - "caller is about to finish filling buffer"); - return; - } - - /* No overlapping entry was found, so create a new one. */ - new_range = kmem_zalloc(sizeof(dbuf_dirty_range_t), KM_NOSLEEP); - new_range->offset = offset; - new_range->size = size; - - dprintf_dbuf(dr->dr_dbuf, "%s: dr %p: new range offset 0x%x size 0x%x\n", - __func__, dr, offset, size); - - /* - * Insert the new range: - * - At the end of the list (old_range == NULL): - * - If the list is empty. - * - If no entry with a later offset was found. - * - Before another range (old_range != NULL): - * - If that range starts after the end of the new one. 
- */ - list_insert_before(&dl->write_ranges, old_range, new_range); - + atomic_add_64(&dirty_ranges_in_flight, 1); atomic_add_64(&dirty_ranges_total, 1); - atomic_add_64(&dirty_ranges_in_flight, 1); + dbuf_dirty_record_check_ranges(dr); } /** @@ -2448,6 +2445,7 @@ ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); + ASSERT(list_is_empty(&db->db_dirty_records)); dbuf_evict_user(db); @@ -2665,6 +2663,7 @@ mutex_enter(&db->db_mtx); ASSERT(refcount_is_zero(&db->db_holds)); + ASSERT(list_is_empty(&db->db_dirty_records)); if (db->db_state != DB_EVICTING) { ASSERT(db->db_state == DB_CACHED); @@ -3000,6 +2999,15 @@ * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ +#ifdef ZFS_DEBUG + if ((db->db_state & (DB_UNCACHED|DB_NOFILL)) == 0) { + __dprintf(__FILE__, __func__, __LINE__, + "%s: dbuf invalid without ARC buffer: " + "state %d lvl=%d blkid=%d obj=%d\n", + __func__, db->db_state, db->db_level, + db->db_blkid, db->db.db_object); + } +#endif ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_evict(db); @@ -3282,7 +3290,7 @@ * If the dirty record is not CACHED, make it happen. */ if (db->db_state != DB_CACHED) { - dbuf_transition_to_read(db, DB_UNCACHED|DB_PARTIAL); + dbuf_transition_to_read(db, DB_PARTIAL); while (db->db_state & DB_READ) cv_wait(&db->db_changed, &db->db_mtx); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#12 (text) ==== @@ -313,7 +313,8 @@ typedef struct dbuf_dirty_range { list_node_t write_range_link; - int offset; + int start; + int end; int size; } dbuf_dirty_range_t; Change 517413 by willa@willa_repo on 2011/12/19 00:20:42 Reinstate the PARTIAL -> FILL state transition. This was present in the old dbuf_dirty_record_add_range() implementation and was left behind in the rewrite. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#32 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#32 (text) ==== @@ -2061,12 +2061,14 @@ { dbuf_dirty_range_t *next_range, *old_range, *range; dbuf_dirty_leaf_t *dl; + dmu_buf_impl_t *db; dl = &dr->dt.dl; + db = dr->dr_dbuf; /* Write ranges do not apply to indirect blocks. */ - ASSERT(dr->dr_dbuf->db_level == 0); - ASSERT(MUTEX_HELD(&dr->dr_dbuf->db_mtx)); + ASSERT(db->db_level == 0); + ASSERT(MUTEX_HELD(&db->db_mtx)); range = kmem_zalloc(sizeof(dbuf_dirty_range_t), KM_SLEEP); range->start = offset; @@ -2095,11 +2097,17 @@ } } - /* If old_range is NULL, this does a list_insert_tail(). */ - list_insert_before(&dl->write_ranges, old_range, range); + /* If the writer will finish filling, go directly to DB_FILL. */ + if (range->start == 0 && range->size == db->db.db_size) { + DBUF_STATE_CHANGE(db, =, DB_FILL, "writer fully filled"); + kmem_free(range, sizeof(dbuf_dirty_range_t)); + } else { + /* If old_range is NULL, this does a list_insert_tail(). 
*/
+		list_insert_before(&dl->write_ranges, old_range, range);
+		atomic_add_64(&dirty_ranges_in_flight, 1);
+		atomic_add_64(&dirty_ranges_total, 1);
+	}

-	atomic_add_64(&dirty_ranges_in_flight, 1);
-	atomic_add_64(&dirty_ranges_total, 1);
+	dbuf_dirty_record_check_ranges(dr);
 }

Change 518498 by justing@justing-ns1 on 2011/12/30 15:00:27

	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:
	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c:
	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h:
		Move state transition prerequisite testing out of
		dbuf_transition_to_read() and into its callers.  This
		clarifies why dbuf_transition_to_read() is called and
		simplifies its implementation.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#33 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#12 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#13 edit

Differences ...

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#33 (text) ====

@@ -94,8 +94,6 @@
 SYSCTL_COUNTER_U(will_fill_cached, "will_fill CACHED");
 SYSCTL_COUNTER_U(will_fill_uncached, "will_fill UNCACHED");

-SYSCTL_COUNTER_U(entered_read_uncached, "entered read UNCACHED");
-SYSCTL_COUNTER_U(entered_read_cached, "entered read CACHED");
 SYSCTL_COUNTER_U(exited_read_uncached, "exited read UNCACHED");
 SYSCTL_COUNTER_U(exited_read_cached, "exited read CACHED");

@@ -2112,47 +2110,44 @@
 /**
- * \brief Make the dbuf transition to READ if it is in certain states.
+ * \brief Issue an async read that will eventually transition a dbuf
+ *        into the CACHED state.
+ *
+ * \param db  Dbuf to transition
 *
- * \param db         Dbuf to transition
- * \param state_req  States required to perform transition
+ * \invariant The dbuf's mutex must be held.
 *
- * \invariant The dbuf's mutex must be held.
+ * Upon return, the dbuf will either be in the READ (async READ
+ * pending), or CACHED (read satisfied by a cache hit or zero fill for
+ * an object hole) state.
 *
- * The dbuf's mutex will be dropped if a read is actually issued.
+ * \note The dbuf's mutex is dropped temporarily while the read is
+ *       scheduled.  Callers must reverify, if necessary, any state
+ *       protected by the dbuf mutex.
*/ void -dbuf_transition_to_read(dmu_buf_impl_t *db, int state_req) +dbuf_transition_to_read(dmu_buf_impl_t *db) { int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NEVERWAIT; dnode_t *dn; zio_t *zio = NULL; ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_state & (DB_PARTIAL|DB_UNCACHED)); - if (db->db_state == DB_CACHED) - atomic_add_64(&entered_read_cached, 1); - else - atomic_add_64(&entered_read_uncached, 1); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED); + DB_DNODE_EXIT(db); - if (db->db_state & state_req) { - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, - ZIO_FLAG_MUSTSUCCEED); - DB_DNODE_EXIT(db); - } + mutex_exit(&db->db_mtx); + (void) dbuf_read(db, zio, rf); + (void) zio_nowait(zio); + mutex_enter(&db->db_mtx); - /* Issue the asynchronous read, if applicable */ - if (zio != NULL) { - mutex_exit(&db->db_mtx); - (void) dbuf_read(db, zio, rf); - (void) zio_nowait(zio); - mutex_enter(&db->db_mtx); - } - if (db->db_state == DB_CACHED) atomic_add_64(&exited_read_cached, 1); else @@ -2195,6 +2190,7 @@ return; } DB_DNODE_EXIT(db); + if (db->db_state == DB_CACHED) atomic_add_64(&will_dirty_range_cached, 1); else @@ -2204,8 +2200,9 @@ * Only issue a read if we start writing inside the block rather * than either at the beginning (forward) or end (backward) */ - if (offset != 0 && (offset + size != db->db.db_size)) - dbuf_transition_to_read(db, DB_UNCACHED); + if (db->db_state == DB_UNCACHED && offset != 0 && + (offset + size != db->db.db_size)) + dbuf_transition_to_read(db); #ifdef ZFS_DIRTY_SYNC_READS /* XXX TEMP Wait for the async read to complete */ @@ -2350,8 +2347,8 @@ while (db->db_state & DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); - /* If the steady state is PARTIAL, transition to READ. */ - dbuf_transition_to_read(db, DB_PARTIAL); + if (db->db_state == DB_PARTIAL) + dbuf_transition_to_read(db); /* * The buffer is waiting for a read, so simply update the associated @@ -3298,7 +3295,9 @@ * If the dirty record is not CACHED, make it happen. */ if (db->db_state != DB_CACHED) { - dbuf_transition_to_read(db, DB_PARTIAL); + if (db->db_state & DB_PARTIAL) + dbuf_transition_to_read(db); + while (db->db_state & DB_READ) cv_wait(&db->db_changed, &db->db_mtx); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#12 (text) ==== @@ -1478,9 +1478,12 @@ * can be completed. 
*/
 	if (db->db_state != DB_CACHED) {
-		dbuf_transition_to_read(db, DB_UNCACHED|DB_PARTIAL);
+		if (db->db_state & (DB_UNCACHED|DB_PARTIAL))
+			dbuf_transition_to_read(db);
+
 		while (db->db_state & DB_READ)
 			cv_wait(&db->db_changed, &db->db_mtx);
+		/* The dbuf had an I/O error or was freed in flight */
 		if (db->db_state == DB_UNCACHED) {
 			mutex_exit(&db->db_mtx);

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#13 (text) ====

@@ -450,7 +450,7 @@
 void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_will_dirty_range(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset,
     int size);
-void dbuf_transition_to_read(dmu_buf_impl_t *db, int state_req);
+void dbuf_transition_to_read(dmu_buf_impl_t *db);
 void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);

Change 519146 by justing@justing-ns1 on 2012/01/03 15:42:09

	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:
		dbuf_dirty_record_create()'s use is local to this file.
		Make it static.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#34 edit

Differences ...

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#34 (text) ====

@@ -1495,7 +1495,7 @@
  * \invariant A dirty record must not already exist for the transaction's
  *            transaction group.
  */
-dbuf_dirty_record_t *
+static dbuf_dirty_record_t *
 dbuf_dirty_record_create(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;

Change 519151 by justing@justing-ns1 on 2012/01/03 15:56:57

	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:
		Style(9): In dbuf_dirty(), add curly braces around a block
		consisting of a four-line comment followed by a single
		statement.

	No functional changes.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#35 edit

Differences ...

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#35 (text) ====

@@ -1753,12 +1753,13 @@
 			/* Reset immediate write sync state if needed */
 			dbuf_unoverride(dr);
 			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
-			    db->db_state != DB_NOFILL)
+			    db->db_state != DB_NOFILL) {
 				/*
 				 * Notify ARC that the buffer will be
 				 * modified, requiring a new checksum.
 				 */
 				arc_buf_thaw(db->db_buf);
+			}
 		}
 		mutex_exit(&db->db_mtx);
 		return (dr);

Change 519152 by justing@justing-ns1 on 2012/01/03 16:07:20

	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:
		In dbuf_read_done(), when a read fails on a dbuf with
		unresolved dirty ranges, update the dbuf via
		dbuf_read_complete() rather than performing some, but not
		all, of the required operations inline.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#36 edit

Differences ...
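The consolidation in change 519152 above is an instance of a common error-path pattern: normalize the failure case first, then drive it through the same completion routine as a success. A minimal userland sketch of the idea; struct rbuf, complete_read(), and read_done_model() are invented names, not the real ZFS functions.

    #include <string.h>

    struct rbuf {
            unsigned char data[512];
    };

    /*
     * Common completion path; dbuf_read_complete() plays this role in
     * the patch, resolving dirty ranges and setting the final state.
     */
    static void
    complete_read(struct rbuf *b, int have_dirty)
    {
            (void)b;
            (void)have_dirty;
    }

    static void
    read_done_model(struct rbuf *b, int error, int have_dirty)
    {
            /*
             * On failure with unresolved dirty data, substitute
             * well-defined (zeroed) contents, then take the same path
             * as a successful read instead of duplicating parts of it
             * inline.
             */
            if (error != 0 && have_dirty)
                    memset(b->data, 0, sizeof (b->data));
            complete_read(b, have_dirty);
    }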
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#36 (text) ==== @@ -800,10 +800,8 @@ */ bzero(buf->b_data, arc_buf_size(buf)); arc_buf_freeze(buf); - dbuf_resolve_ranges(db, buf); + dbuf_read_complete(db, buf); atomic_add_64(&dirty_writes_lost, 1); - DBUF_STATE_CHANGE(db, =, DB_CACHED, - "read failed with oustanding writes failed"); } else { ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; Change 519154 by justing@justing-ns1 on 2012/01/03 16:33:40 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: In dbuf_read_done(), we must wait for both DB_READ and DB_FILL to clear in order to guard against a filler sneaking in while the dbuf mutex is dropped by our cv_wait(). Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#37 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#37 (text) ==== @@ -1029,8 +1029,7 @@ /* Skip the wait on the caller's request. */ if ((flags & DB_RF_NEVERWAIT) == 0) { mutex_enter(&db->db_mtx); - /* Already waited for DB_FILL set above */ - while (db->db_state & DB_READ) { + while (db->db_state & (DB_READ|DB_FILL)) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); cv_wait(&db->db_changed, &db->db_mtx); Change 519167 by justing@justing-ns1 on 2012/01/03 17:19:26 cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: Simplify the logic in dbuf_read_complete(), in the case where a read is resolving a buffer with dirty records. In this case, the buffer containing the read should always be discarded. Replace conditional logic with ASSERTS() to this effect. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#38 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#38 (text) ==== @@ -711,36 +711,35 @@ { if (db->db_level == 0 && db->db_dirtycnt > 0) { + ASSERT(db->db_buf != buf); + ASSERT(db->db_state == DB_CACHED || + db->db_state == DB_UNCACHED || + db->db_state == DB_READ); + /* * Fill any holes in the dbuf's dirty records * with the original block we read. */ dbuf_resolve_ranges(db, buf); - - ASSERT(db->db_state == DB_CACHED || - db->db_state == DB_UNCACHED || - db->db_state == DB_READ); if (db->db_state == DB_READ) { /* * The most recent version of this block * was waiting on this read. Transition * to cached. */ - if (db->db_buf == NULL) - dbuf_set_data(db, buf); + ASSERT(db->db_buf != NULL); DBUF_STATE_CHANGE(db, =, DB_CACHED, "resolve of records in READ state"); } - if (db->db_buf != buf) { - /* - * The provided buffer is no longer relevant to the - * current transaction group. Discard it. - */ - arc_release(buf, db); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - atomic_add_64(&threw_away_level0_read, 1); - } + + /* + * The provided buffer is no longer relevant to the + * current transaction group. Discard it. + */ + arc_release(buf, db); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + atomic_add_64(&threw_away_level0_read, 1); } else if (db->db_state == DB_READ) { /* * Read with no dirty data. Use the buffer we Change 519183 by justing@justing-ns1 on 2012/01/03 20:49:58 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: In dbuf_fix_old_data(), when operating on a dbuf in the PARTIAL or READ states, give the open transaction group (db->db_buf) the new buffer instead of replacing the buffer in the previous dirty record. 
	This avoids a superfluous bcopy and simplifies dbuf resolution from
	syncer context.  With this change, the syncer knows that the buffer
	in an unresolved dirty record will not change out from under it
	(between issuing a resolving read and issuing the syncing write).

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#39 edit

Differences ...

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#39 (text) ====

@@ -1137,11 +1137,28 @@
 		int size = db->db.db_size;
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa;
+		arc_buf_t *buf;

 		atomic_add_64(&fix_old_data_more_dirties, 1);
 		DB_GET_SPA(&spa, db);
-		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
-		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+		buf = arc_buf_alloc(spa, size, db, type);
+
+		if (db->db_state & (DB_READ|DB_PARTIAL)) {
+			/*
+			 * Any relevant data from the last dr will be
+			 * copied into the new buffer based on the
+			 * write ranges placed on this buffer once
+			 * the dbuf is resolved.
+			 */
+			dbuf_set_data(db, buf);
+		} else {
+			/*
+			 * Disassociate the dbuf from future syncer
+			 * operation on the previous dirty record.
+			 */
+			bcopy(db->db.db_data, buf->b_data, size);
+			dr->dt.dl.dr_data = buf;
+		}
 	} else {
 		atomic_add_64(&fix_old_data_null, 1);
 		dbuf_set_data(db, NULL);
 	}

Change 519187 by justing@justing-ns1 on 2012/01/03 21:23:44

	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:
		In dbuf_free_range(), issue a resolving read for any
		outstanding dirty records, since there is no chance for a
		write to fully fill their buffers and the dbuf is
		transitioning to a state (UNCACHED or CACHED) that will
		cause its PARTIAL state to be lost.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#40 edit

Differences ...

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#40 (text) ====

@@ -1301,6 +1301,13 @@

 		dr = list_head(&db->db_dirty_records);
 		if (dr != NULL) {
+			boolean_t partial;
+
+			/*
+			 * Record partial state before any action is taken
+			 * that might cause us to lose this information.
+			 */
+			partial = (db->db_state & DB_PARTIAL) ? TRUE : FALSE;
 			if (dr->dr_txg == txg) {
 				/*
 				 * This buffer is "in-use", re-adjust the file
@@ -1319,6 +1326,8 @@
 				 * to this buffer.
 				 */
 				dbuf_dirty_record_cleanup_ranges(dr);
+
+				dr = list_next(&db->db_dirty_records, dr);
 			} else {
 				/*
 				 * This dbuf is not dirty in the open context.
@@ -1328,6 +1337,14 @@
 				 */
 				dbuf_fix_old_data(db, txg);
 			}
+
+			/*
+			 * If the previous block contents are needed in
+			 * an earlier transaction group, and we have yet
+			 * to fetch that data, do so now.
+			 */
+			if (dr && partial)
+				dbuf_transition_to_read(db);
 		}

Change 519188 by justing@justing-ns1 on 2012/01/03 21:35:25

	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:
		Change dbuf_write() to return the zio it creates instead of
		always assigning it to dr->dr_zio, and defer that assignment
		to its callers.

		This changeset has no functional impact, but paves the way
		for dbuf_sync_leaf() to use dr->dr_zio as an interlock for
		postponing the syncer's write until dbuf_read_done().

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#41 edit

Differences ...
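The interlock that change 519188 above prepares for can be modeled with a mutex and a published callback: whichever side finishes second, syncer or reader, fires the deferred write exactly once. A speculative userland sketch using pthreads; struct sync_state and these function names are invented for illustration, with the lock, flag, and callback standing in for db_mtx, an empty write-range list, and dr->dr_zio.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct sync_state {
            pthread_mutex_t lock;
            bool resolved;          /* read-completion has run */
            void (*work)(void);     /* deferred write, like dr->dr_zio */
    };

    static void
    syncer_write(struct sync_state *s, void (*write_fn)(void))
    {
            bool run_now;

            pthread_mutex_lock(&s->lock);
            run_now = s->resolved;
            if (!run_now)
                    s->work = write_fn;     /* reader will fire it */
            pthread_mutex_unlock(&s->lock);

            if (run_now)
                    write_fn();             /* read already resolved */
    }

    static void
    read_complete(struct sync_state *s)
    {
            void (*fire)(void);

            pthread_mutex_lock(&s->lock);
            s->resolved = true;
            fire = s->work;                 /* deferred by the syncer? */
            pthread_mutex_unlock(&s->lock);

            if (fire != NULL)
                    fire();
    }

Because both sides inspect and update the shared state inside a single critical section, only two interleavings exist, and in each exactly one side issues the write.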
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#41 (text) ====

@@ -39,7 +39,8 @@
 static void dbuf_destroy(dmu_buf_impl_t *db);
 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
+static zio_t *dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data,
+    dmu_tx_t *tx);

 /** \brief Global data structures and functions for the dbuf cache. */
@@ -3258,10 +3259,9 @@
 	db->db_data_pending = dr;

 	mutex_exit(&db->db_mtx);
-	dbuf_write(dr, db->db_buf, tx);
+	zio = dr->dr_zio = dbuf_write(dr, db->db_buf, tx);
+	mutex_enter(&dr->dt.di.dr_mtx);
-	zio = dr->dr_zio;
-	mutex_enter(&dr->dt.di.dr_mtx);
 	dbuf_sync_list(&dr->dt.di.dr_children, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
@@ -3414,7 +3414,7 @@

 	mutex_exit(&db->db_mtx);

-	dbuf_write(dr, *datap, tx);
+	dr->dr_zio = dbuf_write(dr, *datap, tx);

 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
@@ -3695,7 +3695,7 @@
 /**
 * \brief Commit a dirty buffer to disk.
 */
-static void
+static zio_t *
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
@@ -3706,6 +3706,7 @@
 	zbookmark_t zb;
 	zio_prop_t zp;
 	zio_t *pio; /* parent I/O */
+	zio_t *dr_zio;
 	int wp_flag = 0;

 	DB_DNODE_ENTER(db);
@@ -3769,27 +3770,29 @@
 		 * its block pointer override needs to be handled here.
 		 */
 		ASSERT(db->db_state != DB_NOFILL);
-		dr->dr_zio = zio_write(pio, os->os_spa, txg,
+		dr_zio = zio_write(pio, os->os_spa, txg,
 		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
 		    dbuf_write_override_ready, dbuf_write_override_done, dr,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
-		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+		zio_write_override(dr_zio, &dr->dt.dl.dr_overridden_by,
 		    dr->dt.dl.dr_copies);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
-		dr->dr_zio = zio_write(pio, os->os_spa, txg,
+		dr_zio = zio_write(pio, os->os_spa, txg,
 		    db->db_blkptr, NULL, db->db.db_size, &zp,
 		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		ASSERT(arc_released(data));
-		dr->dr_zio = arc_write(pio, os->os_spa, txg,
+		dr_zio = arc_write(pio, os->os_spa, txg,
 		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
 		    dbuf_write_ready, dbuf_write_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
+
+	return (dr_zio);
 }

Change 519189 by justing@justing-ns1 on 2012/01/03 21:45:05

	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:
		Fix several issues in dbuf_assign_arcbuf():
		- Transition from PARTIAL to READ or CACHED before waiting
		  for any fillers to clear.  dbuf_transition_to_read()
		  drops the dbuf mtx, potentially allowing a filler to
		  sneak in.
		- dbuf_dirty_record_create() always allocates an anonymous
		  arc buffer for the record.  Note, via an 'XXX' comment,
		  that we are allocating a buffer we will immediately
		  release, and update the logic so this buffer isn't
		  leaked.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#42 edit

Differences ...
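The first bullet above reflects a general condition-variable rule that several of these changes (519154, 519189, and later 519349) apply: any step that may drop the lock must precede the wait loop, and the wait predicate must re-test every flag a concurrent thread could have set while the lock was out. A minimal sketch, assuming pthreads and invented ST_* flags in place of the DB_* dbuf states.

    #include <pthread.h>

    #define ST_READ 0x1     /* async read outstanding (cf. DB_READ) */
    #define ST_FILL 0x2     /* a filler is writing (cf. DB_FILL) */

    struct dbuf_model {
            pthread_mutex_t lock;
            pthread_cond_t changed;
            int state;
    };

    /* Called with db->lock held. */
    static void
    wait_until_settled(struct dbuf_model *db)
    {
            /*
             * pthread_cond_wait() drops the lock, so a filler can set
             * ST_FILL between wakeups; waiting on ST_READ alone would
             * let the caller proceed while a fill is still in progress.
             */
            while (db->state & (ST_READ | ST_FILL))
                    pthread_cond_wait(&db->changed, &db->lock);
    }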
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#42 (text) ==== @@ -2376,12 +2376,12 @@ mutex_enter(&db->db_mtx); + if (db->db_state & DB_PARTIAL) + dbuf_transition_to_read(db); + while (db->db_state & DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_PARTIAL) - dbuf_transition_to_read(db); - /* * The buffer is waiting for a read, so simply update the associated * dirty record, using the buffer provided. @@ -2393,15 +2393,16 @@ ; if (dr == NULL) /* No dirty record for this transaction yet */ + /* XXX Avoid superfluous arc buffer allocation? */ dr = dbuf_dirty_record_create(db, tx); else { /* Remove the old dirty data for this transaction */ dbuf_dirty_record_cleanup_ranges(dr); - if (dr->dt.dl.dr_data != NULL) { - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db) == 1); - } } + ASSERT(dr->dt.dl.dr_data != NULL); + ASSERT(arc_released(dr->dt.dl.dr_data)); + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); + /* * Assign the ARC buffer to the dirty record and record its * write range as covering the whole buffer, so that when Change 519190 by justing@justing-ns1 on 2012/01/03 22:00:02 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: In dbuf_will_dirty_range(), ensure that only a single filler is active at a time on a given dbuf. Otherwise the DB_FILL flag may be unintentionally cleared before all fillers have finished modifying the buffer. This check is made after any potential calls to dbuf_transition_to_read() since that function can drop the dbuf's mutex, allowing a filler to sneak in. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#43 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#43 (text) ==== @@ -2247,6 +2247,11 @@ if (db->db_state == DB_UNCACHED) DBUF_STATE_CHANGE(db, =, DB_PARTIAL, "new buffer about to be written without async read"); + + /* Only one filler allowed at a time. */ + while (db->db_state & DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + /* * Now set the FILL bit, since the caller is about to write, but * only if the dbuf isn't already CACHED. Change 519191 by justing@justing-ns1 on 2012/01/03 22:25:06 Perform asynchronous COW fault resolution from the syncer context. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: o In dbuf_sync_leaf(), issue the asynchronous, resolving, read if the dbuf is in the PARTIAL state, but do not wait for it to complete. Instead, construct the syncer's write zio immediately but only execute it if the dirty record has resolved. For dirty records that are unresolved at the time the write zio is constructed, we assign dr->dr_zio under protection of the dbuf's mutex so that dbuf_read_done() cannot race with dbuf_sync_leaf(). o In dbuf_read_done(), under the protection of the dbuf's mutex, execute the syncer's write zio (as found in db->db_data_pending->dr_zio) after resolving any dirty records for the dbuf. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#44 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#44 (text) ==== @@ -741,6 +741,11 @@ arc_release(buf, db); VERIFY(arc_buf_remove_ref(buf, db) == 1); atomic_add_64(&threw_away_level0_read, 1); + + /* Dispatch any deferred syncer writes. 
*/ + if (db->db_data_pending != NULL && + db->db_data_pending->dr_zio != NULL) + zio_nowait(db->db_data_pending->dr_zio); } else if (db->db_state == DB_READ) { /* * Read with no dirty data. Use the buffer we @@ -3320,23 +3325,23 @@ dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; + zio_t *zio; uint64_t txg = tx->tx_txg; + boolean_t resolve_pending; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); - /* - * XXX TEMP - * If the dirty record is not CACHED, make it happen. - */ - if (db->db_state != DB_CACHED) { - if (db->db_state & DB_PARTIAL) - dbuf_transition_to_read(db); - - while (db->db_state & DB_READ) - cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state & DB_PARTIAL) { + /* + * Time has run out for waiting on any writer to fill + * this buffer. + */ + ASSERT(arc_released(*datap)); + dbuf_transition_to_read(db); + ASSERT(db->db_state & (DB_CACHED|DB_READ)); } /* @@ -3346,11 +3351,15 @@ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); - } else if (db->db_state == DB_FILL) { - /* This buffer was freed and is now being re-filled */ + } else if (db->db_state & DB_FILL) { + /* + * This buffer is being modified. Those modifications + * should be in a newer transaction group and not + * reference the data we are about to write. + */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); + ASSERT(db->db_state & (DB_CACHED|DB_READ|DB_NOFILL)); } DBUF_VERIFY(db); @@ -3394,7 +3403,11 @@ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } - if (db->db_state != DB_NOFILL && + /* Remember if we need to defer write execution to dbuf_read_done(). */ + resolve_pending = !list_is_empty(&dr->dt.dl.write_ranges); + + if ((db->db_state & DB_NOFILL) == 0 && + resolve_pending == FALSE && dn->dn_object != DMU_META_DNODE_OBJECT && refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && @@ -3415,12 +3428,31 @@ *datap = arc_buf_alloc(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } - /* notify that the dirty record is about to write */ + /* Notify the world that this dirty record is about to write. */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); - dr->dr_zio = dbuf_write(dr, *datap, tx); + zio = dbuf_write(dr, *datap, tx); + + if (resolve_pending) { + + /* Resolve race with dbuf_read_done(). */ + mutex_enter(&db->db_mtx); + dr->dr_zio = zio; + resolve_pending = !list_is_empty(&dr->dt.dl.write_ranges); + mutex_exit(&db->db_mtx); + + if (resolve_pending) { + /* + * Resolve still pending. Let dbuf_read_done() + * fire the write. + */ + DB_DNODE_EXIT(db); + return; + } + } else + dr->dr_zio = zio; ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { Change 519192 by justing@justing-ns1 on 2012/01/03 22:26:50 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: Conform conditional to existing style. No functional changes. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#45 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#45 (text) ==== @@ -1349,7 +1349,7 @@ * an earlier transaction group, and we have yet * to fetch that data, do so now. 
*/ - if (dr && partial) + if (dr != NULL && partial == TRUE) dbuf_transition_to_read(db); } /* dbuf_fix_old_data above may have gone to DB_UNCACHED. */ Change 519349 by willa@willa_repo on 2012/01/04 18:00:28 A few minor changes for parallel syncer resolves. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: - Extend change 519154 to dbuf_undirty() and dmu_sync(), which currently only cv_wait() for DB_READ, meaning a filler could grab the mutex and change the state before the loop check. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - Revise the now inaccurate comment in dbuf_assign_arcbuf(). We changed the write range list semantics a while back to denote a fully valid buffer as one having no associated write ranges, rather than a single range covering the entire buffer. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#46 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#13 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#46 (text) ==== @@ -1948,7 +1948,7 @@ * we should be able to undirty immediately and disassociate the * read from this dbuf before it completes. */ - while (db->db_state & DB_READ) + while (db->db_state & (DB_READ|DB_FILL)) cv_wait(&db->db_changed, &db->db_mtx); DB_DNODE_ENTER(db); @@ -2413,11 +2413,7 @@ ASSERT(arc_released(dr->dt.dl.dr_data)); VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); - /* - * Assign the ARC buffer to the dirty record and record its - * write range as covering the whole buffer, so that when - * the READ is later resolved, the new buffer stays. - */ + /* Assign the ARC buffer to the dirty record. */ dr->dt.dl.dr_data = buf; dbuf_set_data(db, buf); mutex_exit(&db->db_mtx); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#13 (text) ==== @@ -1481,7 +1481,7 @@ if (db->db_state & (DB_UNCACHED|DB_PARTIAL)) dbuf_transition_to_read(db); - while (db->db_state & DB_READ) + while (db->db_state & (DB_READ|DB_FILL)) cv_wait(&db->db_changed, &db->db_mtx); /* The dbuf had an I/O error or was freed in flight */ Change 519552 by willa@willa_repo on 2012/01/05 18:42:53 Prototype usage of dmu_assign_arcbuf() in zvol_write(). Compile tested and if 0'd for now. If a write is offset aligned and blocksize or larger, this should be faster (or at least cheaper) than dmu_write(). Using dmu_assign_arcbuf(), by virtue of performing uiocopy() before creating the transaction, also avoids holding up ZFS transactions due to page faults. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#3 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#3 (text) ==== @@ -1408,6 +1408,50 @@ while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio->uio_loffset; + + /* XXX + * zfs_write() uses dbuf_assign_arcbuf() in conjunction with + * uiocopy() for recordsize write segments to avoid holding + * up transactions on page faults (see the XXX comment in + * dmu_write_uio_dnode()). This function should do + * the same thing. 
Prototype follows: + */ +#if 0 + arc_buf_t *buf = NULL; + + bytes = MIN(uio->uio_resid, zv->zv_volblocksize); + if (bytes > volsize - off) /* don't write past the end */ + bytes = volsize - off; + + if (bytes >= zv->zv_volblocksize && + P2PHASE(off, zv->zv_volblocksize) == 0) { + buf = dmu_request_arcbuf(zv->zv_dbuf, bytes); + if (error = uiocopy(buf->b_data, bytes, UIO_WRITE, + uio, &cbytes)) { + dmu_return_arcbuf(buf); + break; + } + } + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + if (buf != NULL) + dbuf_return_arcbuf(buf); + break; + } + if (buf != NULL) { + dmu_assign_arcbuf(zv->zv_dbuf, off, buf, tx); + error = 0; + } else { + error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); + } + if (error == 0) + zvol_log_write(zv, tx, off, bytes, sync); + dmu_tx_commit(tx); +#else dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ @@ -1426,6 +1470,7 @@ if (error) break; +#endif } zfs_range_unlock(rl); if (sync) Change 519786 by willa@willa_repo on 2012/01/09 13:26:05 Fix regressions introduced by parallelizing syncer reads. Notably, this fixes truncate/trim (again). No new regressions found by the STF compared to SpectraBSD/head. Move the READ transition from dbuf_free_range() to dbuf_undirty(). It was being issued too late, so the db->db_buf != NULL assert in dbuf_read_complete() was firing. Consequently, make sure that dbuf_free_range() always clears the current transaction group if there are any dirty records, regardless of the state. Now, the async read performed to resolve the buffer will find the current transaction group truncated (CACHED with a zero filled buffer) but still resolve older transaction groups; previously it saw the dbuf UNCACHED with a NULL buffer for the current transaction group. Add a note to dbuf_undirty() that removing the wait from there causes ZFS to deadlock. This should get fixed later so that truncate calls don't have to potentially wait for I/O. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#47 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#47 (text) ==== @@ -1307,13 +1307,6 @@ dr = list_head(&db->db_dirty_records); if (dr != NULL) { - boolean_t partial; - - /* - * Record partial state before any action is taken - * that might cause us to lose this information. - */ - partial = (db->db_state & DB_PARTIAL) ? TRUE : FALSE; if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file @@ -1332,8 +1325,6 @@ * to this buffer. */ dbuf_dirty_record_cleanup_ranges(dr); - - dr = list_next(&db->db_dirty_records, dr); } else { /* * This dbuf is not dirty in the open context. @@ -1343,23 +1334,23 @@ */ dbuf_fix_old_data(db, txg); } - + } + /* + * If cached, zero fill the buffer. + * + * Outstanding dirty records may need to be flushed. In + * that case, transition to cached and zero fill the buffer. + */ + if (db->db_state == DB_CACHED || db->db_dirtycnt > 0) { /* - * If the previous block contents are needed in - * an earlier transaction group, and we have yet - * to fetch that data, do so now. + * If there's only a reader, provide a fresh buffer. + * The reader may be a resolver rather than an user + * initiated reader, so don't assert DB_READ. 
*/ - if (dr != NULL && partial == TRUE) - dbuf_transition_to_read(db); - } - /* dbuf_fix_old_data above may have gone to DB_UNCACHED. */ - if (db->db_state != DB_UNCACHED) { - /* If there's only a reader, provide a fresh buffer. */ - if (db->db.db_data == NULL) { + if (db->db_buf == NULL) { arc_buf_t *buf; spa_t *spa; - ASSERT(db->db_state == DB_READ); DB_GET_SPA(&spa, db); buf = arc_buf_alloc(spa, db->db.db_size, db, DBUF_GET_BUFC_TYPE(db)); @@ -1944,9 +1935,18 @@ ASSERT(dr->dr_dbuf == db); /* + * The buffer was dirtied in a previous transaction group and needs + * to be resolved now. + */ + if (db->db_dirtycnt > 1 && (db->db_state & DB_PARTIAL)) + dbuf_transition_to_read(db); + + /* * XXX Wait for the buffer to be resolved. With additional accounting * we should be able to undirty immediately and disassociate the * read from this dbuf before it completes. + * + * XXX This wait should not be necessary, but ZFS deadlocks without it. */ while (db->db_state & (DB_READ|DB_FILL)) cv_wait(&db->db_changed, &db->db_mtx); Change 520294 by willa@willa_repo on 2012/01/11 12:36:27 Add the dbuf data structures that will be used for async I/O. The plan is to allocate these in the DMU layer when callers provide callbacks (and private callback data). When a hold is placed on an array of buffers, as dmu_read() and dmu_write() currently do, the array's members will be associated with the callback. Then, whenever the dbuf state changes, if the callback configuration specifies, the callback will be called. In this way, e.g. ZVOL or any other DMU consumer, can receive notification when a block changes state. This will likely work better than waiting for ZIOs because some dbufs may have data in the ARC already. For the zvol read case, this should work because it can implement a callback that figures out when all affected dbufs become CACHED, and therefore the I/O it issued to ZFS is complete. The intent is that a callback consumer will be responsible for cleaning up its dbuf_array_t once it is consumed. In this way, dmu_read() and dmu_write() can simply allocate it, issue all the I/Os, and exit. If no callback is passed in, then the DMU calls will be synchronous, and the DMU will be responsible for cleaning up. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#14 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#14 (text) ==== @@ -318,6 +318,44 @@ int size; } dbuf_dirty_range_t; +struct dbuf_array; +struct dmu_buf_impl; + +/** + * A callback function pointer which provides the callee, via dba.dba_private, + * a way to keep track of the state of an array of dbufs. + */ +typedef void(*dmu_callback_t)(struct dbuf_array *, struct dmu_buf_impl *, int); + +typedef struct dbuf_array { + + /** The set of dbufs in this array. */ + struct dmu_buf_impl **dba_dbp; + + /** The number of dbufs in the array. */ + size_t dba_count; + + /** The callback to call if the conditions are met. */ + dmu_callback_t dba_cb; + + /** The dbuf states when a callback may be called. */ + int dba_states; + + /** Private data for the callback. */ + void *dba_private; + +} dbuf_array_t; + +typedef struct dbuf_callback_node { + + /** This object's entry in the list in dmu_buf_impl_t. */ + list_node_t dbc_link; + + /** The dbuf array this callback is associated with. 
*/ + dbuf_array_t *dbc_dba; + +} dbuf_callback_node_t; + typedef struct dmu_buf_impl { /* * The following members are immutable, with the exception of @@ -391,6 +429,9 @@ /** List of dirty records for the buffer sorted newest to oldest. */ list_t db_dirty_records; + /** List of callbacks (see dbuf_callback_node_t). */ + list_t db_callbacks; + /** * Our link on the owner dnodes's dn_dbufs list. * Protected by its dn_dbufs_mtx. Change 520424 by willa@willa_repo on 2012/01/12 10:58:37 Remove sysctl counters that have outlived their usefulness. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#48 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#48 (text) ==== @@ -87,24 +87,8 @@ SYSCTL_COUNTER_I(dirty_ranges_in_flight, "number of dirty ranges in flight"); -SYSCTL_COUNTER_U(will_dirty, "number of times dirty ran"); -SYSCTL_COUNTER_U(will_dirty_range_cached, "number of times dirty range ran CACHED"); -SYSCTL_COUNTER_U(will_dirty_range_uncached, "number of times dirty range ran UNCACHED"); SYSCTL_COUNTER_U(dirty_ranges_total, "number of total dirty ranges"); - -SYSCTL_COUNTER_U(will_fill_cached, "will_fill CACHED"); -SYSCTL_COUNTER_U(will_fill_uncached, "will_fill UNCACHED"); - -SYSCTL_COUNTER_U(exited_read_uncached, "exited read UNCACHED"); -SYSCTL_COUNTER_U(exited_read_cached, "exited read CACHED"); - SYSCTL_COUNTER_U(dirty_writes_lost, "dirty writes list"); -SYSCTL_COUNTER_U(fill_freed_in_flight, "FILL freed in flight"); -SYSCTL_COUNTER_U(threw_away_level0_read, "threw away level-0 READ buffer"); -SYSCTL_COUNTER_U(fix_old_data_null, "fix_old_data NULL'd db_data"); -SYSCTL_COUNTER_U(fix_old_data_more_dirties, "fix_old_data dirty db_data"); -SYSCTL_COUNTER_U(free_range_cleared, "free_range no holds"); -SYSCTL_COUNTER_U(free_range_referenced, "free_range with holds"); static uint64_t dbuf_hash_count; @@ -740,7 +724,6 @@ */ arc_release(buf, db); VERIFY(arc_buf_remove_ref(buf, db) == 1); - atomic_add_64(&threw_away_level0_read, 1); /* Dispatch any deferred syncer writes. 
*/ if (db->db_data_pending != NULL && @@ -763,7 +746,6 @@ db->db_state == DB_UNCACHED); arc_release(buf, db); VERIFY(arc_buf_remove_ref(buf, db) == 1); - atomic_add_64(&threw_away_level0_read, 1); } } @@ -1145,7 +1127,6 @@ spa_t *spa; arc_buf_t *buf; - atomic_add_64(&fix_old_data_more_dirties, 1); DB_GET_SPA(&spa, db); buf = arc_buf_alloc(spa, size, db, type); @@ -1166,7 +1147,6 @@ dr->dt.dl.dr_data = buf; } } else { - atomic_add_64(&fix_old_data_null, 1); dbuf_set_data(db, NULL); } } @@ -1297,13 +1277,11 @@ /* All consumers are finished, so evict the buffer */ if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); - atomic_add_64(&free_range_cleared, 1); dbuf_clear(db); continue; } /* The dbuf is referenced */ - atomic_add_64(&free_range_referenced, 1); dr = list_head(&db->db_dirty_records); if (dr != NULL) { @@ -2184,11 +2162,6 @@ (void) dbuf_read(db, zio, rf); (void) zio_nowait(zio); mutex_enter(&db->db_mtx); - - if (db->db_state == DB_CACHED) - atomic_add_64(&exited_read_cached, 1); - else - atomic_add_64(&exited_read_uncached, 1); } #pragma weak dmu_buf_will_dirty_range = dbuf_will_dirty_range @@ -2223,16 +2196,10 @@ DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dbuf_will_dirty(db, tx); - atomic_add_64(&will_dirty, 1); return; } DB_DNODE_EXIT(db); - if (db->db_state == DB_CACHED) - atomic_add_64(&will_dirty_range_cached, 1); - else - atomic_add_64(&will_dirty_range_uncached, 1); - /* * Only issue a read if we start writing inside the block rather * than either at the beginning (forward) or end (backward) @@ -2300,11 +2267,6 @@ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - if (db->db_state == DB_CACHED) - atomic_add_64(&will_fill_cached, 1); - else - atomic_add_64(&will_fill_uncached, 1); - dbuf_noread(db); dr = dbuf_dirty(db, tx); @@ -2330,7 +2292,6 @@ if (db->db_level == 0 && db->db_freed_in_flight) { dbuf_dirty_record_t *dr; - atomic_add_64(&fill_freed_in_flight, 1); dr = list_head(&db->db_dirty_records); ASSERT(dr->dr_txg == tx->tx_txg); ASSERT(db->db_blkid != DMU_BONUS_BLKID); Change 520433 by willa@willa_repo on 2012/01/12 12:02:33 Fix a few more regressions involving split brain dbufs. The biggest problem was that the syncer would eventually wait on a partial buffer that never had a read issued on it, and hang forever. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - Refactor dbuf_dirty() and its consumers: - Modify/extend dbuf_dirty(): - Accept three new arguments: offset, size, how. "how" means to declare to dbuf_dirty() how the consumer intends to dirty. Valid values are: - DB_FILL: Caller intends to fill some or all of the buffer. - DB_NOFILL: Caller wants a NOFILL buffer. - DB_UNCACHED: Caller simply wants the buffer dirtied; no state changes are done in this case. This is primarily used for non level 0 blocks. - Perform state transitions if how is DB_FILL or DB_NOFILL, depending on the current state of the buffer. - Consolidate the async read issuance to this function from dbuf_assign_arcbuf() and dbuf_will_dirty_range(). Issue the async read if the dbuf is still partial and we are creating a new dirty record. Issue the async read if the dbuf is newly dirtied and the writer is unlikely to fill it. - Refactor out a few more pieces to dbuf_dirty_parent(). - Reorganize the dirty record discovery so that when it's time to add the dirty range, either we're using an existing record or have created a new one. Reduce the function to one exit. - Move the transition to FILL from PARTIAL to here. 
	- Change dbuf_will_dirty(), dbuf_will_dirty_range(),
	  dmu_buf_will_fill(), dmu_buf_will_not_fill() to use dbuf_dirty()
	  to do most work.  These functions are now merely frontend
	  interfaces.
	- Remove dbuf_noread().  Its functionality has been obsoleted by
	  the extension of dbuf_dirty().
	- Simplify dbuf_assign_arcbuf().  It no longer needs a special
	  case for buffers with reads outstanding and can rely on
	  dbuf_dirty() to do the right thing.  Combine that handling with
	  the CACHED-only-one-writer case.
	- Change dbuf_dirty_record_add_range() so that it optimizes the
	  case where we're dirtying the whole buffer.  In that case,
	  simply clear the range list and return.  This is now much more
	  likely to occur because of the dbuf_dirty() interface change.

	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:
	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c:
	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h:
	- Sync with the new dbuf_dirty() interface.  For level 0 calls,
	  pass in DB_FILL or DB_NOFILL as appropriate.  For other calls,
	  pass in DB_UNCACHED instead.

	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:
	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c:
	sys/modules/zfs/Makefile:
	- Remove the temporary ZFS_DIRTY_SYNC_READS ifdef'd code.  We're
	  far enough along this path now that we don't need it anymore.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#49 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#6 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#5 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#15 edit
... //depot/branches/redline/projects/cow/sys/modules/zfs/Makefile#5 edit

Differences ...

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#49 (text) ====

@@ -699,7 +699,7 @@
 		ASSERT(db->db_buf != buf);
 		ASSERT(db->db_state == DB_CACHED ||
 		       db->db_state == DB_UNCACHED ||
-		       db->db_state == DB_READ);
+		       (db->db_state & DB_READ));

 		/*
 		 * Fill any holes in the dbuf's dirty records
@@ -707,7 +707,7 @@
 		 */
 		dbuf_resolve_ranges(db, buf);

-		if (db->db_state == DB_READ) {
+		if (db->db_state & DB_READ) {
 			/*
 			 * The most recent version of this block
 			 * was waiting on this read.  Transition
@@ -758,8 +758,6 @@
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

 	mutex_enter(&db->db_mtx);
-	while (db->db_state & DB_FILL)
-		cv_wait(&db->db_changed, &db->db_mtx);

 	dprintf_dbuf(db, "%s: zio=%p arc=%p\n", __func__, zio, buf);
@@ -892,7 +890,7 @@
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_PARTIAL);
+	ASSERT(db->db_state == DB_UNCACHED || (db->db_state & DB_PARTIAL));
 	/*
 	 * 1. Read without any writes (db_buf == NULL)
 	 * 2. Have dirty records (!list_is_empty(&db->db_dirty_records)
@@ -962,9 +960,6 @@
 	    DBUF_IS_CACHEABLE(db);

 	mutex_enter(&db->db_mtx);
-	while (db->db_state & DB_FILL)
-		cv_wait(&db->db_changed, &db->db_mtx);
-
 	if (db->db_state == DB_CACHED) {
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
@@ -1031,54 +1026,6 @@
 	return (err);
 }

-static void
-dbuf_noread(dmu_buf_impl_t *db)
-{
-	ASSERT(!refcount_is_zero(&db->db_holds));
-	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
-	mutex_enter(&db->db_mtx);
-	/*
-	 * XXX
-	 * This function is called when a caller wants to fill the
-	 * entire buffer.
- * - * If the steady state is PARTIAL, then we must issue a READ - * immediately for this buffer, to resolve writes that may be in a - * previous transaction group. - * - * In the future, this can be optimized by not issuing the READ, if - * this request happens to be coming in on the same transaction - * group. The fill that's about to occur can then obsolete the - * previously issued writes. - * - * The write ranges, however, are needed in order for the read_done - * callback to realize that its buffer is no longer needed, so the - * fill should still create a single write range covering the buffer. - */ - - /* Wait for another filler to finish. */ - while (db->db_state & DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - - if (db->db_state == DB_UNCACHED) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa; - - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - DB_GET_SPA(&spa, db); - dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); - db->db_state = DB_FILL; - } else if (db->db_state == DB_NOFILL) { - dbuf_set_data(db, NULL); - } else if (db->db_state & (DB_READ|DB_PARTIAL)) { - DBUF_STATE_CHANGE(db, |=, DB_FILL, "notifying of a fill"); - } else { - ASSERT(db->db_state & (DB_PARTIAL|DB_READ|DB_CACHED)); - } - mutex_exit(&db->db_mtx); -} - /** * \brief This is our just-in-time copy function. * @@ -1613,15 +1560,6 @@ dn->dn_objset->os_dsl_dataset == NULL); DNODE_VERIFY_DIRTYCTX(dn, tx); - - /* - * XXX make this true for indirects too? The problem is that - * transactions created with dmu_tx_create_assigned() from - * syncing context don't bother holding ahead. - */ - ASSERT(db->db_level != 0 || - db->db_state == DB_CACHED || db->db_state == DB_FILL || - db->db_state == DB_NOFILL || db->db_state & (DB_PARTIAL|DB_READ)); } /** @@ -1646,6 +1584,16 @@ ASSERT(DB_DNODE_HELD(db)); dn = DB_DNODE(db); + if (db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + dnode_setdirty(dn, tx); + return; + } + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; @@ -1675,7 +1623,7 @@ rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level+1, ==, parent->db_level); - di = dbuf_dirty(parent, tx); + di = dbuf_dirty(parent, tx, 0, parent->db.db_size, DB_UNCACHED); if (parent_held) dbuf_rele(parent, FTAG); @@ -1711,40 +1659,190 @@ if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } + + dnode_setdirty(dn, tx); } +static void +dbuf_dirty_record_check_ranges(dbuf_dirty_record_t *dr) +{ +#ifdef ZFS_DEBUG + dbuf_dirty_leaf_t *dl; + dbuf_dirty_range_t *prev, *cur, *next; + + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + return; + + dl = &dr->dt.dl; + + prev = next = NULL; + for (cur = list_head(&dl->write_ranges); cur != NULL; + prev = cur, cur = next) { + next = list_next(&dl->write_ranges, cur); + ASSERT(prev == NULL || cur->start > prev->end); + ASSERT(next == NULL || cur->end < next->start); + } +#endif +} + +/** + * \brief Record a write range for the associated dirty record. + * + * \param dr The dirty record to record the write range for. + * \param offset The offset of the new write range. + * \param size The size of the new write range. 
+ */ +static void +dbuf_dirty_record_add_range(dbuf_dirty_record_t *dr, int offset, int size) +{ + dbuf_dirty_range_t *next_range, *old_range, *range; + dbuf_dirty_leaf_t *dl; + dmu_buf_impl_t *db; + + dl = &dr->dt.dl; + db = dr->dr_dbuf; + + /* Write ranges do not apply to indirect blocks. */ + ASSERT(db->db_level == 0); + ASSERT(MUTEX_HELD(&db->db_mtx)); + + /* Optimization: clear the ranges if the incoming range fills. */ + if (offset == 0 && size == db->db.db_size) { + dbuf_dirty_record_cleanup_ranges(dr); + return; + } + + range = kmem_zalloc(sizeof(dbuf_dirty_range_t), KM_SLEEP); + range->start = offset; + range->size = size; + range->end = offset + size; + + /* + * This loop acts as an accumulator, merging dirty ranges if they + * overlap or are adjacent, and in so doing leaving behind only one + * range. But if the new range must be inserted separately, it will + * do so using the old range as a marker. + */ + for (old_range = list_head(&dl->write_ranges); + old_range != NULL && old_range->start <= range->end; + old_range = next_range) { + next_range = list_next(&dl->write_ranges, old_range); + if (range->start <= old_range->end && + range->end >= old_range->start) { + old_range->start = MIN(range->start, old_range->start); + old_range->end = MAX(range->end, old_range->end); + old_range->size = old_range->end - old_range->start; + list_remove(&dl->write_ranges, old_range); + atomic_subtract_64(&dirty_ranges_in_flight, 1); + kmem_free(range, sizeof(dbuf_dirty_range_t)); + range = old_range; + } + } + + /* If the writer will finish filling, go directly to DB_FILL. */ + if (range->start == 0 && range->size == db->db.db_size) { + kmem_free(range, sizeof(dbuf_dirty_range_t)); + } else { + /* If old_range is NULL, this does a list_insert_tail(). */ + list_insert_before(&dl->write_ranges, old_range, range); + atomic_add_64(&dirty_ranges_in_flight, 1); + atomic_add_64(&dirty_ranges_total, 1); + } + + dbuf_dirty_record_check_ranges(dr); +} + /** \brief Mark a dbuf as dirty. */ dbuf_dirty_record_t * -dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size, int how) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t *dr; boolean_t do_free_accounting = B_FALSE; + boolean_t already_dirty = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; + ASSERT(how == DB_FILL || how == DB_NOFILL || how == DB_UNCACHED); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); mutex_enter(&db->db_mtx); + /* Only one filler allowed at a time. */ + if (db->db_level == 0) + while (db->db_state & DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + dbuf_dirty_verify(db, tx); + + dr = list_head(&db->db_dirty_records); + ASSERT(dr == NULL || dr->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + + if (db->db_state & DB_PARTIAL) { + if (dr != NULL) { + if (dr->dr_txg != tx->tx_txg) { + /* Resolve split brain. */ + dbuf_transition_to_read(db); + } + } else if (offset != 0 && (offset + size) != db->db.db_size) { + /* + * Immediately issue a read if we start writing + * inside the block rather than either at the + * beginning (forward) or end (backward). Future + * writes are unlikely to fill. 
+ */ + dbuf_transition_to_read(db); + } + } + dnode_set_dirtyctx(dn, tx); + /* Transition to the appropriate state if needed */ + if (how == DB_NOFILL) { + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_state & (DB_UNCACHED|DB_NOFILL|DB_CACHED)); + dbuf_set_data(db, NULL); + DBUF_STATE_CHANGE(db, =, DB_NOFILL, "allocating NOFILL buffer"); + } else if (how == DB_FILL) { + if (db->db_state == DB_UNCACHED) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + DB_GET_SPA(&spa, db); + dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, + db, type)); + if (size != db->db.db_size) + DBUF_STATE_CHANGE(db, =, (DB_PARTIAL|DB_FILL), + "notifying of an initial partial fill"); + else + DBUF_STATE_CHANGE(db, =, DB_FILL, + "notifying of a complete fill"); + } else if (db->db_state & (DB_READ|DB_PARTIAL)) { + DBUF_STATE_CHANGE(db, |=, DB_FILL, + "notifying of a followup partial fill"); + } else { + /* No wait on FILL is done for indirect blocks. */ + ASSERT(db->db_state == DB_CACHED || + (db->db_level != 0 && db->db_state == DB_FILL)); + } + } + if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. - */ - dr = list_head(&db->db_dirty_records); - ASSERT(dr == NULL || dr->dr_txg <= tx->tx_txg || - db->db.db_object == DMU_META_DNODE_OBJECT); - /* + * * Find the newest dirty record that is not newer than the * transaction's group. If there isn't one, dr == NULL. If it is * older, it will be ignored. */ + dr = list_head(&db->db_dirty_records); while (dr != NULL && dr->dr_txg > tx->tx_txg) dr = list_next(&db->db_dirty_records, dr); if (dr && dr->dr_txg == tx->tx_txg) { @@ -1752,8 +1850,6 @@ * This transaction happens to be occurring in the same * transaction group as the dirty record found above. */ - DB_DNODE_EXIT(db); - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* Reset immediate write sync state if needed */ dbuf_unoverride(dr); @@ -1766,85 +1862,87 @@ arc_buf_thaw(db->db_buf); } } - mutex_exit(&db->db_mtx); - return (dr); - } + already_dirty = B_TRUE; + } else { + + /* + * Only valid if not already dirty. + */ + DNODE_VERIFY_DIRTYCTX(dn, tx); - /* - * Only valid if not already dirty. - */ - DNODE_VERIFY_DIRTYCTX(dn, tx); + ASSERT3U(dn->dn_nlevels, >, db->db_level); + ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || + dn->dn_phys->dn_nlevels > db->db_level || + dn->dn_next_nlevels[txgoff] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); - ASSERT3U(dn->dn_nlevels, >, db->db_level); - ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || - dn->dn_phys->dn_nlevels > db->db_level || - dn->dn_next_nlevels[txgoff] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); + /* + * We should only be dirtying in syncing context if it's the + * mos or we're initializing the os or it's a special object. + * However, we are allowed to dirty in syncing context provided + * we already dirtied it in open context. Hence we must make + * this assertion only if we're not already dirty. 
+ */ + os = dn->dn_objset; + ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); + ASSERT(db->db.db_size != 0); - /* - * We should only be dirtying in syncing context if it's the - * mos or we're initializing the os or it's a special object. - * However, we are allowed to dirty in syncing context provided - * we already dirtied it in open context. Hence we must make - * this assertion only if we're not already dirty. - */ - os = dn->dn_objset; - ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || - os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); - ASSERT(db->db.db_size != 0); + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + if (db->db_blkid != DMU_BONUS_BLKID) { + /* + * Update the accounting. + * Note: we delay "free accounting" until after we drop + * the db_mtx. This keeps us from grabbing other locks + * (and possibly deadlocking) in bp_get_dsize() while + * also holding the db_mtx. + * + * XXX Shouldn't this conditional check for SPILL blkid too? + */ + dnode_willuse_space(dn, db->db.db_size, tx); + do_free_accounting = dbuf_block_freeable(db); + } - if (db->db_blkid != DMU_BONUS_BLKID) { /* - * Update the accounting. - * Note: we delay "free accounting" until after we drop - * the db_mtx. This keeps us from grabbing other locks - * (and possibly deadlocking) in bp_get_dsize() while - * also holding the db_mtx. + * If this buffer is dirty in an old transaction group we need + * to make a copy of it so that the changes we make in this + * transaction group won't leak out when we sync the older txg. */ - dnode_willuse_space(dn, db->db.db_size, tx); - do_free_accounting = dbuf_block_freeable(db); + dr = dbuf_dirty_record_create(db, tx); } - /* - * If this buffer is dirty in an old transaction group we need - * to make a copy of it so that the changes we make in this - * transaction group won't leak out when we sync the older txg. - */ - dr = dbuf_dirty_record_create(db, tx); + /* Add the dirty range and do some related bookkeeping. */ + if (db->db_state != DB_CACHED && db->db_level == 0) { + dbuf_dirty_record_add_range(dr, offset, size); + if ((db->db_state & DB_FILL) && + list_is_empty(&dr->dt.dl.write_ranges)) + DBUF_STATE_CHANGE(db, =, DB_FILL, "writer fully filled"); + } mutex_exit(&db->db_mtx); - if (db->db_blkid == DMU_BONUS_BLKID || - db->db_blkid == DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&dn->dn_dirty_records[txgoff], dr); - mutex_exit(&dn->dn_mtx); - dnode_setdirty(dn, tx); - DB_DNODE_EXIT(db); - return (dr); - } else if (do_free_accounting) { - blkptr_t *bp = db->db_blkptr; - int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? - bp_get_dsize(os->os_spa, bp) : db->db.db_size; - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - ddt_prefetch(os->os_spa, bp); - dnode_willuse_space(dn, -willfree, tx); + if (!already_dirty) { + if (do_free_accounting && db->db_blkid != DMU_SPILL_BLKID) { + blkptr_t *bp = db->db_blkptr; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 
+ bp_get_dsize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. + */ + ddt_prefetch(os->os_spa, bp); + dnode_willuse_space(dn, -willfree, tx); + } + + dbuf_dirty_parent(db, tx, dr); } - dbuf_dirty_parent(db, tx, dr); - - dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } @@ -2037,91 +2135,8 @@ rf |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); - (void) dbuf_dirty(db, tx); -} - -static void -dbuf_dirty_record_check_ranges(dbuf_dirty_record_t *dr) -{ -#ifdef ZFS_DEBUG - dbuf_dirty_leaf_t *dl; - dbuf_dirty_range_t *prev, *cur, *next; - - if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) - return; - - dl = &dr->dt.dl; - - prev = next = NULL; - for (cur = list_head(&dl->write_ranges); cur != NULL; - prev = cur, cur = next) { - next = list_next(&dl->write_ranges, cur); - ASSERT(prev == NULL || cur->start > prev->end); - ASSERT(next == NULL || cur->end < next->start); - } -#endif -} - -/** - * \brief Record a write range for the associated dirty record. - * - * \param dr The dirty record to record the write range for. - * \param offset The offset of the new write range. - * \param size The size of the new write range. - */ -void -dbuf_dirty_record_add_range(dbuf_dirty_record_t *dr, int offset, int size) -{ - dbuf_dirty_range_t *next_range, *old_range, *range; - dbuf_dirty_leaf_t *dl; - dmu_buf_impl_t *db; - - dl = &dr->dt.dl; - db = dr->dr_dbuf; - - /* Write ranges do not apply to indirect blocks. */ - ASSERT(db->db_level == 0); - ASSERT(MUTEX_HELD(&db->db_mtx)); - - range = kmem_zalloc(sizeof(dbuf_dirty_range_t), KM_SLEEP); - range->start = offset; - range->size = size; - range->end = offset + size; - - /* - * This loop acts as an accumulator, merging dirty ranges if they - * overlap or are adjacent, and in so doing leaving behind only one - * range. But if the new range must be inserted separately, it will - * do so using the old range as a marker. - */ - for (old_range = list_head(&dl->write_ranges); - old_range != NULL && old_range->start <= range->end; - old_range = next_range) { - next_range = list_next(&dl->write_ranges, old_range); - if (range->start <= old_range->end && - range->end >= old_range->start) { - old_range->start = MIN(range->start, old_range->start); - old_range->end = MAX(range->end, old_range->end); - old_range->size = old_range->end - old_range->start; - list_remove(&dl->write_ranges, old_range); - atomic_subtract_64(&dirty_ranges_in_flight, 1); - kmem_free(range, sizeof(dbuf_dirty_range_t)); - range = old_range; - } - } - - /* If the writer will finish filling, go directly to DB_FILL. */ - if (range->start == 0 && range->size == db->db.db_size) { - DBUF_STATE_CHANGE(db, =, DB_FILL, "writer fully filled"); - kmem_free(range, sizeof(dbuf_dirty_range_t)); - } else { - /* If old_range is NULL, this does a list_insert_tail(). 
*/ - list_insert_before(&dl->write_ranges, old_range, range); - atomic_add_64(&dirty_ranges_in_flight, 1); - atomic_add_64(&dirty_ranges_total, 1); - } - - dbuf_dirty_record_check_ranges(dr); + /* Already CACHED or UNCACHED at this point */ + (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL); } /** @@ -2179,68 +2194,24 @@ dbuf_will_dirty_range(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size) { dbuf_dirty_record_t *dr; - dnode_t *dn; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_level == 0); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); +#ifdef ZFS_DEBUG + { + dnode_t *dn; - mutex_enter(&db->db_mtx); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - /* This can't handle special blocks or non level 0 blocks yet */ - if (DMU_OBJECT_IS_SPECIAL(dn->dn_object) || - db->db_blkid == DMU_SPILL_BLKID || - db->db_blkid == DMU_BONUS_BLKID || - db->db_level != 0) { + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); DB_DNODE_EXIT(db); - mutex_exit(&db->db_mtx); - dbuf_will_dirty(db, tx); - return; } - DB_DNODE_EXIT(db); - - /* - * Only issue a read if we start writing inside the block rather - * than either at the beginning (forward) or end (backward) - */ - if (db->db_state == DB_UNCACHED && offset != 0 && - (offset + size != db->db.db_size)) - dbuf_transition_to_read(db); - -#ifdef ZFS_DIRTY_SYNC_READS - /* XXX TEMP Wait for the async read to complete */ - while (db->db_state == DB_READ) - cv_wait(&db->db_changed, &db->db_mtx); #endif - /* State transitions not done above, needed for dbuf_dirty */ - /* Transition to PARTIAL if we didn't transition to READ */ - if (db->db_state == DB_UNCACHED) - DBUF_STATE_CHANGE(db, =, DB_PARTIAL, - "new buffer about to be written without async read"); - - /* Only one filler allowed at a time. */ - while (db->db_state & DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - - /* - * Now set the FILL bit, since the caller is about to write, but - * only if the dbuf isn't already CACHED. - */ - if (db->db_state & (DB_PARTIAL|DB_READ)) - DBUF_STATE_CHANGE(db, |=, DB_FILL, - "new writer about to modify non-CACHED buffer"); - mutex_exit(&db->db_mtx); - - dr = dbuf_dirty(db, tx); - ASSERT(dr != NULL); - - /* Add the write range to this dbuf. */ - mutex_enter(&db->db_mtx); - if (db->db_state != DB_CACHED) - dbuf_dirty_record_add_range(dr, offset, size); - mutex_exit(&db->db_mtx); + dbuf_dirty(db, tx, offset, size, DB_FILL); } void @@ -2248,16 +2219,13 @@ { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - DBUF_STATE_CHANGE(db, =, DB_NOFILL, "allocating NOFILL buffer"); - - dmu_buf_will_fill(db_fake, tx); + dbuf_dirty(db, tx, 0, db->db.db_size, DB_NOFILL); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dbuf_dirty_record_t *dr; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); @@ -2267,18 +2235,11 @@ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - dbuf_noread(db); - dr = dbuf_dirty(db, tx); + /* Wait for another filler to finish. */ + while (db->db_state & DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); - /* - * Mark the whole buffer as being written to by clearing - * any partial ranges in the dirty record from partial - * writes to this block that occurred earlier in this - * transaction. 
- */ - mutex_enter(&db->db_mtx); - dbuf_dirty_record_cleanup_ranges(dr); - mutex_exit(&db->db_mtx); + dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL); } #pragma weak dmu_buf_fill_done = dbuf_fill_done @@ -2347,42 +2308,11 @@ mutex_enter(&db->db_mtx); - if (db->db_state & DB_PARTIAL) - dbuf_transition_to_read(db); - while (db->db_state & DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); - /* - * The buffer is waiting for a read, so simply update the associated - * dirty record, using the buffer provided. - */ - if (db->db_state == DB_READ) { - for (dr = list_head(&db->db_dirty_records); - dr != NULL && dr->dr_txg != tx->tx_txg; - dr = list_next(&db->db_dirty_records, dr)) - ; - if (dr == NULL) - /* No dirty record for this transaction yet */ - /* XXX Avoid superfluous arc buffer allocation? */ - dr = dbuf_dirty_record_create(db, tx); - else { - /* Remove the old dirty data for this transaction */ - dbuf_dirty_record_cleanup_ranges(dr); - } - ASSERT(dr->dt.dl.dr_data != NULL); - ASSERT(arc_released(dr->dt.dl.dr_data)); - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); + ASSERT(db->db_state & (DB_CACHED|DB_UNCACHED|DB_PARTIAL|DB_READ)); - /* Assign the ARC buffer to the dirty record. */ - dr->dt.dl.dr_data = buf; - dbuf_set_data(db, buf); - mutex_exit(&db->db_mtx); - return; - } - - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); - /* If the dbuf is cached and the number of holds exceeds the number * of dirty calls on it, then dirty it again and remove the buffer * reference, before copying the ARC buffer to the dbuf. @@ -2390,7 +2320,7 @@ if (db->db_state == DB_CACHED && refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); + (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL); bcopy(buf->b_data, db->db.db_data, db->db.db_size); VERIFY(arc_buf_remove_ref(buf, db) == 1); xuio_stat_wbuf_copied(); @@ -2398,10 +2328,11 @@ } xuio_stat_wbuf_nocopy(); - if (db->db_state == DB_CACHED) { + if (db->db_state & (DB_CACHED|DB_READ|DB_PARTIAL)) { dr = list_head(&db->db_dirty_records); - ASSERT(db->db_buf != NULL); + ASSERT((db->db_state == DB_CACHED && db->db_buf != NULL) || + db->db_buf == NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { @@ -2411,7 +2342,9 @@ } dr->dt.dl.dr_data = buf; VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); - } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { + } else if (db->db_buf != NULL && + (dr == NULL || dr->dt.dl.dr_data != db->db_buf)) { + ASSERT(db->db_state == DB_CACHED); arc_release(db->db_buf, db); VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); } @@ -2422,7 +2355,7 @@ dbuf_set_data(db, buf); DBUF_STATE_CHANGE(db, =, DB_FILL, "assigning buffer"); mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); + (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL); /* clear db->db.db_data and tell waiters it's changed ?? 
*/ dbuf_fill_done(db, tx); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#6 (text) ==== @@ -269,28 +269,8 @@ zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); -#ifdef ZFS_DIRTY_SYNC_READS - /* first level-0 block */ - start = off >> dn->dn_datablkshift; - if (P2PHASE(off, dn->dn_datablksz) || - len < dn->dn_datablksz) { - err = dmu_tx_check_ioerr(zio, dn, 0, start); - if (err) - goto out; - } - - /* last level-0 block */ - end = (off+len-1) >> dn->dn_datablkshift; - if (end != start && end <= dn->dn_maxblkid && - P2PHASE(off+len, dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(zio, dn, 0, end); - if (err) - goto out; - } -#else start = off >> dn->dn_datablkshift; end = (off+len-1) >> dn->dn_datablkshift; -#endif /* level-1 blocks */ if (nlvls > 1) { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#5 (text) ==== @@ -1205,6 +1205,7 @@ void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { + dmu_buf_impl_t *db; objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; @@ -1265,7 +1266,8 @@ */ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); - (void) dbuf_dirty(dn->dn_dbuf, tx); + db = dn->dn_dbuf; + (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_UNCACHED); dsl_dataset_dirty(os->os_dsl_dataset, tx); } @@ -1430,7 +1432,7 @@ /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); - new = dbuf_dirty(db, tx); + new = dbuf_dirty(db, tx, 0, db->db.db_size, DB_UNCACHED); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#15 (text) ==== @@ -497,7 +497,8 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); -dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, + int size, int how); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dbuf_clear(dmu_buf_impl_t *db); ==== //depot/branches/redline/projects/cow/sys/modules/zfs/Makefile#5 (text) ==== @@ -92,9 +92,6 @@ CFLAGS+=-DDEBUG=1 DEBUG_FLAGS?=-g .endif -.ifdef WITH_ZFS_DIRTY_SYNC_READS -CFLAGS+=-DZFS_DIRTY_SYNC_READS -.endif .include Change 520520 by willa@willa_repo on 2012/01/12 16:24:35 Merge dirty record handling from dbuf_assign_arcbuf() to dbuf_dirty(). This seems to fix an assert that fired in dbuf_do_evict(), by removing the race between setting the ARC buffer and dirtying. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#50 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#6 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#16 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#50 (text) ==== @@ -1623,7 +1623,8 @@ rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level+1, ==, parent->db_level); - di = dbuf_dirty(parent, tx, 0, parent->db.db_size, DB_UNCACHED); + di = dbuf_dirty(parent, tx, 0, parent->db.db_size, DB_UNCACHED, + NULL); if (parent_held) dbuf_rele(parent, FTAG); @@ -1754,7 +1755,8 @@ /** \brief Mark a dbuf as dirty. 
*/ dbuf_dirty_record_t * -dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size, int how) +dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size, int how, + arc_buf_t *db_buf) { dnode_t *dn; objset_t *os; @@ -1771,9 +1773,10 @@ mutex_enter(&db->db_mtx); /* Only one filler allowed at a time. */ - if (db->db_level == 0) - while (db->db_state & DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); + while (db->db_state & DB_FILL) { + ASSERT(db->db_level = 0); + cv_wait(&db->db_changed, &db->db_mtx); + } dbuf_dirty_verify(db, tx); @@ -1784,7 +1787,12 @@ if (db->db_state & DB_PARTIAL) { if (dr != NULL) { if (dr->dr_txg != tx->tx_txg) { - /* Resolve split brain. */ + /* + * Schedule resolution for this older + * transaction group before we change the + * dbuf's state and lose track of the + * PARTIAL state. + */ dbuf_transition_to_read(db); } } else if (offset != 0 && (offset + size) != db->db.db_size) { @@ -1792,7 +1800,7 @@ * Immediately issue a read if we start writing * inside the block rather than either at the * beginning (forward) or end (backward). Future - * writes are unlikely to fill. + * writes are unlikely to fill this dbuf. */ dbuf_transition_to_read(db); } @@ -1810,12 +1818,17 @@ if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa; + arc_buf_t *fill_buf; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); DB_GET_SPA(&spa, db); - dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, - db, type)); + fill_buf = db_buf; + if (fill_buf == NULL) + fill_buf = arc_buf_alloc(spa, db->db.db_size, + db, type); + dbuf_set_data(db, fill_buf); + if (size != db->db.db_size) DBUF_STATE_CHANGE(db, =, (DB_PARTIAL|DB_FILL), "notifying of an initial partial fill"); else DBUF_STATE_CHANGE(db, =, DB_FILL, "notifying of a complete fill"); @@ -1862,11 +1875,28 @@ arc_buf_thaw(db->db_buf); } } + + /* + * If we are assigning a buffer directly, release the + * old buffer allocated to this transaction group. + */ + if (db_buf != NULL && db_buf != db->db_buf) { + ASSERT(dr->dt.dl.dr_data == db->db_buf); + if (!arc_released(db->db_buf)) { + ASSERT(dr->dt.dl.dr_override_state == + DR_OVERRIDDEN); + arc_release(db->db_buf, db); + } + dr->dt.dl.dr_data = db_buf; + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + db->db_buf = NULL; + dbuf_set_data(db, db_buf); + } already_dirty = B_TRUE; } else { /* - * Only valid if not already dirty. + * Only valid if not already dirty in this transaction group. */ DNODE_VERIFY_DIRTYCTX(dn, tx); @@ -1906,6 +1936,28 @@ } /* + * If assigning a buffer directly, release any buffers + * that will no longer be referenced. + */ + if (db_buf != NULL && dr != NULL) { + ASSERT(db->db_level == 0); + + /* + * Handle the case of the syncer or dbuf_hold() + * preemptively disassociating the dirty record + * from the buffer used in the open transaction + * group. + */ + if (dr->dt.dl.dr_data != db->db_buf) { + ASSERT(db->db_state == DB_CACHED); + arc_release(db->db_buf, db); + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + db->db_buf = NULL; + } + dbuf_set_data(db, db_buf); + } + + /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ @@ -2011,13 +2063,6 @@ ASSERT(dr->dr_dbuf == db); /* - * The buffer was dirtied in a previous transaction group and needs - * to be resolved now. - */ - if (db->db_dirtycnt > 1 && (db->db_state & DB_PARTIAL)) - dbuf_transition_to_read(db); - - /* * XXX Wait for the buffer to be resolved.
With additional accounting * we should be able to undirty immediately and disassociate the * read from this dbuf before it completes. @@ -2136,7 +2181,7 @@ DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); /* Already CACHED or UNCACHED at this point */ - (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL); + (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL, NULL); } /** @@ -2211,7 +2256,7 @@ } #endif - dbuf_dirty(db, tx, offset, size, DB_FILL); + dbuf_dirty(db, tx, offset, size, DB_FILL, NULL); } void @@ -2219,7 +2264,7 @@ { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dbuf_dirty(db, tx, 0, db->db.db_size, DB_NOFILL); + dbuf_dirty(db, tx, 0, db->db.db_size, DB_NOFILL, NULL); } void @@ -2239,7 +2284,7 @@ while (db->db_state & DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); - dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL); + dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL, NULL); } #pragma weak dmu_buf_fill_done = dbuf_fill_done @@ -2308,9 +2353,6 @@ mutex_enter(&db->db_mtx); - while (db->db_state & DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(db->db_state & (DB_CACHED|DB_UNCACHED|DB_PARTIAL|DB_READ)); /* If the dbuf is cached and the number of holds exceeds the number @@ -2320,7 +2362,7 @@ if (db->db_state == DB_CACHED && refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL); + (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL, NULL); bcopy(buf->b_data, db->db.db_data, db->db.db_size); VERIFY(arc_buf_remove_ref(buf, db) == 1); xuio_stat_wbuf_copied(); @@ -2328,35 +2370,8 @@ } xuio_stat_wbuf_nocopy(); - if (db->db_state & (DB_CACHED|DB_READ|DB_PARTIAL)) { - dr = list_head(&db->db_dirty_records); - - ASSERT((db->db_state == DB_CACHED && db->db_buf != NULL) || - db->db_buf == NULL); - if (dr != NULL && dr->dr_txg == tx->tx_txg) { - ASSERT(dr->dt.dl.dr_data == db->db_buf); - if (!arc_released(db->db_buf)) { - ASSERT(dr->dt.dl.dr_override_state == - DR_OVERRIDDEN); - arc_release(db->db_buf, db); - } - dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); - } else if (db->db_buf != NULL && - (dr == NULL || dr->dt.dl.dr_data != db->db_buf)) { - ASSERT(db->db_state == DB_CACHED); - arc_release(db->db_buf, db); - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); - } - db->db_buf = NULL; - } - ASSERT(db->db_buf == NULL); - /* Set db->db_buf = buf */ - dbuf_set_data(db, buf); - DBUF_STATE_CHANGE(db, =, DB_FILL, "assigning buffer"); mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL); - /* clear db->db.db_data and tell waiters it's changed ?? 
*/ + (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL, buf); dbuf_fill_done(db, tx); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#6 (text) ==== @@ -1267,7 +1267,7 @@ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); db = dn->dn_dbuf; - (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_UNCACHED); + (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_UNCACHED, NULL); dsl_dataset_dirty(os->os_dsl_dataset, tx); } @@ -1432,7 +1432,7 @@ /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); - new = dbuf_dirty(db, tx, 0, db->db.db_size, DB_UNCACHED); + new = dbuf_dirty(db, tx, 0, db->db.db_size, DB_UNCACHED, NULL); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#16 (text) ==== @@ -498,7 +498,7 @@ void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, - int size, int how); + int size, int how, arc_buf_t *db_buf); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dbuf_clear(dmu_buf_impl_t *db); Change 520552 by willa@willa_repo on 2012/01/13 09:39:04 Add a global pointer to the syncer's zio. This is ifdef ZFS_DEBUG only, and makes it easier to access it from kgdb when zfs is deadlocked. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c#2 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c#2 (text) ==== @@ -54,6 +54,10 @@ kmutex_t zfs_write_limit_lock; +#ifdef ZFS_DEBUG +zio_t *syncer_zio = NULL; +#endif + static pgcnt_t old_physmem = 0; SYSCTL_DECL(_vfs_zfs); @@ -361,6 +365,9 @@ dsl_dataset_sync(ds, zio, tx); } DTRACE_PROBE(pool_sync__1setup); +#ifdef ZFS_DEBUG + syncer_zio = zio; +#endif err = zio_wait(zio); write_time = gethrtime() - start; @@ -383,6 +390,9 @@ dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, zio, tx); } +#ifdef ZFS_DEBUG + syncer_zio = zio; +#endif err = zio_wait(zio); /* @@ -415,6 +425,9 @@ list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(mos, zio, tx); +#ifdef ZFS_DEBUG + syncer_zio = zio; +#endif err = zio_wait(zio); ASSERT(err == 0); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); Change 520557 by willa@willa_repo on 2012/01/13 11:43:36 Fix the remaining STF test regressions found with asserts turned on. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - Fix dbuf_free_range(): When potentially transitioning to UNCACHED, issue a READ if the buffer was in PARTIAL. This is another split brain transition that is not caught by dbuf_dirty(), nor is it appropriate to be handled there. - Move the arc_buf_freeze() call from dbuf_merge_write_ranges() to dbuf_resolve_ranges() (its caller), and make the call conditional: freeze unless the dirty record's buffer is the currently visible one and the FILL bit is set. This is needed because if a filler is currently operating on another dirty record, it will be responsible for clearing the FILL bit and freezing that buffer. - Fix an assert that was an assignment instead of a conditional. Affected files ... ...
//depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#51 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#51 (text) ==== @@ -647,9 +647,6 @@ while ((hole = dbuf_dirty_record_hole_itr_next(&itr)) != NULL) memcpy(hole->dst, hole->src, hole->size); - - /* Now that we have updated the buffer, freeze it */ - arc_buf_freeze(dl->dr_data); } /** @@ -685,6 +682,14 @@ dl = &dr->dt.dl; ASSERT(dl->dr_data); dbuf_merge_write_ranges(dl, old_buf); + /* + * Now that we have updated the buffer, freeze it. However, + * if the FILL bit is set, someone else is actively + * modifying the current buffer, and will be responsible for + * freezing that buffer. + */ + if (dl->dr_data != db->db_buf || !(db->db_state & DB_FILL)) + arc_buf_freeze(dl->dr_data); dbuf_dirty_record_cleanup_ranges(dr); old_buf = dl->dr_data; dr = list_prev(&db->db_dirty_records, dr); @@ -838,6 +843,7 @@ * \param flags Dbuf read flags pointer. * * \invariant The dbuf's mutex must be held. + * \note If any action was taken, this function drops db_mtx. * * \returns whether any action was taken. */ @@ -1257,6 +1263,15 @@ * the open context) or reset its contents to * empty. */ + if (db->db_state & DB_PARTIAL) { + /* + * Schedule resolution for the older + * transaction group's dirty record + * before we change the dbuf's state + * and lose track of the PARTIAL state. + */ + dbuf_transition_to_read(db); + } dbuf_fix_old_data(db, txg); } } @@ -1774,7 +1789,7 @@ /* Only one filler allowed at a time. */ while (db->db_state & DB_FILL) { - ASSERT(db->db_level = 0); + ASSERT(db->db_level == 0); cv_wait(&db->db_changed, &db->db_mtx); } Change 521462 by willa@willa_repo on 2012/01/17 11:45:31 While freshly dirtying a dbuf, if its buffer is already in ARC, use it. The intent is to avoid churning the ARC of a buffer that we could end up needing anyway. The pre-COW ZFS would have had the same effect. While executing the 'replacement' STF test, this new code path resulted in over 350,000 cache hits. The 'online_offline' test added another 100,000. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c: - Add arc_buf_find_bp(): Find an ARC buffer for the given SPA and block pointer pair. If one exists, reference it and pass it back to the caller. This assumes that the caller won't make the request if there is any possibility that someone else is using it. Since dbufs use mutually exclusive ARC buffers, and since this will only get called for uncached dbufs, this should be appropriate. sys/cddl/contrib/opensolaris/uts/common/fs/dbuf.c: - Modify dbuf_read_impl() to accept a new flag, DB_RF_CACHED_ONLY, which does not drop the dbuf mutex, and after checking for hole or bonus buffers, performs strictly an ARC cache lookup using the new arc_buf_find_bp(), doing nothing if no cache hit occurs. - Add dbuf_read_cached(), which wraps dbuf_read_impl() using the above changes to perform a cache-check-only call. It's a simplified wrapper similar to dbuf_read() that acquires/drops dn_struct_rwlock, if needed to avoid block pointer changes from under us. - Modify dbuf_dirty(): When called on an uncached dbuf, call dbuf_read_cached() to see if we can avoid extra work creating, populating, merging another buffer etc. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#6 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#52 edit ... 
//depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h#4 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#17 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#6 (text) ==== @@ -3035,6 +3035,50 @@ arc_hdr_destroy(hdr); } +/** + * \brief Find an ARC buffer using its block pointer. + * + * \param spa The SPA associated with the buffer. + * \param bp The block pointer associated with the buffer. + * \param priv The private data associated with the buffer. + * + * \note Calling this function will place a reference on any found buffer. + * + * XXX This should be folded into arc_read_nolock somehow + */ +arc_buf_t * +arc_buf_find_bp(spa_t *spa, blkptr_t *bp, void *private) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + uint64_t guid = spa_guid(spa); + arc_buf_t *buf = NULL; + + hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); + if (hdr != NULL) { + if (hdr->b_datacnt > 0) { + add_reference(hdr, hash_lock, private); +#if 0 + if (HDR_BUF_AVAILABLE(hdr)) { + ASSERT(buf->b_efunc == NULL); + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + buf = hdr->b_buf; + } else { + buf = arc_buf_clone(buf); + } +#else + ASSERT(HDR_BUF_AVAILABLE(hdr)); + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + buf = hdr->b_buf; +#endif + arc_access(hdr, hash_lock); + } + mutex_exit(hash_lock); + } + return buf; +} + /** \brief "Read" the block block at the specified DVA (in bp) via the * cache. * ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#52 (text) ==== @@ -88,7 +88,8 @@ SYSCTL_COUNTER_I(dirty_ranges_in_flight, "number of dirty ranges in flight"); SYSCTL_COUNTER_U(dirty_ranges_total, "number of total dirty ranges"); -SYSCTL_COUNTER_U(dirty_writes_lost, "dirty writes list"); +SYSCTL_COUNTER_U(dirty_writes_lost, "dirty writes lost"); +SYSCTL_COUNTER_U(dirty_buffers_already_cached, "dirty buffers already cached"); static uint64_t dbuf_hash_count; @@ -812,7 +813,7 @@ * \returns whether any action was taken. */ static boolean_t -dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn) +dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t *flags) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); @@ -831,7 +832,8 @@ DB_DNODE_EXIT(db); dbuf_update_data(db); DBUF_STATE_CHANGE(db, =, DB_CACHED, "bonus buffer filled"); - mutex_exit(&db->db_mtx); + if (!(*flags & DB_RF_CACHED_ONLY)) + mutex_exit(&db->db_mtx); return (TRUE); } @@ -875,12 +877,27 @@ DBUF_STATE_CHANGE(db, =, DB_READ, "hole read satisfied"); *flags |= DB_RF_CACHED; dbuf_read_complete(db, buf); - mutex_exit(&db->db_mtx); + if (!(*flags & DB_RF_CACHED_ONLY)) + mutex_exit(&db->db_mtx); return (TRUE); } return (FALSE); } +/** + * \brief Actually read (or issue I/O for) a dbuf's block. + * + * \param db The dbuf to read. + * \param zio The parent zio to associate with. + * \param flags Pointer to the read flags. + * + * \note Flags will be modified to include DB_RF_CACHED if the call + * returns with the dbuf cached. + * \note The dbuf mutex will be dropped in all cases except if the + * DB_RF_CACHED flag is set. + * \note The DB_RF_CACHED flag has the effect of performing a + * cached-only read. 
+ */ static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { @@ -904,13 +921,28 @@ */ //ASSERT(db->db_buf == NULL); - if (dbuf_read_bonus(db, dn)) + if (dbuf_read_bonus(db, dn, flags)) return; if (dbuf_read_on_hole(db, dn, flags)) return; spa = dn->dn_objset->os_spa; + + /* Check to see if a caller only wants cached buffers. */ + if (*flags & DB_RF_CACHED_ONLY) { + /* XXX this code path doesn't drop the lock */ + arc_buf_t *buf = arc_buf_find_bp(spa, db->db_blkptr, db); + if (buf != NULL) { + db->db_state = DB_READ; /* for read_complete */ + dbuf_read_complete(db, buf); + *flags |= DB_RF_CACHED; + /*mutex_exit(&db->db_mtx);*/ + } + DB_DNODE_EXIT(db); + return; + } + DB_DNODE_EXIT(db); DBUF_STATE_CHANGE(db, =, DB_READ, "read issued"); @@ -939,6 +971,37 @@ *flags |= DB_RF_CACHED; } +/** + * \brief Find a dbuf's block in the ARC, if it's there. + * + * \param db Dbuf to find the block for. + * \param dn Dnode for the dbuf. + * + * \note Calling this function is equivalent to calling dbuf_read, + * but only if the block is already in the cache. + * \note This function only applies to level 0 blocks. + * + * \returns whether it was there. + */ +static boolean_t +dbuf_read_cached(dmu_buf_impl_t *db, dnode_t *dn) +{ + int rflags = DB_RF_CACHED_ONLY; + boolean_t held = RW_WRITE_HELD(&dn->dn_struct_rwlock); + + ASSERT(DB_DNODE_HELD(db)); + + /* Prevent the block pointer from being changed from under us. */ + if (!held) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + dbuf_read_impl(db, NULL, &rflags); + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (!held) + rw_exit(&dn->dn_struct_rwlock); + + return (db->db_state == DB_CACHED); +} + int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { @@ -1832,6 +1895,7 @@ } else if (how == DB_FILL) { if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + boolean_t cached = B_FALSE; spa_t *spa; arc_buf_t *fill_buf; @@ -1839,17 +1903,24 @@ ASSERT(db->db.db_data == NULL); DB_GET_SPA(&spa, db); fill_buf = db_buf; + /* See if our buffer is in ARC from a previous use. 
*/ if (fill_buf == NULL) - fill_buf = arc_buf_alloc(spa, db->db.db_size, - db, type); - dbuf_set_data(db, fill_buf); + cached = dbuf_read_cached(db, dn); - if (size != db->db.db_size) - DBUF_STATE_CHANGE(db, =, (DB_PARTIAL|DB_FILL), - "notifying of an initial partial fill"); - else - DBUF_STATE_CHANGE(db, =, DB_FILL, - "notifying of a complete fill"); + if (!cached) { + if (fill_buf == NULL) + fill_buf = arc_buf_alloc(spa, + db->db.db_size, db, type); + dbuf_set_data(db, fill_buf); + if (size != db->db.db_size) + DBUF_STATE_CHANGE(db, =, + (DB_PARTIAL|DB_FILL), "notifying " + "of an initial partial fill"); + else + DBUF_STATE_CHANGE(db, =, DB_FILL, + "notifying of a complete fill"); + } else + atomic_add_64(&dirty_buffers_already_cached, 1); } else if (db->db_state & (DB_READ|DB_PARTIAL)) { DBUF_STATE_CHANGE(db, |=, DB_FILL, "notifying of a followup partial fill"); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h#4 (text) ==== @@ -92,6 +92,7 @@ int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); +arc_buf_t *arc_buf_find_bp(spa_t *spa, blkptr_t *bp, void *priv); int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, zbookmark_t *zb); int arc_released(arc_buf_t *buf); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#17 (text) ==== @@ -51,6 +51,7 @@ #define DB_RF_NOPREFETCH (1 << 3) #define DB_RF_NEVERWAIT (1 << 4) #define DB_RF_CACHED (1 << 5) +#define DB_RF_CACHED_ONLY (1 << 6) /** \} */ /** Change 521469 by willa@willa_repo on 2012/01/17 13:22:20 Fix the scrub_mirror ZFS tests. It seems someone changed the output of "zpool status" to say 'scrub repaired' instead of 'scrub completed'. Affected files ... ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/scrub_mirror/scrub_mirror_common.kshlib#2 edit Differences ... ==== //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/scrub_mirror/scrub_mirror_common.kshlib#2 (text) ==== @@ -61,7 +61,7 @@ typeset scrubbed="false" while [[ "$scrubbed" != "true" ]]; do $ZPOOL status $POOL | $GREP -s "scrub" \ - | $GREP -i "completed" + | $GREP -i "repaired" if [[ $? -eq 0 ]]; then scrubbed="true" fi Change 521472 by willa@willa_repo on 2012/01/17 13:35:03 arc_buf_find_bp(): Don't assume the buffer is available. It turns out this assumption was false, found by the scrub_mirror STF test. This test caused the assert left in place to test the assumption, to fire. Instead, do what arc_read_nolock() does, which is to return a cloned buffer of the original. More than likely, this is better than allocating a new buffer with no data. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#7 edit Differences ... 
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#7 (text) ==== @@ -3059,19 +3059,13 @@ if (hdr != NULL) { if (hdr->b_datacnt > 0) { add_reference(hdr, hash_lock, private); -#if 0 if (HDR_BUF_AVAILABLE(hdr)) { + buf = hdr->b_buf; ASSERT(buf->b_efunc == NULL); hdr->b_flags &= ~ARC_BUF_AVAILABLE; - buf = hdr->b_buf; } else { buf = arc_buf_clone(buf); } -#else - ASSERT(HDR_BUF_AVAILABLE(hdr)); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; - buf = hdr->b_buf; -#endif arc_access(hdr, hash_lock); } mutex_exit(hash_lock); Change 521474 by willa@willa_repo on 2012/01/17 13:37:04 Convert dmu_buf_hold_array_by_dnode() to dbuf_array_t. Convert all consumers of this interface to pass around a dbuf_array_t ** rather than a dmu_buf_t ***. Doing this allows us to extend the concept of an array of buffers being used for a particular I/O so that it can be referenced from the affected dbufs and then associated with callbacks and specific configuration for each callback. Move two function prototypes out of sys/dmu.h to dmu.c, since they were not called anywhere else, and they need dbuf_array_t. Make those functions static. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#14 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#18 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#3 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#14 (text) ==== @@ -44,6 +44,10 @@ #include #endif +static int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, + uint64_t length, int read, void *tag, dbuf_array_t **dbap); +static void dmu_buf_rele_array(dbuf_array_t *dba); + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "unallocated" }, { zap_byteswap, TRUE, "object directory" }, @@ -370,10 +374,11 @@ */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) + int read, void *tag, dbuf_array_t **dbap, uint32_t flags) { dsl_pool_t *dp = NULL; - dmu_buf_t **dbp; + dmu_buf_impl_t **dbp; + dbuf_array_t *dba; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; @@ -404,7 +409,11 @@ } nblks = 1; } - dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); + dba = kmem_zalloc(sizeof(dbuf_array_t), KM_SLEEP); + dba->dba_count = nblks; + dba->dba_tag = tag; + dbp = kmem_zalloc(sizeof (dmu_buf_impl_t *) * nblks, KM_SLEEP); + dba->dba_dbp = dbp; if (dn->dn_objset->os_dsl_dataset) dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; @@ -416,7 +425,7 @@ dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); - dmu_buf_rele_array(dbp, nblks, tag); + dmu_buf_rele_array(dba); zio_nowait(zio); return (EIO); } @@ -427,7 +436,7 @@ else curthread->td_ru.ru_oublock++; #endif - dbp[i] = &db->db; + dbp[i] = db; } rw_exit(&dn->dn_struct_rwlock); @@ -437,14 +446,14 @@ if (dp && dsl_pool_sync_context(dp)) dp->dp_read_overhead += gethrtime() - start; if (err) { - dmu_buf_rele_array(dbp, nblks, tag); + dmu_buf_rele_array(dba); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + dmu_buf_impl_t *db = dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state & (DB_READ|DB_FILL)) cv_wait(&db->db_changed, 
&db->db_mtx); @@ -452,14 +461,13 @@ err = EIO; mutex_exit(&db->db_mtx); if (err) { - dmu_buf_rele_array(dbp, nblks, tag); + dmu_buf_rele_array(dba); return (err); } } } - *numbufsp = nblks; - *dbpp = dbp; + *dbap = dba; return (0); } @@ -469,7 +477,7 @@ */ static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) + uint64_t length, int read, void *tag, dbuf_array_t **dbap) { dnode_t *dn; int err; @@ -479,16 +487,16 @@ return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); + dbap, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } -int +static int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) + uint64_t length, int read, void *tag, dbuf_array_t **dbap) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; @@ -497,7 +505,7 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); + dbap, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); @@ -508,21 +516,19 @@ * hold on the array of buffers MUST be released with dmu_buf_rele_array. You * can NOT release the hold on each buffer individually with dmu_buf_rele. */ -void -dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) +static void +dmu_buf_rele_array(dbuf_array_t *dba) { int i; - dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; - if (numbufs == 0) - return; - - for (i = 0; i < numbufs; i++) { - if (dbp[i]) - dbuf_rele(dbp[i], tag); + for (i = 0; i < dba->dba_count; i++) { + if (dba->dba_dbp[i]) + dbuf_rele(dba->dba_dbp[i], dba->dba_tag); } - - kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); + if (dba->dba_count > 0) + kmem_free(dba->dba_dbp, + sizeof(dmu_buf_impl_t *) * dba->dba_count); + kmem_free(dba, sizeof(dbuf_array_t)); } /** \brief Asynchronously try to read in the data. @@ -750,8 +756,8 @@ void *buf, uint32_t flags) { dnode_t *dn; - dmu_buf_t **dbp; - int numbufs, err; + dbuf_array_t *dba; + int err; err = dnode_hold(os, object, FTAG, &dn); if (err) @@ -778,14 +784,14 @@ * to be reading in parallel. 
*/ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp, flags); + TRUE, FTAG, &dba, flags); if (err) break; - for (i = 0; i < numbufs; i++) { + for (i = 0; i < dba->dba_count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = dbp[i]; + dmu_buf_t *db = &dba->dba_dbp[i]->db; ASSERT(size > 0); @@ -798,7 +804,7 @@ size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dbp, numbufs, FTAG); + dmu_buf_rele_array(dba); } dnode_rele(dn, FTAG); return (err); @@ -808,26 +814,27 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { - dmu_buf_t **dbp; - int numbufs, i; + dbuf_array_t *dba; + int i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + FALSE, FTAG, &dba)); - for (i = 0; i < numbufs; i++) { + for (i = 0; i < dba->dba_count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = dbp[i]; + dmu_buf_t *db = &dba->dba_dbp[i]->db; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + ASSERT(i == 0 || i == dba->dba_count - 1 || + tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); @@ -842,28 +849,28 @@ size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dbp, numbufs, FTAG); + dmu_buf_rele_array(dba); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { - dmu_buf_t **dbp; - int numbufs, i; + dbuf_array_t *dba; + int i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + FALSE, FTAG, &dba)); - for (i = 0; i < numbufs; i++) { - dmu_buf_t *db = dbp[i]; + for (i = 0; i < dba->dba_count; i++) { + dmu_buf_t *db = &dba->dba_dbp[i]->db; dmu_buf_will_not_fill(db, tx); } - dmu_buf_rele_array(dbp, numbufs, FTAG); + dmu_buf_rele_array(dba); } /** @@ -993,8 +1000,8 @@ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { - dmu_buf_t **dbp; - int numbufs, i, err; + dbuf_array_t *dba; + int i, err; xuio_t *xuio = NULL; /* @@ -1002,7 +1009,7 @@ * to be reading in parallel. 
*/ err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, - &numbufs, &dbp); + &dba); if (err) return (err); @@ -1011,10 +1018,10 @@ xuio = (xuio_t *)uio; #endif - for (i = 0; i < numbufs; i++) { + for (i = 0; i < dba->dba_count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = dbp[i]; + dmu_buf_t *db = &dba->dba_dbp[i]->db; ASSERT(size > 0); @@ -1044,7 +1051,7 @@ size -= tocpy; } - dmu_buf_rele_array(dbp, numbufs, FTAG); + dmu_buf_rele_array(dba); return (err); } @@ -1052,27 +1059,26 @@ static int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { - dmu_buf_t **dbp; - int numbufs; + dbuf_array_t *dba; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); + FALSE, FTAG, &dba, DMU_READ_PREFETCH); if (err) return (err); - for (i = 0; i < numbufs; i++) { + for (i = 0; i < dba->dba_count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = dbp[i]; + dmu_buf_t *db = &dba->dba_dbp[i]->db; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + ASSERT(i == 0 || i == dba->dba_count-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); @@ -1096,7 +1102,7 @@ size -= tocpy; } - dmu_buf_rele_array(dbp, numbufs, FTAG); + dmu_buf_rele_array(dba); return (err); } @@ -1145,22 +1151,22 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) { - dmu_buf_t **dbp; - int numbufs, i; + dbuf_array_t *dba; + int i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp); + FALSE, FTAG, &dba); if (err) return (err); - for (i = 0; i < numbufs; i++) { + for (i = 0; i < dba->dba_count; i++) { int tocpy, copied, thiscpy; int bufoff; - dmu_buf_t *db = dbp[i]; + dmu_buf_t *db = &dba->dba_dbp[i]->db; caddr_t va; ASSERT(size > 0); @@ -1169,7 +1175,7 @@ bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + ASSERT(i == 0 || i == dba->dba_count-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); @@ -1191,7 +1197,7 @@ offset += tocpy; size -= tocpy; } - dmu_buf_rele_array(dbp, numbufs, FTAG); + dmu_buf_rele_array(dba); return (err); } #endif /* sun */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#18 (text) ==== @@ -336,6 +336,9 @@ /** The number of dbufs in the array. */ size_t dba_count; + /** The tag used for the array. */ + void *dba_tag; + /** The callback to call if the conditions are met. */ dmu_callback_t dba_cb; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#3 (text) ==== @@ -283,10 +283,6 @@ void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); -int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); -void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); - void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, Change 521500 by willa@willa_repo on 2012/01/17 17:01:07 A few fixes for dbuf_read_impl() & dbuf_dirty(). - Only issue cache checks for non-filling dirties.
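Expressed as code, the first fix reduces to a guard around the ARC probe in dbuf_dirty(), roughly (a sketch only; the exact change is in the dbuf.c diff below):

	/* Probe the ARC only when this dirty cannot fill the whole block. */
	if (fill_buf == NULL && size != db->db.db_size)
		cached = dbuf_read_cached(db, dn);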
- Refactor positive exit handling for sub-functions of dbuf_read_impl() so that they're handled in dbuf_read_impl() instead. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#53 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#53 (text) ==== @@ -817,9 +817,7 @@ { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); - if (db->db_blkid != DMU_BONUS_BLKID) - return (FALSE); - + ASSERT(db->db_blkid == DMU_BONUS_BLKID); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(DB_DNODE_HELD(db)); ASSERT3U(bonuslen, <=, db->db.db_size); @@ -829,11 +827,8 @@ bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); - DB_DNODE_EXIT(db); dbuf_update_data(db); DBUF_STATE_CHANGE(db, =, DB_CACHED, "bonus buffer filled"); - if (!(*flags & DB_RF_CACHED_ONLY)) - mutex_exit(&db->db_mtx); return (TRUE); } @@ -872,13 +867,9 @@ buf = arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, DBUF_GET_BUFC_TYPE(db)); - DB_DNODE_EXIT(db); bzero(buf->b_data, db->db.db_size); DBUF_STATE_CHANGE(db, =, DB_READ, "hole read satisfied"); - *flags |= DB_RF_CACHED; dbuf_read_complete(db, buf); - if (!(*flags & DB_RF_CACHED_ONLY)) - mutex_exit(&db->db_mtx); return (TRUE); } return (FALSE); @@ -921,11 +912,14 @@ */ //ASSERT(db->db_buf == NULL); - if (dbuf_read_bonus(db, dn, flags)) + if ((db->db_blkid == DMU_BONUS_BLKID && dbuf_read_bonus(db, dn, flags)) + || dbuf_read_on_hole(db, dn, flags)) { + DB_DNODE_EXIT(db); + *flags |= DB_RF_CACHED; + if ((*flags & DB_RF_CACHED_ONLY) == 0) + mutex_exit(&db->db_mtx); return; - - if (dbuf_read_on_hole(db, dn, flags)) - return; + } spa = dn->dn_objset->os_spa; @@ -937,9 +931,9 @@ db->db_state = DB_READ; /* for read_complete */ dbuf_read_complete(db, buf); *flags |= DB_RF_CACHED; - /*mutex_exit(&db->db_mtx);*/ } DB_DNODE_EXIT(db); + /*mutex_exit(&db->db_mtx);*/ return; } @@ -1903,8 +1897,12 @@ ASSERT(db->db.db_data == NULL); DB_GET_SPA(&spa, db); fill_buf = db_buf; - /* See if our buffer is in ARC from a previous use. */ - if (fill_buf == NULL) + + /* + * If this dirty won't fill the buffer, see if a + * previous version is in the ARC. + */ + if (fill_buf == NULL && size != db->db.db_size) cached = dbuf_read_cached(db, dn); if (!cached) { Change 521532 by willa@willa_repo on 2012/01/17 17:32:03 Initial hook-up of callbacks for dbuf state transitions. Whenever DBUF_STATE_CHANGE is called, the dbuf's callback list will be consulted. If it is not empty, dbuf_run_callbacks() is called. Add dbuf_run_callbacks(): every dbuf_array_t callback whose configured state mask matches will be called. Change the callback function pointer typedef to remove the third argument (intended for 'state'), which is redundant by simply passing the dbuf after changing its state. Initialize and destroy the db->db_callbacks list. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#54 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#19 edit Differences ... 
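For clarity, the dispatch this change adds boils down to the following
sketch (illustrative only; the diff below is the actual change). One
caveat: the committed loop tests db != NULL as its termination
condition, where dbc != NULL appears to be what was intended, so the
sketch uses the latter:

	static void
	dbuf_run_callbacks(dmu_buf_impl_t *db)
	{
		dbuf_callback_node_t *dbc;

		for (dbc = list_head(&db->db_callbacks); dbc != NULL;
		    dbc = list_next(&db->db_callbacks, dbc)) {
			/* db_state values are bit flags, so mask-match. */
			if (dbc->dbc_dba->dba_states & db->db_state)
				dbc->dbc_dba->dba_cb(dbc->dbc_dba, db);
		}
	}
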
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#54 (text) ==== @@ -112,9 +112,16 @@ return (crc); } +static void dbuf_run_callbacks(dmu_buf_impl_t *db); + +#define DBUF_STATE_CHANGE_COMMON(db, op, state, why) \ + (db)->db_state op state; \ + if (!list_is_empty(&(db)->db_callbacks)) \ + dbuf_run_callbacks(db) + #ifdef ZFS_DEBUG #define DBUF_STATE_CHANGE(db, op, state, why) do { \ - (db)->db_state op state; \ + DBUF_STATE_CHANGE_COMMON(db, op, state, why); \ if (zfs_flags & ZFS_DEBUG_DBUF_STATE) { \ uint64_t __db_obj = (db)->db.db_object; \ char __db_buf[32]; \ @@ -132,7 +139,7 @@ } while(0) #else #define DBUF_STATE_CHANGE(db, op, state, why) do { \ - (db)->db_state op state; \ + DBUF_STATE_CHANGE_COMMON(db, op, state, why); \ } while(0) #endif @@ -2621,6 +2628,9 @@ list_create(&db->db_dirty_records, sizeof(dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, db_dirty_record_link)); + list_create(&db->db_callbacks, sizeof(dbuf_callback_node_t), + offsetof(dbuf_callback_node_t, dbc_link)); + db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; @@ -2690,6 +2700,19 @@ return (db); } +static void +dbuf_run_callbacks(dmu_buf_impl_t *db) +{ + dbuf_callback_node_t *dbc; + + for (dbc = list_head(&db->db_callbacks); db != NULL; + dbc = list_next(&db->db_callbacks, dbc)) { + if (dbc->dbc_dba->dba_states & db->db_state) + dbc->dbc_dba->dba_cb(dbc->dbc_dba, db); + /* XXX what else should we do? */ + } +} + static int dbuf_do_evict(void *private) { @@ -2748,6 +2771,7 @@ db->db_parent = NULL; db->db_buf = NULL; list_destroy(&db->db_dirty_records); + list_destroy(&db->db_callbacks); ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#19 (text) ==== @@ -326,7 +326,7 @@ * A callback function pointer which provides the callee, via dba.dba_private, * a way to keep track of the state of an array of dbufs. */ -typedef void(*dmu_callback_t)(struct dbuf_array *, struct dmu_buf_impl *, int); +typedef void(*dmu_callback_t)(struct dbuf_array *, struct dmu_buf_impl *); typedef struct dbuf_array { Change 521534 by willa@willa_repo on 2012/01/17 17:45:39 Extend dmu_buf_hold_array_by_dnode() to accept callbacks. Callers must pass in a callback and a private pointer, or neither. Populate each member dbuf's callback list. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#15 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#15 (text) ==== @@ -374,7 +374,8 @@ */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - int read, void *tag, dbuf_array_t **dbap, uint32_t flags) + int read, void *tag, dbuf_array_t **dbap, uint32_t flags, + dmu_callback_t dba_cb, void *dba_private) { dsl_pool_t *dp = NULL; dmu_buf_impl_t **dbp; @@ -467,6 +468,16 @@ } } + /* Hook up the callbacks to each affected dbuf. */ + /* The callback and private data better both be set or unset. 
*/ + ASSERT(((dba_cb == NULL) ^ (dba_private == NULL)) == 0); + if (dba_cb != NULL) { + dba->dba_cb = dba_cb; + dba->dba_private = dba_private; + for (i = 0; i < nblks; i++) + list_insert_head(&dbp[i]->db_callbacks, dba); + } + *dbap = dba; return (0); } @@ -487,7 +498,7 @@ return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - dbap, DMU_READ_PREFETCH); + dbap, DMU_READ_PREFETCH, NULL, NULL); dnode_rele(dn, FTAG); @@ -505,7 +516,7 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - dbap, DMU_READ_PREFETCH); + dbap, DMU_READ_PREFETCH, NULL, NULL); DB_DNODE_EXIT(db); return (err); @@ -784,7 +795,7 @@ * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &dba, flags); + TRUE, FTAG, &dba, flags, NULL, NULL); if (err) break; @@ -1064,7 +1075,7 @@ int i; err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - FALSE, FTAG, &dba, DMU_READ_PREFETCH); + FALSE, FTAG, &dba, DMU_READ_PREFETCH, NULL, NULL); if (err) return (err); Change 521567 by willa@willa_repo on 2012/01/18 12:40:39 Get rid of dmu_buf_hold_array_by_bonus(). No one's using it. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#16 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#16 (text) ==== @@ -44,8 +44,6 @@ #include #endif -static int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, - uint64_t length, int read, void *tag, dbuf_array_t **dbap); static void dmu_buf_rele_array(dbuf_array_t *dba); const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { @@ -505,23 +503,6 @@ return (err); } -static int -dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, - uint64_t length, int read, void *tag, dbuf_array_t **dbap) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - int err; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - dbap, DMU_READ_PREFETCH, NULL, NULL); - DB_DNODE_EXIT(db); - - return (err); -} - /** * Releases the hold on an array of dmu_buf_t*'s, and frees the array. The * hold on the array of buffers MUST be released with dmu_buf_rele_array. You Change 521625 by willa@willa_repo on 2012/01/18 16:50:23 Bubble up the DMU context structure to consumers. Move dbuf_array_t from dbuf.h to dmu.h. To facilitate this, change its name to dmu_context_t and change the primary array type from dmu_buf_impl_t to dmu_buf_t so as to preserve the hiding of dbuf implementation details from DMU consumers. Consumers of this interface within DMU or DBUF can still access those details. For all current consumers of the DMU read/write interfaces, pass a NULL dmu_context_t *. For all affected interfaces, if this argument is NULL, create and free it within the function. This is intended as a compatibility shim, to allow focus to be on making async reads happen for ZVOL and ZFS data I/Os. Collapse the tag argument passed to dmu_buf_hold_array_by_dnode() into the dmu_context_t * argument, which now replaces dbuf_array_t **. This function, and dmu_buf_rele_array(), are no longer responsible for creating or freeing the context super-structure. But they do still create and free the array of dbufs within that context. This provides a springboard from which DMU consumers can control the execution of I/O, e.g. specifying a callback on completion, rather than blocking. 
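As a concrete illustration of that springboard, a consumer could
eventually drive a non-blocking read along these lines (hypothetical
consumer code: my_done and my_arg are invented names; the fields are
the dmu_context_t members added to dmu.h below, and DB_CACHED is
assumed to be the state of interest):

	static void
	my_done(dmu_context_t *dmu_ctx, void *db)
	{
		/* Runs when a dbuf reaches a state in dmu_ctx->states. */
	}

	dmu_context_t ctx;

	bzero(&ctx, sizeof (ctx));
	ctx.tag = FTAG;
	ctx.dmu_cb = my_done;
	ctx.dmu_cb_private = my_arg;
	ctx.states = DB_CACHED;
	(void) dmu_read(os, object, offset, size, buf, DMU_READ_PREFETCH, &ctx);

Note one wrinkle in this change: dmu_callback_t is declared to take
(dmu_context_t *, void *), but dbuf_run_callbacks() invokes
dmu_cb(dmu_ctx->dmu_cb_private, db), passing the private pointer as
the first argument; the typedef and the call site still need to agree.
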
Eventually I think the DMU interface needs to be restructured so that
consumers call one method to set up their DMU context, then call a
separate interface to perform the actual I/O. Doing so should collapse
the current 30 or so DMU interfaces down to fewer than 5, and would
shrink their argument lists significantly.

It would also give DMU consumers a way to cause less lock contention
on a given dnode's parent block, as the comment above
dmu.c:dmu_buf_hold_array_by_dnode() mentions: they would set up their
context early and reuse it throughout.

Additionally, it may be possible to combine dmu_context_t with
dmu_tx_t, which serves a similar purpose but is currently only used
for writes.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c#2 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#55 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#17 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c#2 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c#2 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#3 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c#2 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c#2 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#20 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#4 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c#2 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c#2 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c#2 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c#2 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#4 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#4 edit

Differences ...
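The compatibility shim described above reduces to a single pattern,
repeated in each affected interface in the diff below (condensed from
the change itself): allocate a context on behalf of legacy callers
that pass NULL, and free it on the way out. Worth flagging: the early
error returns in dmu_read_uio() and dmu_write_uio_dnode() appear to
skip the final kmem_free(), leaking the shim-allocated context.

	dmu_context_t *dmu_ctx = dmu_ctx_p;

	if (dmu_ctx == NULL) {
		/* Legacy caller passed NULL; supply a context. */
		dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP);
		dmu_ctx->tag = FTAG;
	}

	/* ... hold the buffers, move the data, release ... */

	dmu_buf_rele_array(dmu_ctx);
	if (dmu_ctx != dmu_ctx_p)
		kmem_free(dmu_ctx, sizeof(dmu_context_t));
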
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c#2 (text) ==== @@ -340,7 +340,7 @@ mutex_enter(&bpo->bpo_lock); dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - sizeof (subobj), &subobj, tx); + sizeof (subobj), &subobj, tx, NULL); bpo->bpo_phys->bpo_num_subobjs++; /* @@ -361,7 +361,8 @@ 0, FTAG, &subdb, 0)); dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx); + numsubsub * sizeof (subobj), subdb->db_data, tx, + NULL); dmu_buf_rele(subdb, FTAG); bpo->bpo_phys->bpo_num_subobjs += numsubsub; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#55 (text) ==== @@ -2707,8 +2707,8 @@ for (dbc = list_head(&db->db_callbacks); db != NULL; dbc = list_next(&db->db_callbacks, dbc)) { - if (dbc->dbc_dba->dba_states & db->db_state) - dbc->dbc_dba->dba_cb(dbc->dbc_dba, db); + if (dbc->dmu_ctx->states & db->db_state) + dbc->dmu_ctx->dmu_cb(dbc->dmu_ctx->dmu_cb_private, db); /* XXX what else should we do? */ } } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#17 (text) ==== @@ -44,7 +44,7 @@ #include #endif -static void dmu_buf_rele_array(dbuf_array_t *dba); +static void dmu_buf_rele_array(dmu_context_t *dmu_ctx); const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "unallocated" }, @@ -372,12 +372,10 @@ */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - int read, void *tag, dbuf_array_t **dbap, uint32_t flags, - dmu_callback_t dba_cb, void *dba_private) + int read, dmu_context_t *dmu_ctx, uint32_t flags) { dsl_pool_t *dp = NULL; dmu_buf_impl_t **dbp; - dbuf_array_t *dba; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; @@ -408,11 +406,8 @@ } nblks = 1; } - dba = kmem_zalloc(sizeof(dbuf_array_t), KM_SLEEP); - dba->dba_count = nblks; - dba->dba_tag = tag; - dbp = kmem_zalloc(sizeof (dmu_buf_impl_t *) * nblks, KM_SLEEP); - dba->dba_dbp = dbp; + dmu_ctx->count = nblks; + dmu_ctx->dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); if (dn->dn_objset->os_dsl_dataset) dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; @@ -421,10 +416,10 @@ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, offset); for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); + dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, dmu_ctx->tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dmu_ctx); zio_nowait(zio); return (EIO); } @@ -435,7 +430,7 @@ else curthread->td_ru.ru_oublock++; #endif - dbp[i] = db; + dmu_ctx->dbp[i] = &db->db; } rw_exit(&dn->dn_struct_rwlock); @@ -445,14 +440,14 @@ if (dp && dsl_pool_sync_context(dp)) dp->dp_read_overhead += gethrtime() - start; if (err) { - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dmu_ctx); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbp[i]; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dmu_ctx->dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state & (DB_READ|DB_FILL)) cv_wait(&db->db_changed, &db->db_mtx); @@ -460,23 +455,22 @@ err = EIO; mutex_exit(&db->db_mtx); if (err) { - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dmu_ctx); return (err); } } } /* Hook up the callbacks to each affected dbuf. 
*/ - /* The callback and private data better both be set or unset. */ - ASSERT(((dba_cb == NULL) ^ (dba_private == NULL)) == 0); - if (dba_cb != NULL) { - dba->dba_cb = dba_cb; - dba->dba_private = dba_private; - for (i = 0; i < nblks; i++) - list_insert_head(&dbp[i]->db_callbacks, dba); + /* If the callback isn't set, private data better not be either. */ + ASSERT(dmu_ctx->dmu_cb != NULL || dmu_ctx->dmu_cb_private == NULL); + if (dmu_ctx->dmu_cb != NULL) { + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dmu_ctx->dbp[i]; + list_insert_head(&db->db_callbacks, dmu_ctx); + } } - *dbap = dba; return (0); } @@ -486,7 +480,7 @@ */ static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, dbuf_array_t **dbap) + uint64_t length, int read, dmu_context_t *dmu_ctx) { dnode_t *dn; int err; @@ -495,8 +489,8 @@ if (err) return (err); - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - dbap, DMU_READ_PREFETCH, NULL, NULL); + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, + dmu_ctx, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); @@ -509,18 +503,19 @@ * can NOT release the hold on each buffer individually with dmu_buf_rele. */ static void -dmu_buf_rele_array(dbuf_array_t *dba) +dmu_buf_rele_array(dmu_context_t *dmu_ctx) { int i; - for (i = 0; i < dba->dba_count; i++) { - if (dba->dba_dbp[i]) - dbuf_rele(dba->dba_dbp[i], dba->dba_tag); + for (i = 0; i < dmu_ctx->count; i++) { + if (dmu_ctx->dbp[i]) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dmu_ctx->dbp[i]; + dbuf_rele(db, dmu_ctx->tag); + } } - if (dba->dba_count > 0) - kmem_free(dba->dba_dbp, - sizeof(dmu_buf_impl_t *) * dba->dba_count); - kmem_free(dba, sizeof(dbuf_array_t)); + if (dmu_ctx->count > 0) + kmem_free(dmu_ctx->dbp, + sizeof(dmu_buf_t *) * dmu_ctx->count); } /** \brief Asynchronously try to read in the data. @@ -745,12 +740,17 @@ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags) + void *buf, uint32_t flags, dmu_context_t *dmu_ctx_p) { dnode_t *dn; - dbuf_array_t *dba; + dmu_context_t *dmu_ctx = dmu_ctx_p; int err; + if (dmu_ctx == NULL) { + dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); + dmu_ctx->tag = FTAG; + } + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); @@ -776,14 +776,14 @@ * to be reading in parallel. 
*/ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &dba, flags, NULL, NULL); + TRUE, dmu_ctx, flags); if (err) break; - for (i = 0; i < dba->dba_count; i++) { + for (i = 0; i < dmu_ctx->count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = &dba->dba_dbp[i]->db; + dmu_buf_t *db = dmu_ctx->dbp[i]; ASSERT(size > 0); @@ -796,36 +796,43 @@ size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dmu_ctx); } dnode_rele(dn, FTAG); + if (dmu_ctx != dmu_ctx_p) + kmem_free(dmu_ctx, sizeof(dmu_context_t)); return (err); } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) + const void *buf, dmu_tx_t *tx, dmu_context_t *dmu_ctx_p) { - dbuf_array_t *dba; int i; + dmu_context_t *dmu_ctx = dmu_ctx_p; if (size == 0) return; + if (dmu_ctx == NULL) { + dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); + dmu_ctx->tag = FTAG; + } + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &dba)); + FALSE, dmu_ctx)); - for (i = 0; i < dba->dba_count; i++) { + for (i = 0; i < dmu_ctx->count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = &dba->dba_dbp[i]->db; + dmu_buf_t *db = dmu_ctx->dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == dba->dba_count - 1 || + ASSERT(i == 0 || i == dmu_ctx->count - 1 || tocpy == db->db_size); if (tocpy == db->db_size) @@ -841,28 +848,39 @@ size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dmu_ctx); + + if (dmu_ctx != dmu_ctx_p) + kmem_free(dmu_ctx, sizeof(dmu_context_t)); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx) + dmu_tx_t *tx, dmu_context_t *dmu_ctx_p) { - dbuf_array_t *dba; + dmu_context_t *dmu_ctx = dmu_ctx_p; int i; if (size == 0) return; + if (dmu_ctx == NULL) { + dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); + dmu_ctx->tag = FTAG; + } + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &dba)); + FALSE, dmu_ctx)); - for (i = 0; i < dba->dba_count; i++) { - dmu_buf_t *db = &dba->dba_dbp[i]->db; + for (i = 0; i < dmu_ctx->count; i++) { + dmu_buf_t *db = dmu_ctx->dbp[i]; dmu_buf_will_not_fill(db, tx); } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dmu_ctx); + + if (dmu_ctx != dmu_ctx_p) + kmem_free(dmu_ctx, sizeof(dmu_context_t)); } /** @@ -990,18 +1008,24 @@ #ifdef _KERNEL int -dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) +dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, + dmu_context_t *dmu_ctx_p) { - dbuf_array_t *dba; int i, err; + dmu_context_t *dmu_ctx = dmu_ctx_p; xuio_t *xuio = NULL; + if (dmu_ctx == NULL) { + dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); + dmu_ctx->tag = FTAG; + } + /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. 
*/ - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, - &dba); + err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, + dmu_ctx); if (err) return (err); @@ -1010,10 +1034,10 @@ xuio = (xuio_t *)uio; #endif - for (i = 0; i < dba->dba_count; i++) { + for (i = 0; i < dmu_ctx->count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = &dba->dba_dbp[i]->db; + dmu_buf_t *db = dmu_ctx->dbp[i]; ASSERT(size > 0); @@ -1043,34 +1067,42 @@ size -= tocpy; } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dmu_ctx); + if (dmu_ctx != dmu_ctx_p) + kmem_free(dmu_ctx, sizeof(dmu_context_t)); return (err); } static int -dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) +dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx, + dmu_context_t *dmu_ctx_p) { - dbuf_array_t *dba; int err = 0; + dmu_context_t *dmu_ctx = dmu_ctx_p; int i; + if (dmu_ctx == NULL) { + dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); + dmu_ctx->tag = FTAG; + } + err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - FALSE, FTAG, &dba, DMU_READ_PREFETCH, NULL, NULL); + FALSE, dmu_ctx, DMU_READ_PREFETCH); if (err) return (err); - for (i = 0; i < dba->dba_count; i++) { + for (i = 0; i < dmu_ctx->count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = &dba->dba_dbp[i]->db; + dmu_buf_t *db = dmu_ctx->dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == dba->dba_count-1 || tocpy == db->db_size); + ASSERT(i == 0 || i == dmu_ctx->count-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); @@ -1094,13 +1126,16 @@ size -= tocpy; } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dmu_ctx); + if (dmu_ctx != dmu_ctx_p) + kmem_free(dmu_ctx, sizeof(dmu_context_t)); + return (err); } int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, - dmu_tx_t *tx) + dmu_tx_t *tx, dmu_context_t *dmu_ctx_p) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; @@ -1111,7 +1146,7 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - err = dmu_write_uio_dnode(dn, uio, size, tx); + err = dmu_write_uio_dnode(dn, uio, size, tx, dmu_ctx_p); DB_DNODE_EXIT(db); return (err); @@ -1119,7 +1154,7 @@ int dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, - dmu_tx_t *tx) + dmu_tx_t *tx, dmu_context_t *dmu_ctx_p) { dnode_t *dn; int err; @@ -1131,7 +1166,7 @@ if (err) return (err); - err = dmu_write_uio_dnode(dn, uio, size, tx); + err = dmu_write_uio_dnode(dn, uio, size, tx, dmu_ctx_p); dnode_rele(dn, FTAG); @@ -1141,24 +1176,28 @@ #ifdef sun int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - page_t *pp, dmu_tx_t *tx) + page_t *pp, dmu_tx_t *tx, dmu_context_t *dmu_ctx_p) { - dbuf_array_t *dba; + dmu_context_t *dmu_ctx = dmu_ctx_p; int i; int err; + if (dmu_ctx == NULL) { + dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); + dmu_ctx->tag = FTAG; + } + if (size == 0) return (0); - err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &dba); + err = dmu_buf_hold_array(os, object, offset, size, FALSE, dmu_ctx); if (err) return (err); - for (i = 0; i < dba->dba_count; i++) { + for (i = 0; i < dmu_ctx->count; i++) { int tocpy, copied, thiscpy; int bufoff; - dmu_buf_t *db = dba->dba_dbp[i]->db; + dmu_buf_t *db = dmu_ctx->dbp[i]; caddr_t va; ASSERT(size > 0); @@ -1167,7 +1206,7 @@ bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == dba->dba_count-1 || tocpy == 
db->db_size); + ASSERT(i == 0 || i == dmu_ctx->count-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); @@ -1189,7 +1228,10 @@ offset += tocpy; size -= tocpy; } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dmu_ctx); + if (dmu_ctx != dmu_ctx_p) + kmem_free(dmu_ctx, sizeof(dmu_context_t)); + return (err); } #endif /* sun */ @@ -1253,7 +1295,7 @@ DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); - dmu_write(os, object, offset, blksz, buf->b_data, tx); + dmu_write(os, object, offset, blksz, buf->b_data, tx, NULL); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c#2 (text) ==== @@ -1146,7 +1146,7 @@ if (ra->byteswap) dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); dmu_write(os, drrw->drr_object, - drrw->drr_offset, drrw->drr_length, data, tx); + drrw->drr_offset, drrw->drr_length, data, tx, NULL); dmu_tx_commit(tx); return (0); } @@ -1203,7 +1203,7 @@ return (err); } dmu_write(os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx, NULL); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); return (0); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c#2 (text) ==== @@ -970,7 +970,7 @@ ASSERT(smo->smo_object != 0); dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * (sm->sm_start >> vd->vdev_ms_shift), - sizeof (uint64_t), &smo->smo_object, tx); + sizeof (uint64_t), &smo->smo_object, tx, NULL); } mutex_enter(&msp->ms_lock); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#3 (text) ==== @@ -1299,7 +1299,7 @@ packed = kmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, - DMU_READ_PREFETCH); + DMU_READ_PREFETCH, NULL); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); @@ -5263,7 +5263,7 @@ KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); - dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); + dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx, NULL); kmem_free(packed, bufsize); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c#2 (text) ==== @@ -125,12 +125,12 @@ firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf, DMU_READ_PREFETCH)) != 0) + buf, DMU_READ_PREFETCH, NULL)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread, DMU_READ_PREFETCH)) != 0) + buf + firstread, DMU_READ_PREFETCH, NULL)) != 0) return (err); } @@ -161,13 +161,13 @@ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); shpp->sh_eof += len; - dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); + dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx, NULL); len -= firstwrite; if (len > 0) { /* write out the rest at the beginning of physical file */ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, - len, (char *)buf + firstwrite, tx); + len, (char *)buf + firstwrite, tx, NULL); } return (0); @@ -409,10 +409,10 @@ } err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, - DMU_READ_PREFETCH); + DMU_READ_PREFETCH, NULL); if (leftover && err 
== 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len, DMU_READ_PREFETCH); + leftover, buf + read_len, DMU_READ_PREFETCH, NULL); } mutex_exit(&spa->spa_history_lock); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c#2 (text) ==== @@ -312,7 +312,7 @@ mutex_exit(sm->sm_lock); error = dmu_read(os, smo->smo_object, offset, size, entry_map, - DMU_READ_PREFETCH); + DMU_READ_PREFETCH, NULL); mutex_enter(sm->sm_lock); if (error != 0) break; @@ -449,7 +449,7 @@ if (entry == entry_map_end) { mutex_exit(sm->sm_lock); dmu_write(os, smo->smo_object, smo->smo_objsize, - bufsize, entry_map, tx); + bufsize, entry_map, tx, NULL); mutex_enter(sm->sm_lock); smo->smo_objsize += bufsize; entry = entry_map; @@ -469,7 +469,7 @@ size = (entry - entry_map) * sizeof (uint64_t); mutex_exit(sm->sm_lock); dmu_write(os, smo->smo_object, smo->smo_objsize, - size, entry_map, tx); + size, entry_map, tx, NULL); mutex_enter(sm->sm_lock); smo->smo_objsize += size; } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#20 (text) ==== @@ -319,44 +319,17 @@ int size; } dbuf_dirty_range_t; -struct dbuf_array; -struct dmu_buf_impl; - /** * A callback function pointer which provides the callee, via dba.dba_private, - * a way to keep track of the state of an array of dbufs. + * a way to keep track of the state of an array of dbufs. See dmu_context_t. */ -typedef void(*dmu_callback_t)(struct dbuf_array *, struct dmu_buf_impl *); - -typedef struct dbuf_array { - - /** The set of dbufs in this array. */ - struct dmu_buf_impl **dba_dbp; - - /** The number of dbufs in the array. */ - size_t dba_count; - - /** The tag used for the array. */ - void *dba_tag; - - /** The callback to call if the conditions are met. */ - dmu_callback_t dba_cb; - - /** The dbuf states when a callback may be called. */ - int dba_states; - - /** Private data for the callback. */ - void *dba_private; - -} dbuf_array_t; - typedef struct dbuf_callback_node { /** This object's entry in the list in dmu_buf_impl_t. */ list_node_t dbc_link; - /** The dbuf array this callback is associated with. */ - dbuf_array_t *dbc_dba; + /** The DMU context this callback is associated with. */ + dmu_context_t *dmu_ctx; } dbuf_callback_node_t; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#4 (text) ==== @@ -211,6 +211,34 @@ void *db_data; /**< data in buffer */ } dmu_buf_t; +/** + * \brief These structures are for DMU consumers that want async callbacks. + */ +struct dmu_context; +typedef void (*dmu_callback_t)(struct dmu_context *, void *); + +typedef struct dmu_context { + + /** The set of buffers associated with this context. */ + struct dmu_buf **dbp; + + /** The number of buffers associated with this context. */ + size_t count; + + /** The tag used for this context. */ + void *tag; + + /** The callback to call if the conditions are met. */ + dmu_callback_t dmu_cb; + + /** The dbuf states when a callback may be called. */ + int states; + + /** Private data for the callback. 
*/ + void *dmu_cb_private; + +} dmu_context_t; + typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); /* @@ -337,18 +365,19 @@ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags); + void *buf, uint32_t flags, dmu_context_t *dmu_ctx_p); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx); + const void *buf, dmu_tx_t *tx, dmu_context_t *dmu_ctx_p); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx); -int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); + dmu_tx_t *tx, dmu_context_t *dmu_ctx_p); +int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, + dmu_context_t *dmu_ctx_p); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, - dmu_tx_t *tx); + dmu_tx_t *tx, dmu_context_t *dmu_ctx_p); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, - dmu_tx_t *tx); + dmu_tx_t *tx, dmu_context_t *dmu_ctx_p); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, struct page *pp, dmu_tx_t *tx); + uint64_t size, struct page *pp, dmu_tx_t *tx, dmu_context_t *dmu_ctx_p); struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c#2 (text) ==== @@ -846,7 +846,7 @@ uint64_t object = 0; error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, - DMU_READ_PREFETCH); + DMU_READ_PREFETCH, NULL); if (error) return (error); if (object != 0) { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c#2 (text) ==== @@ -1109,7 +1109,7 @@ if (znode_acl.z_acl_extern_obj) { error = dmu_read(zp->z_zfsvfs->z_os, znode_acl.z_acl_extern_obj, 0, aclnode->z_size, - aclnode->z_acldata, DMU_READ_PREFETCH); + aclnode->z_acldata, DMU_READ_PREFETCH, NULL); } else { bcopy(znode_acl.z_ace_data, aclnode->z_acldata, aclnode->z_size); @@ -1273,7 +1273,8 @@ if (aclnode->z_ace_count == 0) continue; dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); + aclnode->z_size, aclnode->z_acldata, tx, + NULL); off += aclnode->z_size; } } else { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c#2 (text) ==== @@ -132,7 +132,7 @@ packed = kmem_alloc(fuid_size, KM_SLEEP); VERIFY(dmu_read(os, fuid_obj, 0, - fuid_size, packed, DMU_READ_PREFETCH) == 0); + fuid_size, packed, DMU_READ_PREFETCH, NULL) == 0); VERIFY(nvlist_unpack(packed, fuid_size, &nvp, 0) == 0); VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, @@ -275,7 +275,7 @@ nvlist_free(nvp); zfsvfs->z_fuid_size = nvsize; dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, - zfsvfs->z_fuid_size, packed, tx); + zfsvfs->z_fuid_size, packed, tx, NULL); kmem_free(packed, zfsvfs->z_fuid_size); VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, FTAG, &db)); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c#2 (text) ==== @@ -494,12 +494,14 @@ itx = zil_itx_create(txtype, sizeof (*lr) + (write_state == WR_COPIED ? 
len : 0)); lr = (lr_write_t *)&itx->itx_lr; - if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, - zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { - zil_itx_destroy(itx); - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - write_state = WR_NEED_COPY; + if (write_state == WR_COPIED) { + if (dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, + lr + 1, DMU_READ_NO_PREFETCH, NULL) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; + } } itx->itx_wr_state = write_state; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#4 (text) ==== @@ -408,10 +408,10 @@ va = zfs_map_page(pp, &sf); if (segflg == UIO_NOCOPY) { (void) dmu_write(os, oid, start+off, nbytes, - va+off, tx); + va+off, tx, NULL); } else { (void) dmu_read(os, oid, start+off, nbytes, - va+off, DMU_READ_PREFETCH); + va+off, DMU_READ_PREFETCH, NULL); } zfs_unmap_page(sf); VM_OBJECT_LOCK(obj); @@ -463,7 +463,7 @@ VM_OBJECT_UNLOCK(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, - DMU_READ_PREFETCH); + DMU_READ_PREFETCH, NULL); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); @@ -533,7 +533,7 @@ page_unlock(pp); } else { VM_OBJECT_UNLOCK(obj); - error = dmu_read_uio(os, zp->z_id, uio, bytes); + error = dmu_read_uio(os, zp->z_id, uio, bytes, NULL); VM_OBJECT_LOCK(obj); } len -= bytes; @@ -674,7 +674,7 @@ if (vn_has_cached_data(vp)) error = mappedread(vp, nbytes, uio); else - error = dmu_read_uio(os, zp->z_id, uio, nbytes); + error = dmu_read_uio(os, zp->z_id, uio, nbytes, NULL); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -942,7 +942,7 @@ if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes, tx); + uio, nbytes, tx, NULL); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; @@ -957,7 +957,7 @@ aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, - aiov->iov_len, aiov->iov_base, tx); + aiov->iov_len, aiov->iov_base, tx, NULL); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { @@ -1147,7 +1147,7 @@ error = ENOENT; } else { error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); + DMU_READ_NO_PREFETCH, NULL); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ @@ -4329,7 +4329,7 @@ if (zp->z_blksz <= PAGESIZE) { caddr_t va = zfs_map_page(pp, S_READ); ASSERT3U(len, <=, PAGESIZE); - dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); + dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx, NULL); zfs_unmap_page(pp, va); } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); @@ -4604,7 +4604,7 @@ ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); + DMU_READ_PREFETCH, NULL); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ @@ -5497,7 +5497,7 @@ VM_OBJECT_UNLOCK(object); va = zfs_map_page(mreq, &sf); error = dmu_read(os, zp->z_id, IDX_TO_OFF(mreq->pindex), - size, va, DMU_READ_PREFETCH); + size, va, DMU_READ_PREFETCH, NULL); if (size != PAGE_SIZE) bzero(va + size, PAGE_SIZE - size); zfs_unmap_page(sf); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#4 (text) ==== @@ -415,7 +415,7 @@ if (error) { dmu_tx_abort(tx); } 
else { - dmu_write(os, ZVOL_OBJ, offset, length, data, tx); + dmu_write(os, ZVOL_OBJ, offset, length, data, tx, NULL); dmu_tx_commit(tx); } @@ -1009,7 +1009,7 @@ */ if (buf != NULL) { /* immediate write */ error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); + DMU_READ_NO_PREFETCH, NULL); } else { size = zv->zv_volblocksize; offset = P2ALIGN(offset, size); @@ -1084,12 +1084,14 @@ itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; - if (write_state == WR_COPIED && dmu_read(zv->zv_objset, - ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { - zil_itx_destroy(itx); - itx = zil_itx_create(TX_WRITE, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - write_state = WR_NEED_COPY; + if (write_state == WR_COPIED) { + if (dmu_read(zv->zv_objset, ZVOL_OBJ, off, len, lr + 1, + DMU_READ_NO_PREFETCH, NULL) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; + } } itx->itx_wr_state = write_state; @@ -1244,7 +1246,7 @@ size_t size = MIN(resid, zvol_maxphys); if (doread) { error = dmu_read(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH); + DMU_READ_PREFETCH, NULL); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); @@ -1252,7 +1254,8 @@ if (error) { dmu_tx_abort(tx); } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + dmu_write(os, ZVOL_OBJ, off, size, addr, tx, + NULL); zvol_log_write(zv, tx, off, size, sync); dmu_tx_commit(tx); } @@ -1362,7 +1365,8 @@ if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; - error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); + error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, + NULL); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -1446,7 +1450,8 @@ dmu_assign_arcbuf(zv->zv_dbuf, off, buf, tx); error = 0; } else { - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); + error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx, + NULL); } if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); @@ -1463,7 +1468,7 @@ dmu_tx_abort(tx); break; } - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); + error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx, NULL); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); Change 521719 by willa@willa_repo on 2012/01/19 14:21:48 Rollback to change 521532. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#56 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#18 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#4 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#21 edit ... 
//depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#5 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c#3 (text) ==== @@ -340,7 +340,7 @@ mutex_enter(&bpo->bpo_lock); dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - sizeof (subobj), &subobj, tx, NULL); + sizeof (subobj), &subobj, tx); bpo->bpo_phys->bpo_num_subobjs++; /* @@ -361,8 +361,7 @@ 0, FTAG, &subdb, 0)); dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx, - NULL); + numsubsub * sizeof (subobj), subdb->db_data, tx); dmu_buf_rele(subdb, FTAG); bpo->bpo_phys->bpo_num_subobjs += numsubsub; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#56 (text) ==== @@ -2707,8 +2707,8 @@ for (dbc = list_head(&db->db_callbacks); db != NULL; dbc = list_next(&db->db_callbacks, dbc)) { - if (dbc->dmu_ctx->states & db->db_state) - dbc->dmu_ctx->dmu_cb(dbc->dmu_ctx->dmu_cb_private, db); + if (dbc->dbc_dba->dba_states & db->db_state) + dbc->dbc_dba->dba_cb(dbc->dbc_dba, db); /* XXX what else should we do? 
*/ } } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#18 (text) ==== @@ -44,7 +44,9 @@ #include #endif -static void dmu_buf_rele_array(dmu_context_t *dmu_ctx); +static int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, + uint64_t length, int read, void *tag, dbuf_array_t **dbap); +static void dmu_buf_rele_array(dbuf_array_t *dba); const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "unallocated" }, @@ -372,10 +374,11 @@ */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - int read, dmu_context_t *dmu_ctx, uint32_t flags) + int read, void *tag, dbuf_array_t **dbap, uint32_t flags) { dsl_pool_t *dp = NULL; dmu_buf_impl_t **dbp; + dbuf_array_t *dba; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; @@ -406,8 +409,11 @@ } nblks = 1; } - dmu_ctx->count = nblks; - dmu_ctx->dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); + dba = kmem_zalloc(sizeof(dbuf_array_t), KM_SLEEP); + dba->dba_count = nblks; + dba->dba_tag = tag; + dbp = kmem_zalloc(sizeof (dmu_buf_impl_t *) * nblks, KM_SLEEP); + dba->dba_dbp = dbp; if (dn->dn_objset->os_dsl_dataset) dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; @@ -416,10 +422,10 @@ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, offset); for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, dmu_ctx->tag); + dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); - dmu_buf_rele_array(dmu_ctx); + dmu_buf_rele_array(dba); zio_nowait(zio); return (EIO); } @@ -430,7 +436,7 @@ else curthread->td_ru.ru_oublock++; #endif - dmu_ctx->dbp[i] = &db->db; + dbp[i] = db; } rw_exit(&dn->dn_struct_rwlock); @@ -440,14 +446,14 @@ if (dp && dsl_pool_sync_context(dp)) dp->dp_read_overhead += gethrtime() - start; if (err) { - dmu_buf_rele_array(dmu_ctx); + dmu_buf_rele_array(dba); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dmu_ctx->dbp[i]; + dmu_buf_impl_t *db = dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state & (DB_READ|DB_FILL)) cv_wait(&db->db_changed, &db->db_mtx); @@ -455,22 +461,13 @@ err = EIO; mutex_exit(&db->db_mtx); if (err) { - dmu_buf_rele_array(dmu_ctx); + dmu_buf_rele_array(dba); return (err); } } } - /* Hook up the callbacks to each affected dbuf. */ - /* If the callback isn't set, private data better not be either. 
*/ - ASSERT(dmu_ctx->dmu_cb != NULL || dmu_ctx->dmu_cb_private == NULL); - if (dmu_ctx->dmu_cb != NULL) { - for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dmu_ctx->dbp[i]; - list_insert_head(&db->db_callbacks, dmu_ctx); - } - } - + *dbap = dba; return (0); } @@ -480,7 +477,7 @@ */ static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, dmu_context_t *dmu_ctx) + uint64_t length, int read, void *tag, dbuf_array_t **dbap) { dnode_t *dn; int err; @@ -489,33 +486,49 @@ if (err) return (err); - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, - dmu_ctx, DMU_READ_PREFETCH); + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + dbap, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } +static int +dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, + uint64_t length, int read, void *tag, dbuf_array_t **dbap) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + dbap, DMU_READ_PREFETCH); + DB_DNODE_EXIT(db); + + return (err); +} + /** * Releases the hold on an array of dmu_buf_t*'s, and frees the array. The * hold on the array of buffers MUST be released with dmu_buf_rele_array. You * can NOT release the hold on each buffer individually with dmu_buf_rele. */ static void -dmu_buf_rele_array(dmu_context_t *dmu_ctx) +dmu_buf_rele_array(dbuf_array_t *dba) { int i; - for (i = 0; i < dmu_ctx->count; i++) { - if (dmu_ctx->dbp[i]) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dmu_ctx->dbp[i]; - dbuf_rele(db, dmu_ctx->tag); - } + for (i = 0; i < dba->dba_count; i++) { + if (dba->dba_dbp[i]) + dbuf_rele(dba->dba_dbp[i], dba->dba_tag); } - if (dmu_ctx->count > 0) - kmem_free(dmu_ctx->dbp, - sizeof(dmu_buf_t *) * dmu_ctx->count); + if (dba->dba_count > 0) + kmem_free(dba->dba_dbp, + sizeof(dmu_buf_impl_t *) * dba->dba_count); + kmem_free(dba, sizeof(dbuf_array_t)); } /** \brief Asynchronously try to read in the data. @@ -740,17 +753,12 @@ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags, dmu_context_t *dmu_ctx_p) + void *buf, uint32_t flags) { dnode_t *dn; - dmu_context_t *dmu_ctx = dmu_ctx_p; + dbuf_array_t *dba; int err; - if (dmu_ctx == NULL) { - dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); - dmu_ctx->tag = FTAG; - } - err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); @@ -776,14 +784,14 @@ * to be reading in parallel. 
*/ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, dmu_ctx, flags); + TRUE, FTAG, &dba, flags); if (err) break; - for (i = 0; i < dmu_ctx->count; i++) { + for (i = 0; i < dba->dba_count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = dmu_ctx->dbp[i]; + dmu_buf_t *db = &dba->dba_dbp[i]->db; ASSERT(size > 0); @@ -796,43 +804,36 @@ size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dmu_ctx); + dmu_buf_rele_array(dba); } dnode_rele(dn, FTAG); - if (dmu_ctx != dmu_ctx_p) - kmem_free(dmu_ctx, sizeof(dmu_context_t)); return (err); } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx, dmu_context_t *dmu_ctx_p) + const void *buf, dmu_tx_t *tx) { + dbuf_array_t *dba; int i; - dmu_context_t *dmu_ctx = dmu_ctx_p; if (size == 0) return; - if (dmu_ctx == NULL) { - dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); - dmu_ctx->tag = FTAG; - } - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, dmu_ctx)); + FALSE, FTAG, &dba)); - for (i = 0; i < dmu_ctx->count; i++) { + for (i = 0; i < dba->dba_count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = dmu_ctx->dbp[i]; + dmu_buf_t *db = &dba->dba_dbp[i]->db; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == dmu_ctx->count - 1 || + ASSERT(i == 0 || i == dba->dba_count - 1 || tocpy == db->db_size); if (tocpy == db->db_size) @@ -848,39 +849,28 @@ size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dmu_ctx); - - if (dmu_ctx != dmu_ctx_p) - kmem_free(dmu_ctx, sizeof(dmu_context_t)); + dmu_buf_rele_array(dba); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx, dmu_context_t *dmu_ctx_p) + dmu_tx_t *tx) { - dmu_context_t *dmu_ctx = dmu_ctx_p; + dbuf_array_t *dba; int i; if (size == 0) return; - if (dmu_ctx == NULL) { - dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); - dmu_ctx->tag = FTAG; - } - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, dmu_ctx)); + FALSE, FTAG, &dba)); - for (i = 0; i < dmu_ctx->count; i++) { - dmu_buf_t *db = dmu_ctx->dbp[i]; + for (i = 0; i < dba->dba_count; i++) { + dmu_buf_t *db = &dba->dba_dbp[i]->db; dmu_buf_will_not_fill(db, tx); } - dmu_buf_rele_array(dmu_ctx); - - if (dmu_ctx != dmu_ctx_p) - kmem_free(dmu_ctx, sizeof(dmu_context_t)); + dmu_buf_rele_array(dba); } /** @@ -1008,24 +998,18 @@ #ifdef _KERNEL int -dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, - dmu_context_t *dmu_ctx_p) +dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { + dbuf_array_t *dba; int i, err; - dmu_context_t *dmu_ctx = dmu_ctx_p; xuio_t *xuio = NULL; - if (dmu_ctx == NULL) { - dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); - dmu_ctx->tag = FTAG; - } - /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. 
*/ - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, - dmu_ctx); + err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, + &dba); if (err) return (err); @@ -1034,10 +1018,10 @@ xuio = (xuio_t *)uio; #endif - for (i = 0; i < dmu_ctx->count; i++) { + for (i = 0; i < dba->dba_count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = dmu_ctx->dbp[i]; + dmu_buf_t *db = &dba->dba_dbp[i]->db; ASSERT(size > 0); @@ -1067,42 +1051,34 @@ size -= tocpy; } - dmu_buf_rele_array(dmu_ctx); - if (dmu_ctx != dmu_ctx_p) - kmem_free(dmu_ctx, sizeof(dmu_context_t)); + dmu_buf_rele_array(dba); return (err); } static int -dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx, - dmu_context_t *dmu_ctx_p) +dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { + dbuf_array_t *dba; int err = 0; - dmu_context_t *dmu_ctx = dmu_ctx_p; int i; - if (dmu_ctx == NULL) { - dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); - dmu_ctx->tag = FTAG; - } - err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - FALSE, dmu_ctx, DMU_READ_PREFETCH); + FALSE, FTAG, &dba, DMU_READ_PREFETCH); if (err) return (err); - for (i = 0; i < dmu_ctx->count; i++) { + for (i = 0; i < dba->dba_count; i++) { int tocpy; int bufoff; - dmu_buf_t *db = dmu_ctx->dbp[i]; + dmu_buf_t *db = &dba->dba_dbp[i]->db; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == dmu_ctx->count-1 || tocpy == db->db_size); + ASSERT(i == 0 || i == dba->dba_count-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); @@ -1126,16 +1102,13 @@ size -= tocpy; } - dmu_buf_rele_array(dmu_ctx); - if (dmu_ctx != dmu_ctx_p) - kmem_free(dmu_ctx, sizeof(dmu_context_t)); - + dmu_buf_rele_array(dba); return (err); } int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, - dmu_tx_t *tx, dmu_context_t *dmu_ctx_p) + dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; @@ -1146,7 +1119,7 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - err = dmu_write_uio_dnode(dn, uio, size, tx, dmu_ctx_p); + err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); @@ -1154,7 +1127,7 @@ int dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, - dmu_tx_t *tx, dmu_context_t *dmu_ctx_p) + dmu_tx_t *tx) { dnode_t *dn; int err; @@ -1166,7 +1139,7 @@ if (err) return (err); - err = dmu_write_uio_dnode(dn, uio, size, tx, dmu_ctx_p); + err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); @@ -1176,28 +1149,24 @@ #ifdef sun int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - page_t *pp, dmu_tx_t *tx, dmu_context_t *dmu_ctx_p) + page_t *pp, dmu_tx_t *tx) { - dmu_context_t *dmu_ctx = dmu_ctx_p; + dbuf_array_t *dba; int i; int err; - if (dmu_ctx == NULL) { - dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); - dmu_ctx->tag = FTAG; - } - if (size == 0) return (0); - err = dmu_buf_hold_array(os, object, offset, size, FALSE, dmu_ctx); + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &dba); if (err) return (err); - for (i = 0; i < dmu_ctx->count; i++) { + for (i = 0; i < dba->dba_count; i++) { int tocpy, copied, thiscpy; int bufoff; - dmu_buf_t *db = dmu_ctx->dbp[i]; + dmu_buf_t *db = dba->dba_dbp[i]->db; caddr_t va; ASSERT(size > 0); @@ -1206,7 +1175,7 @@ bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == dmu_ctx->count-1 || tocpy == 
db->db_size); + ASSERT(i == 0 || i == dba->dba_count-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); @@ -1228,10 +1197,7 @@ offset += tocpy; size -= tocpy; } - dmu_buf_rele_array(dmu_ctx); - if (dmu_ctx != dmu_ctx_p) - kmem_free(dmu_ctx, sizeof(dmu_context_t)); - + dmu_buf_rele_array(dba); return (err); } #endif /* sun */ @@ -1295,7 +1261,7 @@ DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); - dmu_write(os, object, offset, blksz, buf->b_data, tx, NULL); + dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c#3 (text) ==== @@ -1146,7 +1146,7 @@ if (ra->byteswap) dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); dmu_write(os, drrw->drr_object, - drrw->drr_offset, drrw->drr_length, data, tx, NULL); + drrw->drr_offset, drrw->drr_length, data, tx); dmu_tx_commit(tx); return (0); } @@ -1203,7 +1203,7 @@ return (err); } dmu_write(os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx, NULL); + drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); return (0); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c#3 (text) ==== @@ -970,7 +970,7 @@ ASSERT(smo->smo_object != 0); dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * (sm->sm_start >> vd->vdev_ms_shift), - sizeof (uint64_t), &smo->smo_object, tx, NULL); + sizeof (uint64_t), &smo->smo_object, tx); } mutex_enter(&msp->ms_lock); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#4 (text) ==== @@ -1299,7 +1299,7 @@ packed = kmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, - DMU_READ_PREFETCH, NULL); + DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); @@ -5263,7 +5263,7 @@ KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); - dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx, NULL); + dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); kmem_free(packed, bufsize); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c#3 (text) ==== @@ -125,12 +125,12 @@ firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf, DMU_READ_PREFETCH, NULL)) != 0) + buf, DMU_READ_PREFETCH)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread, DMU_READ_PREFETCH, NULL)) != 0) + buf + firstread, DMU_READ_PREFETCH)) != 0) return (err); } @@ -161,13 +161,13 @@ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); shpp->sh_eof += len; - dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx, NULL); + dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); len -= firstwrite; if (len > 0) { /* write out the rest at the beginning of physical file */ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, - len, (char *)buf + firstwrite, tx, NULL); + len, (char *)buf + firstwrite, tx); } return (0); @@ -409,10 +409,10 @@ } err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, - DMU_READ_PREFETCH, NULL); + DMU_READ_PREFETCH); if (leftover && err 
== 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len, DMU_READ_PREFETCH, NULL); + leftover, buf + read_len, DMU_READ_PREFETCH); } mutex_exit(&spa->spa_history_lock); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c#3 (text) ==== @@ -312,7 +312,7 @@ mutex_exit(sm->sm_lock); error = dmu_read(os, smo->smo_object, offset, size, entry_map, - DMU_READ_PREFETCH, NULL); + DMU_READ_PREFETCH); mutex_enter(sm->sm_lock); if (error != 0) break; @@ -449,7 +449,7 @@ if (entry == entry_map_end) { mutex_exit(sm->sm_lock); dmu_write(os, smo->smo_object, smo->smo_objsize, - bufsize, entry_map, tx, NULL); + bufsize, entry_map, tx); mutex_enter(sm->sm_lock); smo->smo_objsize += bufsize; entry = entry_map; @@ -469,7 +469,7 @@ size = (entry - entry_map) * sizeof (uint64_t); mutex_exit(sm->sm_lock); dmu_write(os, smo->smo_object, smo->smo_objsize, - size, entry_map, tx, NULL); + size, entry_map, tx); mutex_enter(sm->sm_lock); smo->smo_objsize += size; } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#21 (text) ==== @@ -319,17 +319,44 @@ int size; } dbuf_dirty_range_t; +struct dbuf_array; +struct dmu_buf_impl; + /** * A callback function pointer which provides the callee, via dba.dba_private, - * a way to keep track of the state of an array of dbufs. See dmu_context_t. + * a way to keep track of the state of an array of dbufs. */ +typedef void(*dmu_callback_t)(struct dbuf_array *, struct dmu_buf_impl *); + +typedef struct dbuf_array { + + /** The set of dbufs in this array. */ + struct dmu_buf_impl **dba_dbp; + + /** The number of dbufs in the array. */ + size_t dba_count; + + /** The tag used for the array. */ + void *dba_tag; + + /** The callback to call if the conditions are met. */ + dmu_callback_t dba_cb; + + /** The dbuf states when a callback may be called. */ + int dba_states; + + /** Private data for the callback. */ + void *dba_private; + +} dbuf_array_t; + typedef struct dbuf_callback_node { /** This object's entry in the list in dmu_buf_impl_t. */ list_node_t dbc_link; - /** The DMU context this callback is associated with. */ - dmu_context_t *dmu_ctx; + /** The dbuf array this callback is associated with. */ + dbuf_array_t *dbc_dba; } dbuf_callback_node_t; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#5 (text) ==== @@ -211,34 +211,6 @@ void *db_data; /**< data in buffer */ } dmu_buf_t; -/** - * \brief These structures are for DMU consumers that want async callbacks. - */ -struct dmu_context; -typedef void (*dmu_callback_t)(struct dmu_context *, void *); - -typedef struct dmu_context { - - /** The set of buffers associated with this context. */ - struct dmu_buf **dbp; - - /** The number of buffers associated with this context. */ - size_t count; - - /** The tag used for this context. */ - void *tag; - - /** The callback to call if the conditions are met. */ - dmu_callback_t dmu_cb; - - /** The dbuf states when a callback may be called. */ - int states; - - /** Private data for the callback. 
*/ - void *dmu_cb_private; - -} dmu_context_t; - typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); /* @@ -365,19 +337,18 @@ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags, dmu_context_t *dmu_ctx_p); + void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx, dmu_context_t *dmu_ctx_p); + const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx, dmu_context_t *dmu_ctx_p); -int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, - dmu_context_t *dmu_ctx_p); + dmu_tx_t *tx); +int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, - dmu_tx_t *tx, dmu_context_t *dmu_ctx_p); + dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, - dmu_tx_t *tx, dmu_context_t *dmu_ctx_p); + dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, struct page *pp, dmu_tx_t *tx, dmu_context_t *dmu_ctx_p); + uint64_t size, struct page *pp, dmu_tx_t *tx); struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c#3 (text) ==== @@ -846,7 +846,7 @@ uint64_t object = 0; error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, - DMU_READ_PREFETCH, NULL); + DMU_READ_PREFETCH); if (error) return (error); if (object != 0) { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c#3 (text) ==== @@ -1109,7 +1109,7 @@ if (znode_acl.z_acl_extern_obj) { error = dmu_read(zp->z_zfsvfs->z_os, znode_acl.z_acl_extern_obj, 0, aclnode->z_size, - aclnode->z_acldata, DMU_READ_PREFETCH, NULL); + aclnode->z_acldata, DMU_READ_PREFETCH); } else { bcopy(znode_acl.z_ace_data, aclnode->z_acldata, aclnode->z_size); @@ -1273,8 +1273,7 @@ if (aclnode->z_ace_count == 0) continue; dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx, - NULL); + aclnode->z_size, aclnode->z_acldata, tx); off += aclnode->z_size; } } else { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c#3 (text) ==== @@ -132,7 +132,7 @@ packed = kmem_alloc(fuid_size, KM_SLEEP); VERIFY(dmu_read(os, fuid_obj, 0, - fuid_size, packed, DMU_READ_PREFETCH, NULL) == 0); + fuid_size, packed, DMU_READ_PREFETCH) == 0); VERIFY(nvlist_unpack(packed, fuid_size, &nvp, 0) == 0); VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, @@ -275,7 +275,7 @@ nvlist_free(nvp); zfsvfs->z_fuid_size = nvsize; dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, - zfsvfs->z_fuid_size, packed, tx, NULL); + zfsvfs->z_fuid_size, packed, tx); kmem_free(packed, zfsvfs->z_fuid_size); VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, FTAG, &db)); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c#3 (text) ==== @@ -494,14 +494,12 @@ itx = zil_itx_create(txtype, sizeof (*lr) + (write_state == WR_COPIED ? 
len : 0)); lr = (lr_write_t *)&itx->itx_lr; - if (write_state == WR_COPIED) { - if (dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, - lr + 1, DMU_READ_NO_PREFETCH, NULL) != 0) { - zil_itx_destroy(itx); - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - write_state = WR_NEED_COPY; - } + if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, + zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; } itx->itx_wr_state = write_state; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#5 (text) ==== @@ -408,10 +408,10 @@ va = zfs_map_page(pp, &sf); if (segflg == UIO_NOCOPY) { (void) dmu_write(os, oid, start+off, nbytes, - va+off, tx, NULL); + va+off, tx); } else { (void) dmu_read(os, oid, start+off, nbytes, - va+off, DMU_READ_PREFETCH, NULL); + va+off, DMU_READ_PREFETCH); } zfs_unmap_page(sf); VM_OBJECT_LOCK(obj); @@ -463,7 +463,7 @@ VM_OBJECT_UNLOCK(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, - DMU_READ_PREFETCH, NULL); + DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); @@ -533,7 +533,7 @@ page_unlock(pp); } else { VM_OBJECT_UNLOCK(obj); - error = dmu_read_uio(os, zp->z_id, uio, bytes, NULL); + error = dmu_read_uio(os, zp->z_id, uio, bytes); VM_OBJECT_LOCK(obj); } len -= bytes; @@ -674,7 +674,7 @@ if (vn_has_cached_data(vp)) error = mappedread(vp, nbytes, uio); else - error = dmu_read_uio(os, zp->z_id, uio, nbytes, NULL); + error = dmu_read_uio(os, zp->z_id, uio, nbytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -942,7 +942,7 @@ if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes, tx, NULL); + uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; @@ -957,7 +957,7 @@ aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, - aiov->iov_len, aiov->iov_base, tx, NULL); + aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { @@ -1147,7 +1147,7 @@ error = ENOENT; } else { error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH, NULL); + DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ @@ -4329,7 +4329,7 @@ if (zp->z_blksz <= PAGESIZE) { caddr_t va = zfs_map_page(pp, S_READ); ASSERT3U(len, <=, PAGESIZE); - dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx, NULL); + dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); zfs_unmap_page(pp, va); } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); @@ -4604,7 +4604,7 @@ ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH, NULL); + DMU_READ_PREFETCH); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ @@ -5497,7 +5497,7 @@ VM_OBJECT_UNLOCK(object); va = zfs_map_page(mreq, &sf); error = dmu_read(os, zp->z_id, IDX_TO_OFF(mreq->pindex), - size, va, DMU_READ_PREFETCH, NULL); + size, va, DMU_READ_PREFETCH); if (size != PAGE_SIZE) bzero(va + size, PAGE_SIZE - size); zfs_unmap_page(sf); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#5 (text) ==== @@ -415,7 +415,7 @@ if (error) { dmu_tx_abort(tx); } 
else { - dmu_write(os, ZVOL_OBJ, offset, length, data, tx, NULL); + dmu_write(os, ZVOL_OBJ, offset, length, data, tx); dmu_tx_commit(tx); } @@ -1009,7 +1009,7 @@ */ if (buf != NULL) { /* immediate write */ error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH, NULL); + DMU_READ_NO_PREFETCH); } else { size = zv->zv_volblocksize; offset = P2ALIGN(offset, size); @@ -1084,14 +1084,12 @@ itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; - if (write_state == WR_COPIED) { - if (dmu_read(zv->zv_objset, ZVOL_OBJ, off, len, lr + 1, - DMU_READ_NO_PREFETCH, NULL) != 0) { - zil_itx_destroy(itx); - itx = zil_itx_create(TX_WRITE, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - write_state = WR_NEED_COPY; - } + if (write_state == WR_COPIED && dmu_read(zv->zv_objset, + ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; } itx->itx_wr_state = write_state; @@ -1246,7 +1244,7 @@ size_t size = MIN(resid, zvol_maxphys); if (doread) { error = dmu_read(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH, NULL); + DMU_READ_PREFETCH); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); @@ -1254,8 +1252,7 @@ if (error) { dmu_tx_abort(tx); } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx, - NULL); + dmu_write(os, ZVOL_OBJ, off, size, addr, tx); zvol_log_write(zv, tx, off, size, sync); dmu_tx_commit(tx); } @@ -1365,8 +1362,7 @@ if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; - error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, - NULL); + error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -1450,8 +1446,7 @@ dmu_assign_arcbuf(zv->zv_dbuf, off, buf, tx); error = 0; } else { - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx, - NULL); + error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); } if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); @@ -1468,7 +1463,7 @@ dmu_tx_abort(tx); break; } - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx, NULL); + error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); Change 521725 by willa@willa_repo on 2012/01/19 14:44:09 Revert change 521474. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#19 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#22 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#6 edit Differences ... 
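For orientation, the numbufs/dbpp idiom this revert restores is the classic DMU consumer pattern sketched below. This is a hypothetical reader built on dmu_buf_hold_array_by_bonus(), which this change re-exports in dmu.h; error handling is abbreviated and the function name is illustrative only.

	static int
	read_range_by_bonus(dmu_buf_t *bonus, uint64_t offset, uint64_t size,
	    char *buf)
	{
		dmu_buf_t **dbp;
		int numbufs, i, err;

		/* Hold every dbuf backing [offset, offset + size). */
		err = dmu_buf_hold_array_by_bonus(bonus, offset, size, TRUE,
		    FTAG, &numbufs, &dbp);
		if (err)
			return (err);

		for (i = 0; i < numbufs; i++) {
			dmu_buf_t *db = dbp[i];
			int bufoff = offset - db->db_offset;
			int tocpy = (int)MIN(db->db_size - bufoff, size);

			bcopy((char *)db->db_data + bufoff, buf, tocpy);
			offset += tocpy;
			size -= tocpy;
			buf += tocpy;
		}

		/* Holds taken as an array must be released as an array. */
		dmu_buf_rele_array(dbp, numbufs, FTAG);
		return (0);
	}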
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#19 (text) ==== @@ -44,10 +44,6 @@ #include #endif -static int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, - uint64_t length, int read, void *tag, dbuf_array_t **dbap); -static void dmu_buf_rele_array(dbuf_array_t *dba); - const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "unallocated" }, { zap_byteswap, TRUE, "object directory" }, @@ -374,11 +370,10 @@ */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - int read, void *tag, dbuf_array_t **dbap, uint32_t flags) + int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dsl_pool_t *dp = NULL; - dmu_buf_impl_t **dbp; - dbuf_array_t *dba; + dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; @@ -409,11 +404,7 @@ } nblks = 1; } - dba = kmem_zalloc(sizeof(dbuf_array_t), KM_SLEEP); - dba->dba_count = nblks; - dba->dba_tag = tag; - dbp = kmem_zalloc(sizeof (dmu_buf_impl_t *) * nblks, KM_SLEEP); - dba->dba_dbp = dbp; + dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); if (dn->dn_objset->os_dsl_dataset) dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; @@ -425,7 +416,7 @@ dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); return (EIO); } @@ -436,7 +427,7 @@ else curthread->td_ru.ru_oublock++; #endif - dbp[i] = db; + dbp[i] = &db->db; } rw_exit(&dn->dn_struct_rwlock); @@ -446,14 +437,14 @@ if (dp && dsl_pool_sync_context(dp)) dp->dp_read_overhead += gethrtime() - start; if (err) { - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbp[i]; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state & (DB_READ|DB_FILL)) cv_wait(&db->db_changed, &db->db_mtx); @@ -461,13 +452,14 @@ err = EIO; mutex_exit(&db->db_mtx); if (err) { - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } - *dbap = dba; + *numbufsp = nblks; + *dbpp = dbp; return (0); } @@ -477,7 +469,7 @@ */ static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, dbuf_array_t **dbap) + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; @@ -487,16 +479,16 @@ return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - dbap, DMU_READ_PREFETCH); + numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } -static int +int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, - uint64_t length, int read, void *tag, dbuf_array_t **dbap) + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; @@ -505,7 +497,7 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - dbap, DMU_READ_PREFETCH); + numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); @@ -516,19 +508,21 @@ * hold on the array of buffers MUST be released with dmu_buf_rele_array. You * can NOT release the hold on each buffer individually with dmu_buf_rele. 
*/ -static void -dmu_buf_rele_array(dbuf_array_t *dba) +void +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; + dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; - for (i = 0; i < dba->dba_count; i++) { - if (dba->dba_dbp[i]) - dbuf_rele(dba->dba_dbp[i], dba->dba_tag); + if (numbufs == 0) + return; + + for (i = 0; i < numbufs; i++) { + if (dbp[i]) + dbuf_rele(dbp[i], tag); } - if (dba->dba_count > 0) - kmem_free(dba->dba_dbp, - sizeof(dmu_buf_impl_t *) * dba->dba_count); - kmem_free(dba, sizeof(dbuf_array_t)); + + kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /** \brief Asynchronously try to read in the data. @@ -756,8 +750,8 @@ void *buf, uint32_t flags) { dnode_t *dn; - dbuf_array_t *dba; - int err; + dmu_buf_t **dbp; + int numbufs, err; err = dnode_hold(os, object, FTAG, &dn); if (err) @@ -784,14 +778,14 @@ * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &dba, flags); + TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; - for (i = 0; i < dba->dba_count; i++) { + for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; - dmu_buf_t *db = &dba->dba_dbp[i]->db; + dmu_buf_t *db = dbp[i]; ASSERT(size > 0); @@ -804,7 +798,7 @@ size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dbp, numbufs, FTAG); } dnode_rele(dn, FTAG); return (err); @@ -814,27 +808,26 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { - dbuf_array_t *dba; - int i; + dmu_buf_t **dbp; + int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &dba)); + FALSE, FTAG, &numbufs, &dbp)); - for (i = 0; i < dba->dba_count; i++) { + for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; - dmu_buf_t *db = &dba->dba_dbp[i]->db; + dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == dba->dba_count - 1 || - tocpy == db->db_size); + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); @@ -849,28 +842,28 @@ size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { - dbuf_array_t *dba; - int i; + dmu_buf_t **dbp; + int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &dba)); + FALSE, FTAG, &numbufs, &dbp)); - for (i = 0; i < dba->dba_count; i++) { - dmu_buf_t *db = &dba->dba_dbp[i]->db; + for (i = 0; i < numbufs; i++) { + dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dbp, numbufs, FTAG); } /** @@ -1000,8 +993,8 @@ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { - dbuf_array_t *dba; - int i, err; + dmu_buf_t **dbp; + int numbufs, i, err; xuio_t *xuio = NULL; /* @@ -1009,7 +1002,7 @@ * to be reading in parallel. 
*/ err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, - &dba); + &numbufs, &dbp); if (err) return (err); @@ -1018,10 +1011,10 @@ xuio = (xuio_t *)uio; #endif - for (i = 0; i < dba->dba_count; i++) { + for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; - dmu_buf_t *db = &dba->dba_dbp[i]->db; + dmu_buf_t *db = dbp[i]; ASSERT(size > 0); @@ -1051,7 +1044,7 @@ size -= tocpy; } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } @@ -1059,26 +1052,27 @@ static int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { - dbuf_array_t *dba; + dmu_buf_t **dbp; + int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - FALSE, FTAG, &dba, DMU_READ_PREFETCH); + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); - for (i = 0; i < dba->dba_count; i++) { + for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; - dmu_buf_t *db = &dba->dba_dbp[i]->db; + dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == dba->dba_count-1 || tocpy == db->db_size); + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); @@ -1102,7 +1096,7 @@ size -= tocpy; } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } @@ -1151,22 +1145,22 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) { - dbuf_array_t *dba; - int i; + dmu_buf_t **dbp; + int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &dba); + FALSE, FTAG, &numbufs, &dbp); if (err) return (err); - for (i = 0; i < dba->dba_count; i++) { + for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; - dmu_buf_t *db = dba->dba_dbp[i]->db; + dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT(size > 0); @@ -1175,7 +1169,7 @@ bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - ASSERT(i == 0 || i == dba->dba_count-1 || tocpy == db->db_size); + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); @@ -1197,7 +1191,7 @@ offset += tocpy; size -= tocpy; } - dmu_buf_rele_array(dba); + dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } #endif /* sun */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#22 (text) ==== @@ -336,9 +336,6 @@ /** The number of dbufs in the array. */ size_t dba_count; - /** The tag used for the array. */ - void *dba_tag; - /** The callback to call if the conditions are met. */ dmu_callback_t dba_cb; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#6 (text) ==== @@ -283,6 +283,10 @@ void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); +int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); +void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); + void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, Change 521734 by willa@willa_repo on 2012/01/19 16:34:55 Reinstate dmu_context_t with a smaller change. 
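The core of the reinstated dmu_context_t is a single-allocation layout: the dmu_buf_t pointers live in a flexible dbp[] array at the tail of the structure, so one kmem_zalloc() covers both. Condensed into a hypothetical helper for illustration (the change itself open-codes this inside dmu_buf_hold_array_by_dnode_cb(), and tears it down with a matching kmem_free() of the same combined size):

	static dmu_context_t *
	dmu_ctx_alloc(uint64_t nblks, void *tag)
	{
		dmu_context_t *dmu_ctx;

		/*
		 * One allocation covers the context and the dbuf pointer
		 * array at its tail (the dbp[0] flexible member).
		 */
		dmu_ctx = kmem_zalloc(sizeof (dmu_context_t) +
		    nblks * sizeof (dmu_buf_t *), KM_SLEEP);
		dmu_ctx->count = nblks;
		dmu_ctx->tag = tag;
		/* dmu_ctx->dbp[0 .. nblks - 1] is now valid storage. */

		return (dmu_ctx);
	}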
Instead of changing all of the DMU interfaces at once, simply clone the case we are targeting, specifically dmu_read(). Clone dmu_buf_hold_array_by_dnode(), dmu_buf_rele_array(), & dmu_read(), modified to take dmu_context_t, as done in previous checkins. Stub dmu_read() to call dmu_read_cb() with a NULL callback & private. This allows shifting only zvol_strategy() to the new function without touching other consumers. dmu_buf_hold_array_by_dnode_cb(): - Allocate the meta-structure dmu_context_t so that the buffers are at the bottom of the structure, which allows performing only one malloc per call. - If a callback is specified, call zio_nowait() to tell ZIO to execute the asynchronous I/O's passed in. Otherwise, perform zio_wait() and cv_wait() as per usual. - In order to provide correct accounting for how much a particular array of buffers handles, calculate the amount that will be used in the copy later, here. - Move out the actual copy function to dmu_ctx_read_buf(), which is a dmu_callback_t that takes a dmu_context_t and a char * buf. I don't think it will be used as a dmu_cb by zvol, but will probably be called by another dmu_cb to perform the actual copy once all buffers have entered the CACHED state. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#57 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#20 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#23 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#7 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#57 (text) ==== @@ -2707,8 +2707,8 @@ for (dbc = list_head(&db->db_callbacks); db != NULL; dbc = list_next(&db->db_callbacks, dbc)) { - if (dbc->dbc_dba->dba_states & db->db_state) - dbc->dbc_dba->dba_cb(dbc->dbc_dba, db); + if (dbc->dmu_ctx->db_states & db->db_state) + dbc->dmu_ctx->dmu_cb(dbc->dmu_ctx, db); /* XXX what else should we do? */ } } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#20 (text) ==== @@ -745,12 +745,176 @@ return (0); } +/* + * DMU Context based functions. + */ + +void +dmu_buf_rele_array_cb(dmu_context_t *dmu_ctx) +{ + int i; + + for (i = 0; i < dmu_ctx->count; i++) { + /* May be called from error case, where ptr may be NULL. 
*/ + if (dmu_ctx->dbp[i]) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dmu_ctx->dbp[i]; + dbuf_rele(db, dmu_ctx->tag); + } + } + + kmem_free(dmu_ctx, + sizeof(dmu_context_t) + dmu_ctx->count * sizeof (dmu_buf_t *)); +} int -dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags) +dmu_buf_hold_array_by_dnode_cb(dnode_t *dn, uint64_t offset, uint64_t length, + int read, void *tag, dmu_callback_t dmu_cb, void *priv, + dmu_context_t **dmu_ctx_p, uint32_t flags) +{ + dsl_pool_t *dp = NULL; + dmu_context_t *dmu_ctx; + uint64_t blkid, nblks, i; + uint32_t dbuf_flags; + int err; + zio_t *zio; + hrtime_t start; + + ASSERT(length <= DMU_MAX_ACCESS); + + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; + if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) + dbuf_flags |= DB_RF_NOPREFETCH; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset+length, 1ULL<> blkshift; + } else { + if (offset + length > dn->dn_datablksz) { + zfs_panic_recover("zfs: accessing past end of object " + "%llx/%llx (size=%u access=%llu+%llu)", + (longlong_t)dn->dn_objset-> + os_dsl_dataset->ds_object, + (longlong_t)dn->dn_object, dn->dn_datablksz, + (longlong_t)offset, (longlong_t)length); + rw_exit(&dn->dn_struct_rwlock); + return (EIO); + } + nblks = 1; + } + dmu_ctx = kmem_zalloc( + sizeof(dmu_context_t) + nblks * sizeof(dmu_buf_t *), KM_SLEEP); + dmu_ctx->dmu_cb = dmu_cb; + dmu_ctx->dmu_cb_private = priv; + dmu_ctx->tag = tag; + dmu_ctx->count = nblks; + dmu_ctx->db_states = DB_CACHED; /* XXX do something more intelligent */ + dmu_ctx->offset = offset; + + if (dn->dn_objset->os_dsl_dataset) + dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; + if (dp && dsl_pool_sync_context(dp)) + start = gethrtime(); + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + blkid = dbuf_whichblock(dn, offset); + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); + int bufoff, bufsiz; + + if (db == NULL) { + rw_exit(&dn->dn_struct_rwlock); + dmu_buf_rele_array_cb(dmu_ctx); + zio_nowait(zio); + return (EIO); + } + /* Calculate the amount of data this buffer contributes. */ + bufoff = dmu_ctx->offset - db->db.db_offset; + bufsiz = (int)MIN(db->db.db_size - bufoff, length); + dmu_ctx->size += bufsiz; + ASSERT(dmu_ctx->size <= length); + /* initiate async i/o */ + if (read) + (void) dbuf_read(db, zio, dbuf_flags); +#ifdef _KERNEL + else + curthread->td_ru.ru_oublock++; +#endif + dmu_ctx->dbp[i] = &db->db; + } + rw_exit(&dn->dn_struct_rwlock); + + /* + * If a callback is specified, issue the I/O's without waiting. + * The callback will be responsible for cleaning up. 
+ */ + if (dmu_cb != NULL) + zio_nowait(zio); + else { + /* wait for async i/o */ + err = zio_wait(zio); + /* track read overhead when we are in sync context */ + if (dp && dsl_pool_sync_context(dp)) + dp->dp_read_overhead += gethrtime() - start; + if (err) { + dmu_buf_rele_array_cb(dmu_ctx); + return (err); + } + + /* wait for other io to complete */ + if (read) { + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = + (dmu_buf_impl_t *)dmu_ctx->dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state & (DB_READ|DB_FILL)) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = EIO; + mutex_exit(&db->db_mtx); + if (err) { + dmu_buf_rele_array_cb(dmu_ctx); + return (err); + } + } + } + } + + *dmu_ctx_p = dmu_ctx; + return (0); +} +void +dmu_ctx_read_buf(dmu_context_t *dmu_ctx, void *priv) +{ + char *buf = priv; + uint64_t size = dmu_ctx->size; + uint64_t offset = dmu_ctx->offset; + int i; + + for (i = 0; i < dmu_ctx->count; i++) { + int bufoff; + int tocpy; + dmu_buf_t *db = dmu_ctx->dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + bcopy((char *)db->db_data + bufoff, buf, tocpy); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array_cb(dmu_ctx); +} + +int +dmu_read_cb(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags, dmu_callback_t dmu_cb, void *priv) { dnode_t *dn; - dmu_buf_t **dbp; + dmu_context_t *dmu_ctx; int numbufs, err; err = dnode_hold(os, object, FTAG, &dn); @@ -768,7 +932,6 @@ bzero((char *)buf + newsz, size - newsz); size = newsz; } - while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; @@ -777,33 +940,29 @@ * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ - err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp, flags); + err = dmu_buf_hold_array_by_dnode_cb(dn, offset, mylen, + TRUE, FTAG, dmu_cb, priv, &dmu_ctx, flags); if (err) break; - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; + if (dmu_cb == NULL) + dmu_ctx_read_buf(dmu_ctx, buf); - ASSERT(size > 0); + offset += dmu_ctx->size; + size -= dmu_ctx->size; + } - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - bcopy((char *)db->db_data + bufoff, buf, tocpy); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - } dnode_rele(dn, FTAG); return (err); } +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags) +{ + return (dmu_read_cb(os, object, offset, size, buf, flags, NULL, NULL)); +} + void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#23 (text) ==== @@ -322,38 +322,13 @@ struct dbuf_array; struct dmu_buf_impl; -/** - * A callback function pointer which provides the callee, via dba.dba_private, - * a way to keep track of the state of an array of dbufs. - */ -typedef void(*dmu_callback_t)(struct dbuf_array *, struct dmu_buf_impl *); - -typedef struct dbuf_array { - - /** The set of dbufs in this array. */ - struct dmu_buf_impl **dba_dbp; - - /** The number of dbufs in the array. */ - size_t dba_count; - - /** The callback to call if the conditions are met. 
*/ - dmu_callback_t dba_cb; - - /** The dbuf states when a callback may be called. */ - int dba_states; - - /** Private data for the callback. */ - void *dba_private; - -} dbuf_array_t; - typedef struct dbuf_callback_node { /** This object's entry in the list in dmu_buf_impl_t. */ list_node_t dbc_link; - /** The dbuf array this callback is associated with. */ - dbuf_array_t *dbc_dba; + /** The DMU context this callback is associated with. */ + dmu_context_t *dmu_ctx; } dbuf_callback_node_t; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#7 (text) ==== @@ -211,6 +211,39 @@ void *db_data; /**< data in buffer */ } dmu_buf_t; +/** + * \brief These structures are for DMU consumers that want async + * callbacks. + */ +struct dmu_context; +typedef void (*dmu_callback_t)(struct dmu_context *, void *); + +typedef struct dmu_context { + + /** The number of buffers associated with this context. */ + size_t count; + + /** The tag used for this context. */ + void *tag; + + /** The callback to call if the conditions are met. */ + dmu_callback_t dmu_cb; + + /** The dbuf states when a callback may be called. */ + int db_states; + + /** The size and offset of this array of buffers. */ + uint64_t offset; + uint64_t size; + + /** Private data for the callback. */ + void *dmu_cb_private; + + /** The set of buffers associated with this context. */ + struct dmu_buf *dbp[0]; + +} dmu_context_t; + typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); /* @@ -342,6 +375,8 @@ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); +int dmu_read_cb(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags, dmu_callback_t dmu_cb, void *priv); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, Change 521736 by willa@willa_repo on 2012/01/19 16:48:04 Add & remove the DMU context from the dbuf's callback list. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#21 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#21 (text) ==== @@ -753,12 +753,19 @@ dmu_buf_rele_array_cb(dmu_context_t *dmu_ctx) { int i; + dbuf_callback_node_t *dbc; for (i = 0; i < dmu_ctx->count; i++) { /* May be called from error case, where ptr may be NULL. */ if (dmu_ctx->dbp[i]) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dmu_ctx->dbp[i]; dbuf_rele(db, dmu_ctx->tag); + /* Remove us from the dbuf's callback list. */ + if (dmu_ctx->dmu_cb != NULL) { + mutex_enter(&db->db_mtx); + list_remove(&db->db_callbacks, dmu_ctx); + mutex_exit(&db->db_mtx); + } } } @@ -832,6 +839,13 @@ bufsiz = (int)MIN(db->db.db_size - bufoff, length); dmu_ctx->size += bufsiz; ASSERT(dmu_ctx->size <= length); + /* Associate the dbuf with this callback if specified. */ + if (dmu_cb != NULL) { + mutex_enter(&db->db_mtx); + /* FIFO behavior likely: insert at the end. */ + list_insert_tail(&db->db_callbacks, dmu_ctx); + mutex_exit(&db->db_mtx); + } /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); Change 522343 by willa@willa_repo on 2012/01/24 13:45:38 Extend FreeBSD's taskqueue(9) interface to allow callbacks. 
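From a consumer's point of view, the new hook is used roughly as in the sketch below (a hypothetical queue; taskqueue_set_callback() and the INIT/SHUTDOWN callback types are what this change introduces):

	static struct taskqueue *my_tq;

	static void
	my_tq_thread_init(void *arg __unused)
	{
		/* Runs once in each queue thread before it services tasks. */
	}

	static void
	my_tq_thread_fini(void *arg __unused)
	{
		/* Runs in each queue thread just before it terminates. */
	}

	static void
	my_tq_setup(void)
	{
		my_tq = taskqueue_create("my_tq", M_WAITOK,
		    taskqueue_thread_enqueue, &my_tq);
		/*
		 * Set the callbacks before starting the threads so the
		 * INIT hook fires in every one of them.
		 */
		taskqueue_set_callback(my_tq, TASKQUEUE_CALLBACK_TYPE_INIT,
		    my_tq_thread_init, NULL);
		taskqueue_set_callback(my_tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN,
		    my_tq_thread_fini, NULL);
		(void) taskqueue_start_threads(&my_tq, 4, PWAIT, "my_tq");
	}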
This works by calling taskqueue_set_callback() on a queue after creating it, passing in the function pointer & data pointer desired. Two callback types are supported: INIT and SHUTDOWN. If defined, they will be called by the thread function at startup and shutdown. The intended target for this change is to allow ZIO task queues in ZFS to set up thread-local storage for DMU callback state data, prior to any tasks being executed. In this way, a ZIO executor thread can tell whether it needs to queue asynchronous I/Os until later by checking its TLS state. Affected files ... ... //depot/branches/redline/projects/cow/sys/kern/subr_taskqueue.c#3 edit ... //depot/branches/redline/projects/cow/sys/sys/taskqueue.h#3 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/kern/subr_taskqueue.c#3 (text) ==== @@ -63,6 +63,9 @@ int tq_spin; int tq_flags; int tq_callouts; + + taskqueue_callback_fn tq_callbacks[TASKQUEUE_CALLBACK_TYPE_MAX-1]; + void *tq_cb_contexts[TASKQUEUE_CALLBACK_TYPE_MAX-1]; }; #define TQ_FLAGS_ACTIVE (1 << 0) @@ -87,6 +90,13 @@ mtx_unlock(&(tq)->tq_mutex); \ } while (0) +#define TASKQUEUE_RUN_CALLBACK(tq, cb_type) \ + do { \ + if ((tq)->tq_callbacks[cb_type] != NULL) \ + (tq)->tq_callbacks[cb_type]( \ + (tq)->tq_cb_contexts[cb_type]); \ + } while (0) + void _timeout_task_init(struct taskqueue *queue, struct timeout_task *timeout_task, int priority, task_fn_t func, void *context) @@ -137,6 +147,19 @@ MTX_DEF, "taskqueue"); } +void +taskqueue_set_callback(struct taskqueue *queue, + enum taskqueue_callback_type cb_type, taskqueue_callback_fn callback, + void *context) +{ + + if (cb_type >= TASKQUEUE_CALLBACK_TYPE_MAX) + panic("Newer taskqueue consumer using old taskqueue API"); + + queue->tq_callbacks[cb_type] = callback; + queue->tq_cb_contexts[cb_type] = context; +} + /* * Signal a taskqueue thread to terminate. */ @@ -493,6 +516,8 @@ tqp = arg; tq = *tqp; TQ_LOCK(tq); + + TASKQUEUE_RUN_CALLBACK(tq, TASKQUEUE_CALLBACK_TYPE_INIT); while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) { taskqueue_run_locked(tq); /* @@ -506,6 +531,8 @@ } taskqueue_run_locked(tq); + TASKQUEUE_RUN_CALLBACK(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); + /* rendezvous with thread that asked us to terminate */ tq->tq_tcount--; wakeup_one(tq->tq_threads); ==== //depot/branches/redline/projects/cow/sys/sys/taskqueue.h#3 (text) ==== @@ -47,6 +47,14 @@ int f; }; +enum taskqueue_callback_type { + TASKQUEUE_CALLBACK_TYPE_INIT, + TASKQUEUE_CALLBACK_TYPE_SHUTDOWN, + TASKQUEUE_CALLBACK_TYPE_MAX, +}; + +typedef void (*taskqueue_callback_fn)(void *context); + /* * A notification callback function which is called from * taskqueue_enqueue(). The context argument is given in the call to @@ -76,6 +84,9 @@ void taskqueue_block(struct taskqueue *queue); void taskqueue_unblock(struct taskqueue *queue); int taskqueue_member(struct taskqueue *queue, struct thread *td); +void taskqueue_set_callback(struct taskqueue *queue, + enum taskqueue_callback_type cb_type, + taskqueue_callback_fn callback, void *context); #define TASK_INITIALIZER(priority, func, context) \ { .ta_pending = 0, \ Change 522351 by willa@willa_repo on 2012/01/24 14:28:55 Sandbox the async I/O stack with most parts implemented. The async mode is checked in disabled; the old case still passes. This change makes use of thread-local storage (TLS/TSD/OSD, depending on who you ask) to avoid the need for a mutex in dmu_ctx, separate I/O processing threads for the contexts etc. 
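Taken together, the intended flow is: spa_zio_thread_init(), installed as the taskq INIT hook, gives each ZIO thread a TLS done-list; an initiator such as zvol_strategy() issues dmu_read_cb() and returns; the final hold released on a context queues it on a ZIO thread's list, which the tail of zio_execute() drains via dmu_buf_ctx_process(). A minimal sketch of the zvol side, reconstructed from the per-file notes below (zvol_start_read() is a hypothetical wrapper; in this change the dmu_read_cb() call is added commented out):

	/* Completion side: reconstructed from the zvol notes below. */
	static void
	zvol_dmu_cb(dmu_context_t *dmu_ctx)
	{
		struct bio *bp = dmu_ctx->dmu_cb_private;

		/* The DMU did the work; any dbuf error surfaces as EIO. */
		g_io_deliver(bp, dmu_ctx->err ? EIO : 0);
	}

	/* Initiator side: a hypothetical zvol_strategy() helper. */
	static int
	zvol_start_read(objset_t *os, struct bio *bp)
	{
		/*
		 * Fire the read and return; the last hold released queues
		 * the context on a ZIO thread's TLS list, and the tail of
		 * zio_execute() delivers it via dmu_buf_ctx_process().
		 */
		return (dmu_read_cb(os, ZVOL_OBJ, bp->bio_offset,
		    bp->bio_length, bp->bio_data, DMU_READ_PREFETCH,
		    zvol_dmu_cb, bp));
	}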
sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c:
sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h:
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c:
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c:
- Build on the taskqueue(9) changes to allow ZIO to specify a context
  constructor to the task queue via the OpenSolaris taskq shim.
- Register a thread-local storage key for async I/Os. Threads that use
  this key will store a malloc'd state structure there. Register a
  destructor alongside and have the TLS mechanism call it rather than
  having the thread call it.
- Upon creation of ZIO threads by the SPA, pass in the constructor,
  which simply calls dmu_create_thread_context().

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h:
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h:
- Replace dbuf_callback_node_t with dmu_context_node_t, which is simply
  a linked list of DMU contexts. This structure will also be used by
  dmu_cb_state_t, the TLS structure.
- Extend dmu_context_t to contain:
  - "holds": The number of holds remaining on the context.
  - "flags": Currently, whether the DMU request was a read.
  - "err": The number of errors that occurred.
  - "data_buf": Pointer to the initiator's data buffer for the context.
- Export a few new DMU APIs to the world:
  - dmu_buf_ctx_rele(): Release a hold on a context.
  - dmu_buf_ctx_process(): Process the current thread's pending async
    I/O list, if initialized and non-empty.
  - dmu_create_thread_context(): Create the current thread's pending
    async I/O list.
  - dmu_destroy_thread_context(): Destroy the current thread's pending
    async I/O list.

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c:
- Move dbuf processing to dbuf.c:dbuf_process_dmu_contexts(). While
  holding a dbuf's mutex, if its new state matches, remove the DMU
  context from the dbuf's list and release that context's hold.
- Propagate error status up from DBUF_STATE_CHANGE() calls.
- Rename dbuf_callback_node_t to dmu_context_node_t.
- Get rid of dbuf_run_callbacks(), which isn't how this will work.

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c:
- Modify zio_execute(): when a ZIO thread has finished executing its
  work, call dmu_buf_ctx_process().

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c:
- Implement zvol_dmu_cb(), the callback that will be used by ZVOL to
  handle completed asynchronous reads. Because 99.9% of the work is
  done by the DMU layer, all it does is call g_io_deliver(). If any
  error is detected, it filters up the stack as EIO.
- Add, but comment out, the replacement of dmu_read() with
  dmu_read_cb().

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c:
- Add a temporary sysctl counter to track the number of contexts in
  flight.
- Implement dmu_{create,destroy}_thread_context(), which set up and
  tear down the current thread's async I/O done list.
- Implement dmu_buf_ctx_process(): Process the current thread's async
  I/O done list, if one exists.
- Change dmu_buf_rele_array_cb() so that it no longer removes the DMU
  context from each dbuf; that is now the dbuf's responsibility.
- Implement dmu_ctx_read_buf(): The actual "read all dbufs into the
  initiator's buffer" function, which used to be in dmu_read(). This
  function also calls dmu_buf_rele_array_cb() to release the context.
- Implement dmu_buf_ctx_dispatch(): Dispatch a completed context.
- Implement dmu_buf_ctx_rele(): Release a context for a given dbuf.
Also called by dmu_read_cb() to release the initiator's hold, and if possible, call back immediately in the cached case. - Improve dmu_buf_hold_array_by_dnode_cb(): - Distinguish between offset and size available in order to perform the buffer accounting correctly. - Initialize a context's holds as being the number of buffers + 1. This allows us to potentially immediately dispatch it when dmu_read_cb() exits, if everything is already cached. - If a dbuf is already CACHED, just release its hold, instead of bothering to add the context, call back into DMU, cleanup etc. - Sync dmu_read_cb() with the API changes. Fix a bug where its output pointer wasn't getting updated after each hold_array call. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c#2 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#60 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#23 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#7 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#25 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#10 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#7 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#8 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h#2 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c#2 (text) ==== @@ -61,9 +61,10 @@ } SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini, NULL); -taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused, - int maxalloc __unused, uint_t flags) +static taskq_t * +taskq_create_with_init(const char *name, int nthreads, pri_t pri, + int minalloc __unused, int maxalloc __unused, uint_t flags, + taskq_callback_fn ctor, taskq_callback_fn dtor) { taskq_t *tq; @@ -73,17 +74,34 @@ tq = kmem_alloc(sizeof(*tq), KM_SLEEP); tq->tq_queue = taskqueue_create(name, M_WAITOK, taskqueue_thread_enqueue, &tq->tq_queue); + if (ctor != NULL) + taskqueue_set_callback(tq->tq_queue, + TASKQUEUE_CALLBACK_TYPE_INIT, ctor, NULL); + if (dtor != NULL) + taskqueue_set_callback(tq->tq_queue, + TASKQUEUE_CALLBACK_TYPE_SHUTDOWN, dtor, NULL); (void) taskqueue_start_threads(&tq->tq_queue, nthreads, pri, "%s", name); return ((taskq_t *)tq); } taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused, + int maxalloc __unused, uint_t flags) +{ + + return (taskq_create_with_init(name, nthreads, pri, minalloc, maxalloc, + flags, NULL, NULL)); +} + +taskq_t * taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc, - int maxalloc, proc_t *proc __unused, uint_t flags) + int maxalloc, proc_t *proc __unused, uint_t flags, taskq_callback_fn ctor, + taskq_callback_fn dtor) { - return (taskq_create(name, nthreads, pri, minalloc, maxalloc, flags)); + return (taskq_create_with_init(name, nthreads, pri, minalloc, maxalloc, + flags, ctor, dtor)); } void ==== 
//depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#60 (text) ==== @@ -114,16 +114,28 @@ return (crc); } -static void dbuf_run_callbacks(dmu_buf_impl_t *db); +static void +dbuf_process_dmu_contexts(dmu_buf_impl_t *db, boolean_t err) +{ + dmu_context_node_t *dcn, *next; + + for (dcn = list_head(&db->db_callbacks); dcn != NULL; dcn = next) { + next = list_next(&db->db_callbacks, dcn); + if (dcn->dmu_ctx->db_states & db->db_state) { + list_remove(&db->db_callbacks, dcn); + dmu_buf_ctx_rele(dcn->dmu_ctx, &db->db, err); + } + } +} -#define DBUF_STATE_CHANGE_COMMON(db, op, state, why) \ +#define DBUF_STATE_CHANGE_COMMON(db, op, state, err, why) \ (db)->db_state op state; \ if (!list_is_empty(&(db)->db_callbacks)) \ - dbuf_run_callbacks(db) + dbuf_process_dmu_contexts(db, err) #ifdef ZFS_DEBUG -#define DBUF_STATE_CHANGE(db, op, state, why) do { \ - DBUF_STATE_CHANGE_COMMON(db, op, state, why); \ +#define DBUF_STATE_CHANGE(db, op, state, err, why) do { \ + DBUF_STATE_CHANGE_COMMON(db, op, state, err, why); \ if (zfs_flags & ZFS_DEBUG_DBUF_STATE) { \ uint64_t __db_obj = (db)->db.db_object; \ char __db_buf[32]; \ @@ -140,8 +152,8 @@ } \ } while(0) #else -#define DBUF_STATE_CHANGE(db, op, state, why) do { \ - DBUF_STATE_CHANGE_COMMON(db, op, state, why); \ +#define DBUF_STATE_CHANGE(db, op, state, err, why) do { \ + DBUF_STATE_CHANGE_COMMON(db, op, state, err, why); \ } while(0) #endif @@ -506,7 +518,8 @@ dbuf_evict_user(db); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "set data"); + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, B_FALSE, + "set data"); } } @@ -734,7 +747,7 @@ * to cached. */ ASSERT(db->db_buf != NULL); - DBUF_STATE_CHANGE(db, =, DB_CACHED, + DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, "resolve of records in READ state"); } @@ -755,7 +768,7 @@ * read and transition to DB_CACHED. */ dbuf_set_data(db, buf); - DBUF_STATE_CHANGE(db, =, DB_CACHED, + DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, "read completed with no dirty records"); } else { /* @@ -810,7 +823,8 @@ } else { ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "read failed"); + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, + B_TRUE, "read failed"); } VERIFY(arc_buf_remove_ref(buf, db) == 1); } @@ -842,7 +856,7 @@ if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); dbuf_update_data(db); - DBUF_STATE_CHANGE(db, =, DB_CACHED, "bonus buffer filled"); + DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, "bonus buffer filled"); return (TRUE); } @@ -882,7 +896,8 @@ buf = arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, DBUF_GET_BUFC_TYPE(db)); bzero(buf->b_data, db->db.db_size); - DBUF_STATE_CHANGE(db, =, DB_READ, "hole read satisfied"); + DBUF_STATE_CHANGE(db, =, DB_READ, B_FALSE, + "hole read satisfied"); dbuf_read_complete(db, buf); return (TRUE); } @@ -953,7 +968,7 @@ DB_DNODE_EXIT(db); - DBUF_STATE_CHANGE(db, =, DB_READ, "read issued"); + DBUF_STATE_CHANGE(db, =, DB_READ, B_FALSE, "read issued"); mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) @@ -1368,7 +1383,7 @@ /* Now clear the contents. 
*/ bzero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); - DBUF_STATE_CHANGE(db, =, DB_CACHED, + DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, "dbuf has been freed"); } @@ -1897,7 +1912,8 @@ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_state & (DB_UNCACHED|DB_NOFILL|DB_CACHED)); dbuf_set_data(db, NULL); - DBUF_STATE_CHANGE(db, =, DB_NOFILL, "allocating NOFILL buffer"); + DBUF_STATE_CHANGE(db, =, DB_NOFILL, + B_FALSE, "allocating NOFILL buffer"); } else if (how == DB_FILL) { if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); @@ -1924,15 +1940,17 @@ dbuf_set_data(db, fill_buf); if (size != db->db.db_size) DBUF_STATE_CHANGE(db, =, - (DB_PARTIAL|DB_FILL), "notifying " - "of an initial partial fill"); + (DB_PARTIAL|DB_FILL), B_FALSE, + "notifying of an initial " + "partial fill"); else DBUF_STATE_CHANGE(db, =, DB_FILL, + B_FALSE, "notifying of a complete fill"); } else atomic_add_64(&dirty_buffers_already_cached, 1); } else if (db->db_state & (DB_READ|DB_PARTIAL)) { - DBUF_STATE_CHANGE(db, |=, DB_FILL, + DBUF_STATE_CHANGE(db, |=, DB_FILL, B_FALSE, "notifying of a followup partial fill"); } else { /* No wait on FILL is done for indirect blocks. */ @@ -2066,7 +2084,8 @@ dbuf_dirty_record_add_range(dr, offset, size); if ((db->db_state & DB_FILL) && list_is_empty(&dr->dt.dl.write_ranges)) - DBUF_STATE_CHANGE(db, =, DB_FILL, "writer fully filled"); + DBUF_STATE_CHANGE(db, =, DB_FILL, B_FALSE, + "writer fully filled"); } mutex_exit(&db->db_mtx); @@ -2403,7 +2422,7 @@ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; dbuf_dirty_record_cleanup_ranges(dr); - DBUF_STATE_CHANGE(db, =, DB_CACHED, + DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, "fill done handling freed in flight"); } else { /* @@ -2413,10 +2432,10 @@ * FILL bit, so it goes back to the steady state. 
*/ if (db->db_state == DB_FILL) - DBUF_STATE_CHANGE(db, =, DB_CACHED, + DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, "filler finished, complete buffer"); else { - DBUF_STATE_CHANGE(db, &=, ~DB_FILL, + DBUF_STATE_CHANGE(db, &=, ~DB_FILL, B_FALSE, "filler finished, incomplete buffer"); ASSERT(db->db_state & (DB_PARTIAL|DB_READ)); } @@ -2508,14 +2527,16 @@ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "buffer cleared"); + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, B_FALSE, + "buffer cleared"); } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); ASSERT(list_is_empty(&db->db_dirty_records)); - DBUF_STATE_CHANGE(db, =, DB_EVICTING, "buffer eviction started"); + DBUF_STATE_CHANGE(db, =, DB_EVICTING, B_FALSE, + "buffer eviction started"); db->db_blkptr = NULL; DB_DNODE_ENTER(db); @@ -2636,8 +2657,8 @@ list_create(&db->db_dirty_records, sizeof(dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, db_dirty_record_link)); - list_create(&db->db_callbacks, sizeof(dbuf_callback_node_t), - offsetof(dbuf_callback_node_t, dbc_link)); + list_create(&db->db_callbacks, sizeof(dmu_context_node_t), + offsetof(dmu_context_node_t, dcn_link)); db->db_objset = os; db->db.db_object = dn->dn_object; @@ -2660,7 +2681,8 @@ (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "bonus buffer created"); + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, B_FALSE, + "bonus buffer created"); /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); @@ -2691,7 +2713,8 @@ return (odb); } list_insert_head(&dn->dn_dbufs, db); - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "regular buffer created"); + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, B_FALSE, + "regular buffer created"); mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); @@ -2708,19 +2731,6 @@ return (db); } -static void -dbuf_run_callbacks(dmu_buf_impl_t *db) -{ - dbuf_callback_node_t *dbc; - - for (dbc = list_head(&db->db_callbacks); db != NULL; - dbc = list_next(&db->db_callbacks, dbc)) { - if (dbc->dmu_ctx->db_states & db->db_state) - dbc->dmu_ctx->dmu_cb(dbc->dmu_ctx, db); - /* XXX what else should we do? */ - } -} - static int dbuf_do_evict(void *private) { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#23 (text) ==== @@ -101,6 +101,15 @@ { byteswap_uint64_array, TRUE, "bpobj subobj" }, }; +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dmu, CTLFLAG_RW, 0, "ZFS DMU"); +#define SYSCTL_COUNTER_U(name, desc) \ + uint64_t name; \ + SYSCTL_QUAD(_vfs_zfs_dmu, OID_AUTO, name, CTLFLAG_RD, \ + &name, 0, desc) + +SYSCTL_COUNTER_U(dmu_ctx_in_flight, "number of contexts in flight"); + /** * \brief Obtain the DMU buffer from the specified object which contains the * specified offset. @@ -753,29 +762,147 @@ * DMU Context based functions. */ +/* Used for TSD for processing completed asynchronous I/Os. */ +uint_t zfs_async_io_key; + +int +dmu_create_thread_context(void) +{ + dmu_cb_state_t *dcs; + + /* This function should never be called more than once in a thread. 
*/ +#ifdef ZFS_DEBUG + dcs = tsd_get(zfs_async_io_key); + ASSERT(dcs == NULL); +#endif + + dcs = kmem_zalloc(sizeof(dmu_cb_state_t), KM_SLEEP); + list_create(&dcs->io_list, sizeof(dmu_context_node_t), + offsetof(dmu_context_node_t, dcn_link)); + return tsd_set(zfs_async_io_key, dcs); +} + void +dmu_destroy_thread_context(void *context __unused) +{ + dmu_cb_state_t *dcs; + + dcs = tsd_get(zfs_async_io_key); + /* This function may be called on a thread that didn't call create. */ + if (dcs == NULL) + return; + + /* + * This function should only get called after a thread has finished + * processing its queue. + */ + ASSERT(list_is_empty(&dcs->io_list)); + + kmem_free(dcs, sizeof(dmu_cb_state_t)); + VERIFY(tsd_set(zfs_async_io_key, NULL) == 0); +} + +void dmu_buf_rele_array_cb(dmu_context_t *dmu_ctx) { int i; - dbuf_callback_node_t *dbc; for (i = 0; i < dmu_ctx->count; i++) { - /* May be called from error case, where ptr may be NULL. */ + /* May be called from error case, where dbp[i] may be NULL. */ if (dmu_ctx->dbp[i]) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dmu_ctx->dbp[i]; dbuf_rele(db, dmu_ctx->tag); - /* Remove us from the dbuf's callback list. */ - if (dmu_ctx->dmu_cb != NULL) { - mutex_enter(&db->db_mtx); - list_remove(&db->db_callbacks, dmu_ctx); - mutex_exit(&db->db_mtx); - } } } + atomic_subtract_64(&dmu_ctx_in_flight, 1); kmem_free(dmu_ctx, sizeof(dmu_context_t) + dmu_ctx->count * sizeof (dmu_buf_t *)); } +static void +dmu_ctx_read_buf(dmu_context_t *dmu_ctx) +{ + char *buf = dmu_ctx->data_buf; + uint64_t size = dmu_ctx->size; + uint64_t offset = dmu_ctx->offset; + int i; + + for (i = 0; i < dmu_ctx->count; i++) { + int bufoff; + int tocpy; + dmu_buf_t *db = dmu_ctx->dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + bcopy((char *)db->db_data + bufoff, buf, tocpy); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array_cb(dmu_ctx); +} + +void +dmu_buf_ctx_dispatch(dmu_context_t *dmu_ctx) +{ + /* XXX only supports reads for now */ + ASSERT(dmu_ctx->flags & DMU_CTX_FLAG_READ); + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) { + dmu_ctx_read_buf(dmu_ctx); + /* XXX implement uio version? */ + } + /* Inform the initiator that their I/O is finished. */ + dmu_ctx->dmu_cb(dmu_ctx); + dmu_buf_rele_array_cb(dmu_ctx); /* frees dmu_ctx */ +} + +/** + * \brief Release a DMU context for a given dbuf. + * + * \param dmu_ctx DMU context to release. + * \param db Dbuf to release for, if not NULL. + * \param err Whether an error occurred. + * + * \note If db is NULL, this is being called by the I/O initiator to + * release its own reference. This is done to allow immediate + * dispatching in case every dbuf entered the desired state + * before the initiator finished. This would most likely occur + * in the case of cache hits on the entire dataset. + */ +void +dmu_buf_ctx_rele(dmu_context_t *dmu_ctx, dmu_buf_t *vdb, boolean_t err) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)vdb; + + /* The I/O initiator passes a NULL db to see if everyone called in. */ + if (db != NULL) { + ASSERT(MUTEX_HELD(&db->db_mtx)); + + /* Report an error, if any. */ + if (err) + atomic_add_int(&dmu_ctx->err, 1); + } + + /* If we are finished, schedule this DMU context for delivery. 
*/ + if (refcount_release(&dmu_ctx->holds)) { + dmu_cb_state_t *dcs = tsd_get(zfs_async_io_key); + if (dcs != NULL) { + list_insert_tail(&dcs->io_list, dmu_ctx); + } else { + /* + * The current thread doesn't have anything + * registered for this TSD, so it must not handle + * queued delivery. Dispatch this context now. + */ + dmu_buf_ctx_dispatch(dmu_ctx); + } + } +} + int dmu_buf_hold_array_by_dnode_cb(dnode_t *dn, uint64_t offset, uint64_t length, int read, void *tag, dmu_callback_t dmu_cb, void *priv, @@ -783,7 +910,7 @@ { dsl_pool_t *dp = NULL; dmu_context_t *dmu_ctx; - uint64_t blkid, nblks, i; + uint64_t blkid, nblks, i, avail_size; uint32_t dbuf_flags; int err; zio_t *zio; @@ -813,14 +940,20 @@ } nblks = 1; } + atomic_add_64(&dmu_ctx_in_flight, 1); dmu_ctx = kmem_zalloc( sizeof(dmu_context_t) + nblks * sizeof(dmu_buf_t *), KM_SLEEP); dmu_ctx->dmu_cb = dmu_cb; dmu_ctx->dmu_cb_private = priv; dmu_ctx->tag = tag; dmu_ctx->count = nblks; - dmu_ctx->db_states = DB_CACHED; /* XXX do something more intelligent */ + /* Include a refcount for the initiator. */ + refcount_init(&dmu_ctx->holds, nblks + 1); + /* XXX do something more intelligent about state matching? */ + dmu_ctx->db_states = (DB_UNCACHED|DB_CACHED); dmu_ctx->offset = offset; + if (read) + dmu_ctx->flags |= DMU_CTX_FLAG_READ; if (dn->dn_objset->os_dsl_dataset) dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; @@ -828,6 +961,7 @@ start = gethrtime(); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, offset); + avail_size = length; for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); int bufoff, bufsiz; @@ -839,15 +973,28 @@ return (EIO); } /* Calculate the amount of data this buffer contributes. */ - bufoff = dmu_ctx->offset - db->db.db_offset; - bufsiz = (int)MIN(db->db.db_size - bufoff, length); + ASSERT(offset >= db->db.db_offset); + bufoff = offset - db->db.db_offset; + bufsiz = (int)MIN(db->db.db_size - bufoff, avail_size); dmu_ctx->size += bufsiz; ASSERT(dmu_ctx->size <= length); + offset += bufsiz; + avail_size -= bufsiz; /* Associate the dbuf with this callback if specified. */ if (dmu_cb != NULL) { mutex_enter(&db->db_mtx); - /* FIFO behavior likely: insert at the end. */ - list_insert_tail(&db->db_callbacks, dmu_ctx); + if (db->db_state == DB_CACHED) { + /* + * This buffer's already done. Don't check + * for DB_UNCACHED here because that only + * indicates an initialized buffer. + */ + refcount_release(&dmu_ctx->holds); + } else { + /* FIFO behavior likely: insert at the end. */ + list_insert_tail(&db->db_callbacks, dmu_ctx); + } + /* NB: all dbufs may have completed at this point! */ mutex_exit(&db->db_mtx); } /* initiate async i/o */ @@ -900,31 +1047,25 @@ *dmu_ctx_p = dmu_ctx; return (0); } + void -dmu_ctx_read_buf(dmu_context_t *dmu_ctx, void *priv) +dmu_buf_ctx_process(void) { - char *buf = priv; - uint64_t size = dmu_ctx->size; - uint64_t offset = dmu_ctx->offset; - int i; + dmu_cb_state_t *dcs = tsd_get(zfs_async_io_key); + dmu_context_t *dctx, *next; - for (i = 0; i < dmu_ctx->count; i++) { - int bufoff; - int tocpy; - dmu_buf_t *db = dmu_ctx->dbp[i]; + /* + * If the current thread didn't register, it doesn't handle queued + * async I/O's. It is probably not a zio thread. This is needed + * because zio_execute() can be called from non-zio threads. 
+ */ + if (dcs == NULL) + return; - ASSERT(size > 0); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - bcopy((char *)db->db_data + bufoff, buf, tocpy); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; + for (dctx = list_head(&dcs->io_list); dctx != NULL; dctx = next) { + next = list_next(&dcs->io_list, dctx); + dmu_buf_ctx_dispatch(dctx); } - dmu_buf_rele_array_cb(dmu_ctx); } int @@ -963,11 +1104,17 @@ if (err) break; + /* Tell the handler of the context where to read to. */ + dmu_ctx->data_buf = buf; + if (dmu_cb == NULL) - dmu_ctx_read_buf(dmu_ctx, buf); + dmu_ctx_read_buf(dmu_ctx); + else + dmu_buf_ctx_rele(dmu_ctx, NULL, B_FALSE); offset += dmu_ctx->size; size -= dmu_ctx->size; + buf = (char *)buf + dmu_ctx->size; } dnode_rele(dn, FTAG); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#7 (text) ==== @@ -696,6 +696,13 @@ offsetof(spa_error_entry_t, se_avl)); } +void +spa_zio_thread_init(void *context __unused) +{ + + dmu_create_thread_context(); +} + static taskq_t * spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, uint_t value) @@ -739,7 +746,7 @@ } #endif return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, - spa->spa_proc, flags)); + spa->spa_proc, flags, spa_zio_thread_init, NULL)); } static void ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#25 (text) ==== @@ -322,16 +322,6 @@ struct dbuf_array; struct dmu_buf_impl; -typedef struct dbuf_callback_node { - - /** This object's entry in the list in dmu_buf_impl_t. */ - list_node_t dbc_link; - - /** The DMU context this callback is associated with. */ - dmu_context_t *dmu_ctx; - -} dbuf_callback_node_t; - typedef struct dmu_buf_impl { /* * The following members are immutable, with the exception of @@ -405,7 +395,7 @@ /** List of dirty records for the buffer sorted newest to oldest. */ list_t db_dirty_records; - /** List of callbacks (see dbuf_callback_node_t). */ + /** List of callbacks (see dmu_context_node_t). */ list_t db_callbacks; /** @@ -437,6 +427,25 @@ kmutex_t hash_mutexes[DBUF_MUTEXES]; } dbuf_hash_table_t; +typedef struct dmu_context_node { + + /** This object's entry in the list. */ + list_node_t dcn_link; + + /** The DMU context this callback is associated with. */ + dmu_context_t *dmu_ctx; + +} dmu_context_node_t; + +/** + * \brief Thread-specific DMU callback state for processing async I/O's. + */ +typedef struct dmu_cb_state { + + /** The list of IOs that are ready to be processed. */ + list_t io_list; + +} dmu_cb_state_t; uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#10 (text) ==== @@ -222,12 +222,15 @@ * callbacks. */ struct dmu_context; -typedef void (*dmu_callback_t)(struct dmu_context *, void *); +typedef void (*dmu_callback_t)(struct dmu_context *); typedef struct dmu_context { /** The number of buffers associated with this context. */ - size_t count; + int count; + + /** Number of buffers left to complete. */ + int holds; /** The tag used for this context. */ void *tag; @@ -238,6 +241,16 @@ /** The dbuf states when a callback may be called. */ int db_states; + /** Flags for this block. */ + int flags; +#define DMU_CTX_FLAG_READ (1 << 1) + + /** The number of errors that occurred. */ + int err; + + /** Pointer to the data buffer. */ + void *data_buf; + /** The size and offset of this array of buffers. 
*/ uint64_t offset; uint64_t size; @@ -250,6 +263,12 @@ } dmu_context_t; +void dmu_buf_ctx_rele(dmu_context_t *dmu_ctx, dmu_buf_t *vdb, + boolean_t err); +void dmu_buf_ctx_process(void); +int dmu_create_thread_context(void); +void dmu_destroy_thread_context(void *); + typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); /* @@ -371,6 +390,8 @@ uint64_t size); int dmu_free_object(objset_t *os, uint64_t object); +void dmu_buf_cb_process(void); + /* * Convenience functions. * ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#7 (text) ==== @@ -5305,6 +5305,7 @@ uint_t zfs_fsyncer_key; extern uint_t rrw_tsd_key; +extern uint_t zfs_async_io_key; #ifdef sun int @@ -5325,6 +5326,7 @@ tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, NULL); + tsd_create(&zfs_async_io_key, NULL); error = ldi_ident_from_mod(&modlinkage, &zfs_li); ASSERT(error == 0); @@ -5355,6 +5357,7 @@ (void) ddi_modclose(sharefs_mod); tsd_destroy(&zfs_fsyncer_key); + tsd_destroy(&zfs_async_io_key); ldi_ident_release(zfs_li); zfs_li = NULL; mutex_destroy(&zfs_share_lock); @@ -5386,6 +5389,7 @@ tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, NULL); + tsd_create(&zfs_async_io_key, dmu_destroy_thread_context); printf("ZFS storage pool version " SPA_VERSION_STRING "\n"); root_mount_rel(zfs_root_token); @@ -5406,6 +5410,7 @@ tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); + tsd_destroy(&zfs_async_io_key); mutex_destroy(&zfs_share_lock); break; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c#3 (text) ==== @@ -1266,8 +1266,10 @@ zio->io_stage = stage; rv = zio_pipeline[highbit(stage) - 1](zio); - if (rv == ZIO_PIPELINE_STOP) + if (rv == ZIO_PIPELINE_STOP) { + dmu_buf_ctx_process(); return; + } ASSERT(rv == ZIO_PIPELINE_CONTINUE); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#8 (text) ==== @@ -1198,6 +1198,15 @@ } #endif /* sun */ +static void +zvol_dmu_cb(dmu_context_t *dmu_ctx) +{ + struct bio *bp = (struct bio *)dmu_ctx->dmu_cb_private; + int err = (dmu_ctx->err == 0) ? 0 : EIO; + + g_io_deliver(bp, err); +} + int zvol_strategy(struct bio *bp) { @@ -1247,8 +1256,13 @@ while (resid != 0 && off < volsize) { size_t size = MIN(resid, zvol_maxphys); if (doread) { +#if 0 + error = dmu_read_cb(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH, zvol_dmu_cb, bp); +#else error = dmu_read(os, ZVOL_OBJ, off, size, addr, DMU_READ_PREFETCH); +#endif } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h#2 (text) ==== @@ -68,6 +68,8 @@ #ifdef _KERNEL +typedef void (*taskq_callback_fn)(void *); + extern taskq_t *system_taskq; extern void taskq_init(void); @@ -77,7 +79,7 @@ extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, - struct proc *, uint_t); + struct proc *, uint_t, taskq_callback_fn, taskq_callback_fn); extern taskq_t *taskq_create_sysdc(const char *, int, int, int, struct proc *, uint_t, uint_t); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); Change 522414 by willa@willa_repo on 2012/01/25 12:33:50 Handle ctor/dtor for libzpool taskq's. Affected files ... ... //depot/branches/redline/projects/cow/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h#2 edit ... 
//depot/branches/redline/projects/cow/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c#3 edit Differences ... ==== //depot/branches/redline/projects/cow/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h#2 (text) ==== @@ -362,6 +362,7 @@ typedef struct taskq taskq_t; typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); +typedef void (*taskq_callback_fn)(void *); #define TASKQ_PREPOPULATE 0x0001 #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ @@ -377,8 +378,11 @@ extern taskq_t *system_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); -#define taskq_create_proc(a, b, c, d, e, p, f) \ - (taskq_create(a, b, c, d, e, f)) +extern taskq_t *taskq_create_with_callbacks(const char *, int, pri_t, int, int, + uint_t, taskq_callback_fn, taskq_callback_fn); + +#define taskq_create_proc(a, b, c, d, e, p, f, g, h) \ + (taskq_create_with_callbacks(a, b, c, d, e, f, g, h)) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ (taskq_create(a, b, maxclsyspri, d, e, f)) extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); ==== //depot/branches/redline/projects/cow/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c#3 (text) ==== @@ -53,6 +53,8 @@ int tq_maxalloc_wait; task_t *tq_freelist; task_t tq_task; + taskq_callback_fn tq_ctor; + taskq_callback_fn tq_dtor; }; static task_t * @@ -161,6 +163,9 @@ task_t *t; mutex_enter(&tq->tq_lock); + if (tq->tq_ctor != NULL) + tq->tq_ctor(tq); + while (tq->tq_flags & TASKQ_ACTIVE) { if ((t = tq->tq_task.task_next) == &tq->tq_task) { if (--tq->tq_active == 0) @@ -182,14 +187,16 @@ } tq->tq_nthreads--; cv_broadcast(&tq->tq_wait_cv); + if (tq->tq_dtor != NULL) + tq->tq_dtor(tq); mutex_exit(&tq->tq_lock); return (NULL); } -/*ARGSUSED*/ taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, - int minalloc, int maxalloc, uint_t flags) +taskq_create_with_callbacks(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, taskq_callback_fn ctor, + taskq_callback_fn dtor) { taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); int t; @@ -220,6 +227,8 @@ tq->tq_task.task_next = &tq->tq_task; tq->tq_task.task_prev = &tq->tq_task; tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP); + tq->tq_ctor = ctor; + tq->tq_dtor = dtor; if (flags & TASKQ_PREPOPULATE) { mutex_enter(&tq->tq_lock); @@ -235,6 +244,15 @@ return (tq); } +/*ARGSUSED*/ +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + return (taskq_create_with_callbacks(name, nthreads, pri, minalloc, + maxalloc, flags, NULL, NULL)); +} + void taskq_destroy(taskq_t *tq) { Change 522415 by willa@willa_repo on 2012/01/25 12:57:53 Add stubs for tsd_get() and tsd_set(). We may wish to properly define these for userspace ZFS programs in order to utilize or test queued async I/O's. Affected files ... ... //depot/branches/redline/projects/cow/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h#3 edit Differences ... ==== //depot/branches/redline/projects/cow/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h#3 (text) ==== @@ -213,6 +213,9 @@ */ #define curthread ((void *)(uintptr_t)thr_self()) +#define tsd_get(key) NULL +#define tsd_set(key, value) 0 + typedef struct kthread kthread_t; #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ Change 522485 by willa@willa_repo on 2012/01/25 19:29:56 Another checkpoint for async reads. 
This doesn't quite work right; it looks like, in the sync case, dmu_read_cb() is returning before all I/O has finished. This checkin is targeted at solving the problem of how to chunk reads and still only call dmu_read_cb()'s callback once. The previous implementation created a single object for each hold_array call, but when each object was released, the callback was invoked. So another layer of indirection that tracks these objects is needed, in order to ensure that the caller's callback is only invoked once. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h: - Split dmu_context_t into a second structure, dmu_buf_set_t. These will contain the buffer sets themselves. The initial allocation in a dmu_read_cb() call will malloc both in one shot. - Use the opportunity to refine the interfaces. Now they are: - dmu_buf_set_read(): char * buffer set reader. - dmu_buf_set_rele(): Release a buffer set for a given dbuf, or if none provided, the initiator. - dmu_buf_set_dispatch(): Actually process a completed buffer set. Called by the thread-specific queue processor or directly. - dmu_context_rele(): Release a context for a buffer set or initiator. - dmu_thread_context_{create,process,destroy}(). sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: - Add a new sysctl counter type, SYSCTL_REFCOUNT, which is what it sounds like. Used to track the number of buffer sets and contexts. - Rototill the interfaces accordingly. - dmu_buf_set_rele(): - Release the dbuf or initiator's hold on a buffer set. - Report any error via the associated DMU context. - If we're the last refcount, then: - If the current thread has a TSD, queue ourselves for dispatch, otherwise call dmu_buf_set_dispatch(). - dmu_buf_set_dispatch(): - Call dmu_buf_set_read() to read the buffers into the target buffer. - Release the dbufs from their duty. - Check to see whether this buffer set is the master; if it is not, free the buffer set. - Call dmu_context_rele() to release the buffer set's hold. - dmu_context_rele(): - Release buffer set or initiator hold on the DMU context. If we're not the last holder, return. - Call the initiator's callback, if specified. - Free the DMU context along with its buffer set. - Update dmu_buf_hold_array_by_dnode_cb(): - Take a pointer to the DMU context. If it is NULL, one will be created with an associated buffer set. Otherwise, just a buffer set will be created, with a pointer to the original DMU context. - Initialize both the DMU context and buffer set with an "initiator" hold, which makes sure that releases do not cause these objects to be freed during the process of initiating them. - Initiate the I/O for a given dbuf prior to putting its buffer set on its list. In this way, we check its state as late as possible. - Check a dbuf's state regardless of whether we have a callback, in order to release its hold early if it's already done. This is needed anyway because dbuf_hold() may return a finished dbuf and so would never notify DMU. - Update dmu_read_cb(): - If an error occurs, make sure to release the context before exit. - Since the sync and async cases are handled the same now when it comes to holds/releases, release the initiator hold on the buffer set after we're done using it for accounting purposes. - In the sync case, release and discard the DMU context after every iteration. - In the async case, only release the DMU context's initiator hold after all I/O's have been issued.
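To make the hold/release choreography above concrete, here is a minimal single-threaded sketch of the two-level scheme. All toy_* names are invented stand-ins for dmu_context_t/dmu_buf_set_t; the real code uses the atomic refcount_*() API, copies data in the dispatch step, and may queue through the per-thread TSD list, all of which this toy omits:

	#include <stdio.h>
	#include <stdlib.h>

	typedef struct toy_context {
		int	holds;		/* one per buffer set, plus the initiator */
		void	(*cb)(struct toy_context *);
	} toy_context_t;

	typedef struct toy_buf_set {
		toy_context_t	*ctx;
		int		holds;	/* one per dbuf, plus the initiator */
		int		count;	/* number of dbufs in the set */
	} toy_buf_set_t;

	static void
	toy_context_rele(toy_context_t *ctx)
	{
		if (--ctx->holds > 0)
			return;
		if (ctx->cb != NULL)
			ctx->cb(ctx);		/* caller's callback fires exactly once */
		free(ctx);
	}

	static void
	toy_buf_set_rele(toy_buf_set_t *bs)
	{
		if (--bs->holds > 0)
			return;
		/* "Dispatch": the real code copies the data out here. */
		toy_context_rele(bs->ctx);	/* drop the set's context hold */
		free(bs);
	}

	static void
	toy_done(toy_context_t *ctx)
	{
		(void)ctx;
		printf("caller's callback: all buffer sets finished\n");
	}

	int
	main(void)
	{
		toy_context_t *ctx = calloc(1, sizeof (*ctx));
		toy_buf_set_t *bs = calloc(1, sizeof (*bs));
		int i;

		ctx->cb = toy_done;
		ctx->holds = 2;			/* initiator + one buffer set */
		bs->ctx = ctx;
		bs->count = 3;
		bs->holds = bs->count + 1;	/* three dbufs + initiator */

		for (i = 0; i < bs->count; i++)
			toy_buf_set_rele(bs);	/* dbufs complete one by one */
		toy_buf_set_rele(bs);		/* initiator releases the set */
		toy_context_rele(ctx);		/* initiator releases the context */
		return (0);
	}

The initiator holds are the load-bearing detail: neither object can reach zero while the issuing code is still walking it, so the callback cannot fire early no matter how quickly the dbufs complete.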
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: - Fix usage of the dmu_context_node_t lists. It is necessary to separate the lists from the dmu_buf_set_t proper since this pointer may be placed on multiple lists. - Create two wrappers for adding/removing entries to these lists. These are needed in order to malloc/free the list entries. - dmu_context_node_{add,remove}(). - Change db->db_callbacks to db->db_dmu_contexts to more accurately reflect what the list contains. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h: - Update for the API changes. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#61 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#24 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#8 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#26 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#11 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#8 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c#4 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#61 (text) ==== @@ -92,6 +92,7 @@ SYSCTL_COUNTER_U(dirty_ranges_total, "number of total dirty ranges"); SYSCTL_COUNTER_U(dirty_writes_lost, "dirty writes lost"); SYSCTL_COUNTER_U(dirty_buffers_already_cached, "dirty buffers already cached"); +SYSCTL_COUNTER_U(dmu_notifications, "DMU notifications"); static uint64_t dbuf_hash_count; @@ -119,18 +120,19 @@ { dmu_context_node_t *dcn, *next; - for (dcn = list_head(&db->db_callbacks); dcn != NULL; dcn = next) { - next = list_next(&db->db_callbacks, dcn); - if (dcn->dmu_ctx->db_states & db->db_state) { - list_remove(&db->db_callbacks, dcn); - dmu_buf_ctx_rele(dcn->dmu_ctx, &db->db, err); + for (dcn = list_head(&db->db_dmu_contexts); dcn != NULL; dcn = next) { + next = list_next(&db->db_dmu_contexts, dcn); + if (dcn->buf_set->dmu_ctx->db_states & db->db_state) { + atomic_add_64(&dmu_notifications, 1); + dmu_buf_set_rele(dcn->buf_set, &db->db, err); + dmu_context_node_remove(&db->db_dmu_contexts, dcn); } } } #define DBUF_STATE_CHANGE_COMMON(db, op, state, err, why) \ (db)->db_state op state; \ - if (!list_is_empty(&(db)->db_callbacks)) \ + if (!list_is_empty(&(db)->db_dmu_contexts)) \ dbuf_process_dmu_contexts(db, err) #ifdef ZFS_DEBUG @@ -2657,7 +2659,7 @@ list_create(&db->db_dirty_records, sizeof(dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, db_dirty_record_link)); - list_create(&db->db_callbacks, sizeof(dmu_context_node_t), + list_create(&db->db_dmu_contexts, sizeof(dmu_context_node_t), offsetof(dmu_context_node_t, dcn_link)); db->db_objset = os; @@ -2789,7 +2791,7 @@ db->db_parent = NULL; db->db_buf = NULL; list_destroy(&db->db_dirty_records); - list_destroy(&db->db_callbacks); + list_destroy(&db->db_dmu_contexts); ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#24 (text) ==== @@ -107,8 +107,14 
@@ uint64_t name; \ SYSCTL_QUAD(_vfs_zfs_dmu, OID_AUTO, name, CTLFLAG_RD, \ &name, 0, desc) +#define SYSCTL_REFCOUNT(name, desc) \ + uint_t name; \ + SYSCTL_INT(_vfs_zfs_dmu, OID_AUTO, name, CTLFLAG_RD, \ + &name, 0, desc) -SYSCTL_COUNTER_U(dmu_ctx_in_flight, "number of contexts in flight"); +SYSCTL_COUNTER_U(dbufs_already_cached, "number of dbufs already cached"); +SYSCTL_REFCOUNT(buf_set_in_flight, "number of buffer sets in flight"); +SYSCTL_REFCOUNT(dmu_ctx_in_flight, "number of DMU contexts in flight"); /** * \brief Obtain the DMU buffer from the specified object which contains the @@ -765,8 +771,122 @@ /* Used for TSD for processing completed asynchronous I/Os. */ uint_t zfs_async_io_key; +void +dmu_context_node_add(list_t *list, dmu_buf_set_t *buf_set) +{ + dmu_context_node_t *dcn = kmem_zalloc(sizeof(dmu_context_node_t), + KM_SLEEP); + dcn->buf_set = buf_set; + list_insert_tail(list, dcn); +} + +void +dmu_context_node_remove(list_t *list, dmu_context_node_t *dcn) +{ + list_remove(list, dcn); + kmem_free(dcn, sizeof(dmu_context_node_t)); +} + +/** + * \brief Perform a buffer set read for a char * target buffer. + * + * \param buf_set Buffer set to read. + */ +static void +dmu_buf_set_read(dmu_buf_set_t *buf_set) +{ + char *buf = (char *)buf_set->data_buf; + uint64_t size = buf_set->size; + uint64_t offset = buf_set->offset; + int i; + + for (i = 0; i < buf_set->count; i++) { + int bufoff; + int tocpy; + dmu_buf_t *db = buf_set->dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + bcopy((char *)db->db_data + bufoff, buf, tocpy); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } +} + +/** + * \brief Release a DMU context hold, cleaning up if no holds remain. + * + * \param dmu_ctx DMU context to release. + */ +void +dmu_context_rele(dmu_context_t *dmu_ctx) +{ + dmu_buf_set_t *buf_set; + + if (!refcount_release(&dmu_ctx->holds)) + return; + + ASSERT(dmu_ctx_in_flight > 0); + refcount_release(&dmu_ctx_in_flight); + + /* At this point, there are no buffer sets left. Call back. */ + if (dmu_ctx->dmu_cb != NULL) + dmu_ctx->dmu_cb(dmu_ctx); + + kmem_free(dmu_ctx, sizeof(dmu_context_t) + sizeof(dmu_buf_set_t) + + buf_set->count * sizeof(dmu_buf_t *)); +} + +/** + * \brief Handle a completed buffer set, and its DMU context if necessary. + * + * \param buf_set Buffer set to handle. + */ +void +dmu_buf_set_dispatch(dmu_buf_set_t *buf_set) +{ + int child, i; + + /* XXX only supports reads for now */ + ASSERT(buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ); + if (buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ) { + dmu_buf_set_read(buf_set); + /* XXX implement uio version? */ + } + + for (i = 0; i < buf_set->count; i++) { + /* May be called from error case, where dbp[i] may be NULL. */ + if (buf_set->dbp[i]) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)buf_set->dbp[i]; + dbuf_rele(db, buf_set->dmu_ctx->tag); + } + } + + ASSERT(buf_set_in_flight > 0); + refcount_release(&buf_set_in_flight); + + /* + * Check to see if this buffer set was allocated with the context. + * If it was, it will be freed by dmu_context_rele(). This ensures + * that someone cleans up after the master regardless of whether its + * buffer set is the last to finish. 
+ */ + child = ((char *)buf_set != + ((char *)buf_set->dmu_ctx + sizeof(dmu_context_t))); + dmu_context_rele(buf_set->dmu_ctx); + if (child) { + kmem_free(buf_set, sizeof(dmu_buf_set_t) + + buf_set->count * sizeof(dmu_buf_t *)); + } +} + int -dmu_create_thread_context(void) +dmu_thread_context_create(void) { dmu_cb_state_t *dcs; @@ -783,7 +903,7 @@ } void -dmu_destroy_thread_context(void *context __unused) +dmu_thread_context_destroy(void *context __unused) { dmu_cb_state_t *dcs; @@ -803,115 +923,80 @@ } void -dmu_buf_rele_array_cb(dmu_context_t *dmu_ctx) +dmu_thread_context_process(void) { - int i; + dmu_cb_state_t *dcs = tsd_get(zfs_async_io_key); + dmu_context_node_t *dcn, *next; - for (i = 0; i < dmu_ctx->count; i++) { - /* May be called from error case, where dbp[i] may be NULL. */ - if (dmu_ctx->dbp[i]) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dmu_ctx->dbp[i]; - dbuf_rele(db, dmu_ctx->tag); - } - } + /* + * If the current thread didn't register, it doesn't handle queued + * async I/O's. It is probably not a zio thread. This is needed + * because zio_execute() can be called from non-zio threads. + */ + if (dcs == NULL) + return; - atomic_subtract_64(&dmu_ctx_in_flight, 1); - kmem_free(dmu_ctx, - sizeof(dmu_context_t) + dmu_ctx->count * sizeof (dmu_buf_t *)); -} -static void -dmu_ctx_read_buf(dmu_context_t *dmu_ctx) -{ - char *buf = dmu_ctx->data_buf; - uint64_t size = dmu_ctx->size; - uint64_t offset = dmu_ctx->offset; - int i; - - for (i = 0; i < dmu_ctx->count; i++) { - int bufoff; - int tocpy; - dmu_buf_t *db = dmu_ctx->dbp[i]; - - ASSERT(size > 0); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - bcopy((char *)db->db_data + bufoff, buf, tocpy); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } - dmu_buf_rele_array_cb(dmu_ctx); -} - -void -dmu_buf_ctx_dispatch(dmu_context_t *dmu_ctx) -{ - /* XXX only supports reads for now */ - ASSERT(dmu_ctx->flags & DMU_CTX_FLAG_READ); - if (dmu_ctx->flags & DMU_CTX_FLAG_READ) { - dmu_ctx_read_buf(dmu_ctx); - /* XXX implement uio version? */ + for (dcn = list_head(&dcs->io_list); dcn != NULL; dcn = next) { + next = list_next(&dcs->io_list, dcn); + dmu_buf_set_dispatch(dcn->buf_set); + dmu_context_node_remove(&dcs->io_list, dcn); } - /* Inform the initiator that their I/O is finished. */ - dmu_ctx->dmu_cb(dmu_ctx); - dmu_buf_rele_array_cb(dmu_ctx); /* frees dmu_ctx */ } /** - * \brief Release a DMU context for a given dbuf. + * \brief Release a buffer set for a given dbuf. * - * \param dmu_ctx DMU context to release. - * \param db Dbuf to release for, if not NULL. + * \param buf_set Buffer set to release. + * \param vdb DMU buffer to release for. * \param err Whether an error occurred. * - * \note If db is NULL, this is being called by the I/O initiator to - * release its own reference. This is done to allow immediate - * dispatching in case every dbuf entered the desired state - * before the initiator finished. This would most likely occur - * in the case of cache hits on the entire dataset. + * \invariant If specified, the dbuf's mutex must be held. */ void -dmu_buf_ctx_rele(dmu_context_t *dmu_ctx, dmu_buf_t *vdb, boolean_t err) +dmu_buf_set_rele(dmu_buf_set_t *buf_set, dmu_buf_t *vdb, boolean_t err) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)vdb; - /* The I/O initiator passes a NULL db to see if everyone called in. */ - if (db != NULL) { + /* XXX seems like there should be more done with the dbuf here. */ + if (db != NULL) ASSERT(MUTEX_HELD(&db->db_mtx)); - /* Report an error, if any. 
*/ - if (err) - atomic_add_int(&dmu_ctx->err, 1); - } + /* Report an error, if any. */ + if (err) + atomic_add_int(&buf_set->dmu_ctx->err, 1); - /* If we are finished, schedule this DMU context for delivery. */ - if (refcount_release(&dmu_ctx->holds)) { + /* If we are finished, schedule this buffer set for delivery. */ + ASSERT(buf_set->holds > 0); + if (refcount_release(&buf_set->holds)) { dmu_cb_state_t *dcs = tsd_get(zfs_async_io_key); +#if 0 + /* Initiator thread must not have a TSD. */ + if (buf_set->dmu_ctx->dmu_cb == NULL) + ASSERT(dcs == NULL); +#endif if (dcs != NULL) { - list_insert_tail(&dcs->io_list, dmu_ctx); + dmu_context_node_add(&dcs->io_list, buf_set); } else { /* * The current thread doesn't have anything * registered for this TSD, so it must not handle - * queued delivery. Dispatch this context now. + * queued delivery. Dispatch this set now. */ - dmu_buf_ctx_dispatch(dmu_ctx); + dmu_buf_set_dispatch(buf_set); } } } int dmu_buf_hold_array_by_dnode_cb(dnode_t *dn, uint64_t offset, uint64_t length, - int read, void *tag, dmu_callback_t dmu_cb, void *priv, - dmu_context_t **dmu_ctx_p, uint32_t flags) + int read, void *tag, dmu_callback_t dmu_cb, void *priv, void *buf, + dmu_context_t **dmu_ctx_p, dmu_buf_set_t **buf_set_p, uint32_t flags) { dsl_pool_t *dp = NULL; - dmu_context_t *dmu_ctx; uint64_t blkid, nblks, i, avail_size; uint32_t dbuf_flags; + dmu_context_t *dmu_ctx; + dmu_buf_set_t *buf_set; int err; zio_t *zio; hrtime_t start; @@ -940,20 +1025,44 @@ } nblks = 1; } - atomic_add_64(&dmu_ctx_in_flight, 1); - dmu_ctx = kmem_zalloc( - sizeof(dmu_context_t) + nblks * sizeof(dmu_buf_t *), KM_SLEEP); - dmu_ctx->dmu_cb = dmu_cb; - dmu_ctx->dmu_cb_private = priv; - dmu_ctx->tag = tag; - dmu_ctx->count = nblks; + if (*dmu_ctx_p == NULL) { + /* Create the DMU context AND the buffer set. */ + *dmu_ctx_p = kmem_zalloc( + sizeof(dmu_context_t) + sizeof(dmu_buf_set_t) + + nblks * sizeof(dmu_buf_t *), KM_SLEEP); + refcount_acquire(&dmu_ctx_in_flight); + dmu_ctx = *dmu_ctx_p; + *buf_set_p = (dmu_buf_set_t *) + ((char *)*dmu_ctx_p + sizeof(dmu_context_t)); + + /* Initialize a new DMU context. */ + /* XXX do something more intelligent about state matching? */ + dmu_ctx->db_states = (DB_UNCACHED|DB_CACHED); + dmu_ctx->dmu_cb = dmu_cb; + dmu_ctx->dmu_cb_private = priv; + dmu_ctx->tag = tag; + /* Include a refcount for the initiator & own buf_set. */ + refcount_init(&dmu_ctx->holds, 2); + if (read) + dmu_ctx->flags |= DMU_CTX_FLAG_READ; + } else { + /* Create only the new buffer set. */ + dmu_ctx = *dmu_ctx_p; + *buf_set_p = kmem_zalloc( + sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *), + KM_SLEEP); + refcount_acquire(&dmu_ctx->holds); + } + + /* Initialize a new buffer set. */ + refcount_acquire(&buf_set_in_flight); + buf_set = *buf_set_p; + buf_set->count = nblks; /* Include a refcount for the initiator. */ - refcount_init(&dmu_ctx->holds, nblks + 1); - /* XXX do something more intelligent about state matching? 
*/ - dmu_ctx->db_states = (DB_UNCACHED|DB_CACHED); - dmu_ctx->offset = offset; - if (read) - dmu_ctx->flags |= DMU_CTX_FLAG_READ; + refcount_init(&buf_set->holds, nblks + 1); + buf_set->dmu_ctx = dmu_ctx; + buf_set->offset = offset; + buf_set->data_buf = buf; if (dn->dn_objset->os_dsl_dataset) dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; @@ -968,7 +1077,7 @@ if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); - dmu_buf_rele_array_cb(dmu_ctx); + dmu_buf_set_rele(buf_set, NULL, B_TRUE); zio_nowait(zio); return (EIO); } @@ -976,27 +1085,11 @@ ASSERT(offset >= db->db.db_offset); bufoff = offset - db->db.db_offset; bufsiz = (int)MIN(db->db.db_size - bufoff, avail_size); - dmu_ctx->size += bufsiz; - ASSERT(dmu_ctx->size <= length); + buf_set->size += bufsiz; + ASSERT(buf_set->size <= length); offset += bufsiz; avail_size -= bufsiz; - /* Associate the dbuf with this callback if specified. */ - if (dmu_cb != NULL) { - mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { - /* - * This buffer's already done. Don't check - * for DB_UNCACHED here because that only - * indicates an initialized buffer. - */ - refcount_release(&dmu_ctx->holds); - } else { - /* FIFO behavior likely: insert at the end. */ - list_insert_tail(&db->db_callbacks, dmu_ctx); - } - /* NB: all dbufs may have completed at this point! */ - mutex_exit(&db->db_mtx); - } + /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); @@ -1004,13 +1097,31 @@ else curthread->td_ru.ru_oublock++; #endif - dmu_ctx->dbp[i] = &db->db; + + /* Make sure dbufs that don't notify DMU are caught here. */ + mutex_enter(&db->db_mtx); + if (db->db_state == DB_CACHED) { + /* + * This buffer's already done. Don't check + * for DB_UNCACHED here because that only + * indicates an initialized buffer. + */ + atomic_add_64(&dbufs_already_cached, 1); + ASSERT(buf_set->holds > 0); + refcount_release(&buf_set->holds); + } else { + /* Let the dbuf know this DMU context needs it. */ + dmu_context_node_add(&db->db_dmu_contexts, buf_set); + } + /* NB: all dbufs may have completed at this point! */ + mutex_exit(&db->db_mtx); + buf_set->dbp[i] = &db->db; } rw_exit(&dn->dn_struct_rwlock); /* * If a callback is specified, issue the I/O's without waiting. - * The callback will be responsible for cleaning up. + * The dbufs will be responsible for cleaning up. */ if (dmu_cb != NULL) zio_nowait(zio); @@ -1021,7 +1132,7 @@ if (dp && dsl_pool_sync_context(dp)) dp->dp_read_overhead += gethrtime() - start; if (err) { - dmu_buf_rele_array_cb(dmu_ctx); + dmu_buf_set_rele(buf_set, NULL, B_TRUE); return (err); } @@ -1029,7 +1140,7 @@ if (read) { for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = - (dmu_buf_impl_t *)dmu_ctx->dbp[i]; + (dmu_buf_impl_t *)buf_set->dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state & (DB_READ|DB_FILL)) cv_wait(&db->db_changed, &db->db_mtx); @@ -1037,43 +1148,23 @@ err = EIO; mutex_exit(&db->db_mtx); if (err) { - dmu_buf_rele_array_cb(dmu_ctx); + dmu_buf_set_rele(buf_set, NULL, B_TRUE); return (err); } } } } - *dmu_ctx_p = dmu_ctx; return (0); } -void -dmu_buf_ctx_process(void) -{ - dmu_cb_state_t *dcs = tsd_get(zfs_async_io_key); - dmu_context_t *dctx, *next; - - /* - * If the current thread didn't register, it doesn't handle queued - * async I/O's. It is probably not a zio thread. This is needed - * because zio_execute() can be called from non-zio threads. 
- */ - if (dcs == NULL) - return; - - for (dctx = list_head(&dcs->io_list); dctx != NULL; dctx = next) { - next = list_next(&dcs->io_list, dctx); - dmu_buf_ctx_dispatch(dctx); - } -} - int dmu_read_cb(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags, dmu_callback_t dmu_cb, void *priv) { dnode_t *dn; - dmu_context_t *dmu_ctx; + dmu_context_t *dmu_ctx = NULL; + dmu_buf_set_t *buf_set; int numbufs, err; err = dnode_hold(os, object, FTAG, &dn); @@ -1100,22 +1191,29 @@ * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode_cb(dn, offset, mylen, - TRUE, FTAG, dmu_cb, priv, &dmu_ctx, flags); - if (err) + TRUE, FTAG, dmu_cb, priv, buf, &dmu_ctx, &buf_set, flags); + + if (err) { + if (dmu_cb == NULL) + dmu_context_rele(dmu_ctx); break; + } - /* Tell the handler of the context where to read to. */ - dmu_ctx->data_buf = buf; + offset += buf_set->size; + size -= buf_set->size; + buf = (char *)buf + buf_set->size; - if (dmu_cb == NULL) - dmu_ctx_read_buf(dmu_ctx); - else - dmu_buf_ctx_rele(dmu_ctx, NULL, B_FALSE); + /* Release initiator hold. */ + dmu_buf_set_rele(buf_set, NULL, B_FALSE); - offset += dmu_ctx->size; - size -= dmu_ctx->size; - buf = (char *)buf + dmu_ctx->size; + if (dmu_cb == NULL) { + /* Make sure the DMU context is not reused. */ + dmu_context_rele(dmu_ctx); + dmu_ctx = NULL; + } } + if (dmu_cb != NULL) + dmu_context_rele(dmu_ctx); dnode_rele(dn, FTAG); return (err); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#8 (text) ==== @@ -700,7 +700,7 @@ spa_zio_thread_init(void *context __unused) { - dmu_create_thread_context(); + dmu_thread_context_create(); } static taskq_t * ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#26 (text) ==== @@ -395,8 +395,8 @@ /** List of dirty records for the buffer sorted newest to oldest. */ list_t db_dirty_records; - /** List of callbacks (see dmu_context_node_t). */ - list_t db_callbacks; + /** List of DMU contexts (see dmu_context_node_t). */ + list_t db_dmu_contexts; /** * Our link on the owner dnodes's dn_dbufs list. @@ -429,14 +429,17 @@ typedef struct dmu_context_node { - /** This object's entry in the list. */ + /** This entry's link in the list. */ list_node_t dcn_link; - /** The DMU context this callback is associated with. */ - dmu_context_t *dmu_ctx; + /** This entry's buffer set pointer. */ + dmu_buf_set_t *buf_set; } dmu_context_node_t; +void dmu_context_node_add(list_t *list, dmu_buf_set_t *buf_set); +void dmu_context_node_remove(list_t *list, dmu_context_node_t *dcn); + /** * \brief Thread-specific DMU callback state for processing async I/O's. */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#11 (text) ==== @@ -226,10 +226,7 @@ typedef struct dmu_context { - /** The number of buffers associated with this context. */ - int count; - - /** Number of buffers left to complete. */ + /** Number of buffer sets left to complete. */ int holds; /** The tag used for this context. */ @@ -248,6 +245,22 @@ /** The number of errors that occurred. */ int err; + /** Private data for the callback. */ + void *dmu_cb_private; + +} dmu_context_t; + +typedef struct dmu_buf_set { + + /** The DMU context that this buffer set is associated with. */ + dmu_context_t *dmu_ctx; + + /** The number of buffers associated with this context. */ + int count; + + /** Number of buffers left to complete. */ + int holds; + /** Pointer to the data buffer. 
*/ void *data_buf; @@ -255,19 +268,17 @@ uint64_t offset; uint64_t size; - /** Private data for the callback. */ - void *dmu_cb_private; + /** The set of buffers themselves. */ + struct dmu_buf *dbp[0]; - /** The set of buffers associated with this context. */ - struct dmu_buf *dbp[0]; +} dmu_buf_set_t; -} dmu_context_t; +void dmu_buf_set_rele(dmu_buf_set_t *buf_set, dmu_buf_t *vdb, boolean_t err); -void dmu_buf_ctx_rele(dmu_context_t *dmu_ctx, dmu_buf_t *vdb, - boolean_t err); -void dmu_buf_ctx_process(void); -int dmu_create_thread_context(void); -void dmu_destroy_thread_context(void *); +/* DMU thread context handlers. */ +int dmu_thread_context_create(void); +void dmu_thread_context_process(void); +void dmu_thread_context_destroy(void *); typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#8 (text) ==== @@ -5326,7 +5326,7 @@ tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, NULL); - tsd_create(&zfs_async_io_key, NULL); + tsd_create(&zfs_async_io_key, dmu_thread_context_destroy); error = ldi_ident_from_mod(&modlinkage, &zfs_li); ASSERT(error == 0); @@ -5389,7 +5389,7 @@ tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, NULL); - tsd_create(&zfs_async_io_key, dmu_destroy_thread_context); + tsd_create(&zfs_async_io_key, dmu_thread_context_destroy); printf("ZFS storage pool version " SPA_VERSION_STRING "\n"); root_mount_rel(zfs_root_token); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c#4 (text) ==== @@ -1267,7 +1267,7 @@ rv = zio_pipeline[highbit(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) { - dmu_buf_ctx_process(); + dmu_thread_context_process(); return; } Change 522488 by willa@willa_repo on 2012/01/25 21:21:16 "Fix" the sync reads. The fix means that DMU contexts without a callback can't queue any buffer set for completion later, even if the current thread has a queue. The scenario looks like this: - dmu_buf_hold_array_by_dnode_cb() issues reads async, then waits for their ZIOs, then waits on all its dbufs to reach the CACHED state. - One or more buffers change to CACHED, and in so doing call dmu_buf_set_rele() on their buffer set, inside a ZIO thread. The actual read is deferred until zio_execute() finishes. - Meanwhile, hold_array exits, satisfied that the dbufs are finished; dmu_read_cb() then releases its context, then exits without having completed the I/O. - zio_execute() processes the queued I/Os, touching buffers that have likely been released by dmu_read_cb()'s caller. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#25 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#25 (text) ==== @@ -969,12 +969,9 @@ ASSERT(buf_set->holds > 0); if (refcount_release(&buf_set->holds)) { dmu_cb_state_t *dcs = tsd_get(zfs_async_io_key); -#if 0 - /* Initiator thread must not have a TSD. */ - if (buf_set->dmu_ctx->dmu_cb == NULL) - ASSERT(dcs == NULL); -#endif - if (dcs != NULL) { + + /* XXX Without a callback, the buffer must be finished now. */ + if (dcs != NULL && buf_set->dmu_ctx->dmu_cb != NULL) { dmu_context_node_add(&dcs->io_list, buf_set); } else { /* @@ -1208,6 +1205,7 @@ if (dmu_cb == NULL) { /* Make sure the DMU context is not reused.
*/ + ASSERT(dmu_ctx->holds == 1); dmu_context_rele(dmu_ctx); dmu_ctx = NULL; } Change 522497 by willa@willa_repo on 2012/01/25 23:39:17 Simplify dmu_read_cb() a bit. There's no particular reason to NULL out dmu_ctx on each iteration when no callback is used. Regardless of whether the call is synchronous, simply call dmu_context_rele() after the loop exits, unless it is NULL. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#26 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#26 (text) ==== @@ -1190,11 +1190,8 @@ err = dmu_buf_hold_array_by_dnode_cb(dn, offset, mylen, TRUE, FTAG, dmu_cb, priv, buf, &dmu_ctx, &buf_set, flags); - if (err) { - if (dmu_cb == NULL) - dmu_context_rele(dmu_ctx); + if (err) break; - } offset += buf_set->size; size -= buf_set->size; @@ -1202,15 +1199,9 @@ /* Release initiator hold. */ dmu_buf_set_rele(buf_set, NULL, B_FALSE); - - if (dmu_cb == NULL) { - /* Make sure the DMU context is not reused. */ - ASSERT(dmu_ctx->holds == 1); - dmu_context_rele(dmu_ctx); - dmu_ctx = NULL; - } } - if (dmu_cb != NULL) + /* DMU context can be NULL if the first hold returned an error. */ + if (dmu_ctx != NULL) dmu_context_rele(dmu_ctx); dnode_rele(dn, FTAG); Change 522532 by willa@willa_repo on 2012/01/26 10:38:13 Get async reads working for zvols. Most of the remaining work was in the zvol layer. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: - Fix an error handling scenario. If an error occurs and we have a DMU context, report it that way. - Add a few counters to check for error cases & activity. Some of these will be yanked later. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c: - Create a new zvol_read_context_t structure which contains the elements needed to successfully process a zvol read asynchronously. - Change zvol_strategy(): - Malloc this structure prior to the DMU reads and initialize it with the range lock, the bio itself, its hold count, etc. - For each dmu_read_cb() call issued, place an additional hold on the read context. - At the end of zvol_strategy(), if no read context was created, do the processing the old way. Otherwise, release the initiator hold on the read context. - Share code between zvol_strategy() and zvol_dmu_cb() to handle what happens when a read context has been completed, since the last hold may be from either case. Create zvol_dmu_read_release() for this. - To report to the bio how much data was completed, it was necessary to have the DMU context track its completed size; the callback then updates the bio_completed figure using this amount. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#27 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#12 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#9 edit Differences ...
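Before the diffs, the completed-size bookkeeping described above reduces to a small pattern: each buffer set either adds its byte count to an atomic tally or bumps an atomic error count, and the final callback folds both into the bio's completion fields. A sketch with invented toy_* names, using C11 atomics in place of the kernel's atomic_add_*() calls:

	#include <errno.h>
	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	typedef struct toy_ctx {
		atomic_int		err;		/* number of failed buffer sets */
		atomic_uint_fast64_t	completed_size;	/* bytes finished successfully */
	} toy_ctx_t;

	static void
	toy_buf_set_done(toy_ctx_t *ctx, uint64_t size, int failed)
	{
		if (failed)
			atomic_fetch_add(&ctx->err, 1);
		else
			atomic_fetch_add(&ctx->completed_size, size);
	}

	static void
	toy_deliver(toy_ctx_t *ctx, uint64_t *bio_completed, int *bio_error)
	{
		/* Collapse any number of failures into a single EIO. */
		*bio_completed = atomic_load(&ctx->completed_size);
		*bio_error = (atomic_load(&ctx->err) == 0) ? 0 : EIO;
	}

	int
	main(void)
	{
		toy_ctx_t ctx = { 0, 0 };
		uint64_t done;
		int error;

		toy_buf_set_done(&ctx, 65536, 0);	/* one set succeeds */
		toy_buf_set_done(&ctx, 65536, 1);	/* one set fails */
		toy_deliver(&ctx, &done, &error);
		printf("completed=%ju error=%d\n", (uintmax_t)done, error);
		return (0);
	}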
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#27 (text) ==== @@ -113,8 +113,12 @@ &name, 0, desc) SYSCTL_COUNTER_U(dbufs_already_cached, "number of dbufs already cached"); +SYSCTL_COUNTER_U(hold_array_early_errors, "hold_array early errors"); +SYSCTL_COUNTER_U(hold_array_dbuf_hold_errors, "hold_array dbuf_hold errors"); +SYSCTL_COUNTER_U(dmu_ctx_total, "total number of DMU contexts"); +SYSCTL_COUNTER_U(buf_set_total, "total number of buffer sets"); +SYSCTL_REFCOUNT(dmu_ctx_in_flight, "number of DMU contexts in flight"); SYSCTL_REFCOUNT(buf_set_in_flight, "number of buffer sets in flight"); -SYSCTL_REFCOUNT(dmu_ctx_in_flight, "number of DMU contexts in flight"); /** * \brief Obtain the DMU buffer from the specified object which contains the @@ -870,6 +874,8 @@ ASSERT(buf_set_in_flight > 0); refcount_release(&buf_set_in_flight); + atomic_add_64(&buf_set->dmu_ctx->completed_size, buf_set->size); + /* * Check to see if this buffer set was allocated with the context. * If it was, it will be freed by dmu_context_rele(). This ensures @@ -1011,6 +1017,7 @@ P2ALIGN(offset, 1ULL<> blkshift; } else { if (offset + length > dn->dn_datablksz) { + atomic_add_64(&hold_array_early_errors, 1); zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> @@ -1028,6 +1035,7 @@ sizeof(dmu_context_t) + sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *), KM_SLEEP); refcount_acquire(&dmu_ctx_in_flight); + atomic_add_64(&dmu_ctx_total, 1); dmu_ctx = *dmu_ctx_p; *buf_set_p = (dmu_buf_set_t *) ((char *)*dmu_ctx_p + sizeof(dmu_context_t)); @@ -1053,6 +1061,7 @@ /* Initialize a new buffer set. */ refcount_acquire(&buf_set_in_flight); + atomic_add_64(&buf_set_total, 1); buf_set = *buf_set_p; buf_set->count = nblks; /* Include a refcount for the initiator. */ @@ -1075,6 +1084,7 @@ if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_set_rele(buf_set, NULL, B_TRUE); + atomic_add_64(&hold_array_dbuf_hold_errors, 1); zio_nowait(zio); return (EIO); } @@ -1201,8 +1211,11 @@ dmu_buf_set_rele(buf_set, NULL, B_FALSE); } /* DMU context can be NULL if the first hold returned an error. */ - if (dmu_ctx != NULL) + if (dmu_ctx != NULL) { + if (err) + atomic_add_int(&dmu_ctx->err, 1); dmu_context_rele(dmu_ctx); + } dnode_rele(dn, FTAG); return (err); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#12 (text) ==== @@ -235,6 +235,9 @@ /** The callback to call if the conditions are met. */ dmu_callback_t dmu_cb; + /** Completed size. */ + uint64_t completed_size; + /** The dbuf states when a callback may be called. */ int db_states; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#9 (text) ==== @@ -90,6 +90,21 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zvol, CTLFLAG_RW, 0, "ZFS ZVOL"); +#define SYSCTL_COUNTER_U(name, desc) \ + uint64_t name; \ + SYSCTL_QUAD(_vfs_zfs_zvol, OID_AUTO, name, CTLFLAG_RD, \ + &name, 0, desc); + +#define SYSCTL_REFCOUNT(name, desc) \ + uint_t name; \ + SYSCTL_INT(_vfs_zfs_zvol, OID_AUTO, name, CTLFLAG_RD, \ + &name, 0, desc); + +SYSCTL_COUNTER_U(read_ctx_total, "total number of read contexts"); +SYSCTL_REFCOUNT(read_ctx_in_flight, "number of read contexts in flight"); + /** * The zfsdev_state structure is protected by spa_namespace_lock from being * modified while it's being used, e.g. 
an open that comes in before a @@ -1198,15 +1213,43 @@ } #endif /* sun */ +typedef struct zvol_read_context { + uint_t holds; + struct bio *bp; + rl_t *rl; + int err; +} zvol_read_context_t; + static void +zvol_dmu_read_release(zvol_read_context_t *read_ctx) +{ + int err = (read_ctx->err == 0) ? 0 : EIO; + + if (refcount_release(&read_ctx->holds)) { + zfs_range_unlock(read_ctx->rl); + read_ctx->bp->bio_error = err; + g_io_deliver(read_ctx->bp, 0); + kmem_free(read_ctx, sizeof(zvol_read_context_t)); + refcount_release(&read_ctx_in_flight); + } +} + +static void zvol_dmu_cb(dmu_context_t *dmu_ctx) { - struct bio *bp = (struct bio *)dmu_ctx->dmu_cb_private; - int err = (dmu_ctx->err == 0) ? 0 : EIO; + zvol_read_context_t *read_ctx; - g_io_deliver(bp, err); + read_ctx = (zvol_read_context_t *)dmu_ctx->dmu_cb_private; + if (dmu_ctx->err != 0) + atomic_add_int(&read_ctx->err, dmu_ctx->err); + else + atomic_add_64(&read_ctx->bp->bio_completed, + dmu_ctx->completed_size); + zvol_dmu_read_release(read_ctx); } +#define ZVOL_ASYNC_READ 1 + int zvol_strategy(struct bio *bp) { @@ -1219,6 +1262,7 @@ int error = 0; boolean_t doread = (bp->bio_cmd == BIO_READ); boolean_t sync; + zvol_read_context_t *read_ctx = NULL; if (zv == NULL) { g_io_deliver(bp, ENXIO); @@ -1253,15 +1297,30 @@ rl = zfs_range_lock(&zv->zv_znode, off, resid, doread ? RL_READER : RL_WRITER); +#if ZVOL_ASYNC_READ + if (doread) { + /* XXX yet another malloc ... create a pool of these? */ + read_ctx = kmem_zalloc(sizeof(zvol_read_context_t), KM_SLEEP); + refcount_acquire(&read_ctx_in_flight); + atomic_add_64(&read_ctx_total, 1); + /* Initialize holds to include initiator. */ + refcount_init(&read_ctx->holds, 1); + read_ctx->rl = rl; + read_ctx->bp = bp; + } +#endif + while (resid != 0 && off < volsize) { size_t size = MIN(resid, zvol_maxphys); if (doread) { -#if 0 +#if ZVOL_ASYNC_READ + refcount_acquire(&read_ctx->holds); + dmu_read_cb(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH, zvol_dmu_cb, read_ctx); + /* XXX: no DMU context created??? */ +#else error = dmu_read_cb(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH, zvol_dmu_cb, bp); -#else - error = dmu_read(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH); + DMU_READ_PREFETCH, NULL, NULL); #endif } else { dmu_tx_t *tx = dmu_tx_create(os); @@ -1285,15 +1344,20 @@ addr += size; resid -= size; } - zfs_range_unlock(rl); + if (read_ctx == NULL) { + zfs_range_unlock(rl); - bp->bio_completed = bp->bio_length - resid; - if (bp->bio_completed < bp->bio_length) - bp->bio_error = (off > volsize ? EINVAL : error); + bp->bio_completed = bp->bio_length - resid; + if (bp->bio_completed < bp->bio_length) + bp->bio_error = (off > volsize ? EINVAL : error); - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - g_io_deliver(bp, 0); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + g_io_deliver(bp, 0); + } else { + /* Release initiator hold. */ + zvol_dmu_read_release(read_ctx); + } return (0); } Change 522622 by willa@willa_repo on 2012/01/26 14:42:32 A few fixes for zvol-dmu async reads, particularly in error path. Maintain an error count at the buffer set layer as well, so when all buffers exit, the buffer set knows whether to perform any I/O. Only bump the DMU context's completed size if it does. dmu_read_cb(): If an I/O error occurs at this level, report it to the callback instead of the caller, if one was passed in and a context was created (and therefore I/O was issued). zvol_strategy(): Do an extra release for a given I/O if issuing it failed, since dmu_read_cb() won't. 
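These error-path rules exist because delivery can happen on two different kinds of threads: a completion that finds a registered per-thread queue is deferred and drained later (the way zio_execute() drains via dmu_thread_context_process()), while an unregistered thread must dispatch inline. A userland toy of that thread-specific-data pattern, with pthread keys standing in for the kernel tsd_*() API and all names invented:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	static pthread_key_t	io_key;

	typedef struct item {
		struct item	*next;
		int		id;
	} item_t;

	static void
	dispatch(item_t *it)
	{
		printf("dispatching buffer set %d\n", it->id);
		free(it);
	}

	static void
	complete(int id)
	{
		item_t **queue = pthread_getspecific(io_key);
		item_t *it = malloc(sizeof (*it));

		it->id = id;
		it->next = NULL;
		if (queue == NULL) {
			dispatch(it);		/* unregistered thread: inline */
			return;
		}
		it->next = *queue;		/* registered: defer to drain() */
		*queue = it;			/* (LIFO for brevity; real list is FIFO) */
	}

	static void
	drain(void)
	{
		item_t **queue = pthread_getspecific(io_key);

		while (queue != NULL && *queue != NULL) {
			item_t *it = *queue;
			*queue = it->next;
			dispatch(it);
		}
	}

	int
	main(void)
	{
		item_t *queue = NULL;

		(void) pthread_key_create(&io_key, NULL);
		complete(1);			/* inline: no queue registered yet */
		(void) pthread_setspecific(io_key, &queue);
		complete(2);			/* deferred until drain() */
		complete(3);
		drain();
		return (0);
	}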
Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#28 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#13 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#10 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#28 (text) ==== @@ -856,12 +856,17 @@ { int child, i; - /* XXX only supports reads for now */ - ASSERT(buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ); - if (buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ) { - dmu_buf_set_read(buf_set); - /* XXX implement uio version? */ - } + /* Only perform I/O if no errors occurred for the buffer set. */ + if (buf_set->err == 0) { + /* XXX only supports reads for now */ + ASSERT(buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ); + if (buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ) { + dmu_buf_set_read(buf_set); + /* XXX implement uio version? */ + } + atomic_add_64(&buf_set->dmu_ctx->completed_size, buf_set->size); + } else + atomic_add_int(&buf_set->dmu_ctx->err, buf_set->err); for (i = 0; i < buf_set->count; i++) { /* May be called from error case, where dbp[i] may be NULL. */ @@ -874,8 +879,6 @@ ASSERT(buf_set_in_flight > 0); refcount_release(&buf_set_in_flight); - atomic_add_64(&buf_set->dmu_ctx->completed_size, buf_set->size); - /* * Check to see if this buffer set was allocated with the context. * If it was, it will be freed by dmu_context_rele(). This ensures @@ -969,7 +972,7 @@ /* Report an error, if any. */ if (err) - atomic_add_int(&buf_set->dmu_ctx->err, 1); + atomic_add_int(&buf_set->err, 1); /* If we are finished, schedule this buffer set for delivery. */ ASSERT(buf_set->holds > 0); @@ -1010,6 +1013,7 @@ if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) dbuf_flags |= DB_RF_NOPREFETCH; + /* Figure out the number of blocks needed for the buffer set. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; @@ -1029,6 +1033,7 @@ } nblks = 1; } + if (*dmu_ctx_p == NULL) { /* Create the DMU context AND the buffer set. */ *dmu_ctx_p = kmem_zalloc( @@ -1063,12 +1068,12 @@ refcount_acquire(&buf_set_in_flight); atomic_add_64(&buf_set_total, 1); buf_set = *buf_set_p; + buf_set->offset = offset; + buf_set->data_buf = buf; buf_set->count = nblks; /* Include a refcount for the initiator. */ refcount_init(&buf_set->holds, nblks + 1); buf_set->dmu_ctx = dmu_ctx; - buf_set->offset = offset; - buf_set->data_buf = buf; if (dn->dn_objset->os_dsl_dataset) dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; @@ -1189,6 +1194,8 @@ bzero((char *)buf + newsz, size - newsz); size = newsz; } + if (size <= 0) + err = EINVAL; while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; @@ -1210,10 +1217,21 @@ /* Release initiator hold. */ dmu_buf_set_rele(buf_set, NULL, B_FALSE); } - /* DMU context can be NULL if the first hold returned an error. */ + /* + * Either the caller didn't specify a callback, or a context was + * created, or we're returning an error to the caller. + */ + ASSERT(dmu_cb == NULL || dmu_ctx != NULL || err != 0); if (dmu_ctx != NULL) { - if (err) + /* + * If an I/O error occurred, tell the callback instead + * of caller. + */ + if (err && dmu_cb != NULL) { atomic_add_int(&dmu_ctx->err, 1); + err = 0; + } + /* Release initiator hold. 
*/ dmu_context_rele(dmu_ctx); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#13 (text) ==== @@ -271,6 +271,9 @@ uint64_t offset; uint64_t size; + /** The number of errors that occurred. */ + int err; + /** The set of buffers themselves. */ struct dmu_buf *dbp[0]; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#10 (text) ==== @@ -1315,7 +1315,7 @@ if (doread) { #if ZVOL_ASYNC_READ refcount_acquire(&read_ctx->holds); - dmu_read_cb(os, ZVOL_OBJ, off, size, addr, + error = dmu_read_cb(os, ZVOL_OBJ, off, size, addr, DMU_READ_PREFETCH, zvol_dmu_cb, read_ctx); /* XXX: no DMU context created??? */ #else @@ -1355,6 +1355,9 @@ zil_commit(zv->zv_zilog, ZVOL_OBJ); g_io_deliver(bp, 0); } else { + /* If an error is returned, no I/O was initiated. */ + if (error) + zvol_dmu_read_release(read_ctx); /* Release initiator hold. */ zvol_dmu_read_release(read_ctx); } Change 522636 by willa@willa_repo on 2012/01/26 22:42:44 Fix a few more bugs in async read paths. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: - Add back the old implementation of dmu_read() for use in comparison with behavior of dmu_read_cb(). Make it possible to have dmu_read() call dmu_read_cb() (with NULL callback/private) via sysctl. - Fix the interface between dmu_buf_hold_array_by_dnode_cb() and dmu_read_cb() so that hold_array is responsible for updating the "next" offset and remaining size left to be processed. The benefit is that the interface for dmu_read_cb() is cleaner, so now it just loops on hold_array until its size goes to 0. - Fix a few bugs in dmu_buf_hold_array_by_dnode_cb(): - If a dbuf_hold() call fails, make sure to decrement the refcount and buffer set count by the number of buffers not processed, so that when the buffer set holds drop to 0, it is cleaned up. - Issue the async read on a dbuf prior to performing any checks on it; this will avoid dbuf callbacks into DMU in the case of hole reads or cache hits. - When releasing a dbuf immediately, buffer set holds should be > 1. - Fix dmu_read_cb() so that it now guarantees that if a callback is specified, it will be called, rain or shine. In the event a DMU context is not created on the heap, one will be constructed on the stack (with only private & error data) and passed directly to the callback. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c: - zvol_strategy(): Remove the read_ctx error release call now that dmu_read_cb() guarantees that the callback will be called. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#29 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#11 edit Differences ... 
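The "rain or shine" guarantee described above can be pictured as follows. The names are invented and this is only a sketch of the shape of the fix: when setup fails before any heap context exists, a minimal context carrying just the error and private fields is built on the stack, so the caller's callback still runs exactly once:

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	typedef struct sketch_ctx {
		int	err;
		void	*cb_private;
	} sketch_ctx_t;

	typedef void (*sketch_cb_t)(sketch_ctx_t *);

	static int
	sketch_setup(sketch_ctx_t **ctxp)
	{
		*ctxp = NULL;		/* pretend allocation/validation failed */
		return (EINVAL);	/* early error, before any I/O issued */
	}

	static int
	sketch_read(sketch_cb_t cb, void *priv)
	{
		sketch_ctx_t *ctx = NULL;
		int err = sketch_setup(&ctx);

		if (err != 0 && ctx == NULL && cb != NULL) {
			/* No heap context: report through a stack one. */
			sketch_ctx_t stack_ctx = { err, priv };
			cb(&stack_ctx);
			return (0);	/* error was delivered via the callback */
		}
		return (err);
	}

	static void
	on_done(sketch_ctx_t *ctx)
	{
		printf("callback: err=%d\n", ctx->err);
	}

	int
	main(void)
	{
		return (sketch_read(on_done, NULL));
	}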
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#29 (text) ==== @@ -112,14 +112,19 @@ SYSCTL_INT(_vfs_zfs_dmu, OID_AUTO, name, CTLFLAG_RD, \ &name, 0, desc) -SYSCTL_COUNTER_U(dbufs_already_cached, "number of dbufs already cached"); SYSCTL_COUNTER_U(hold_array_early_errors, "hold_array early errors"); SYSCTL_COUNTER_U(hold_array_dbuf_hold_errors, "hold_array dbuf_hold errors"); SYSCTL_COUNTER_U(dmu_ctx_total, "total number of DMU contexts"); SYSCTL_COUNTER_U(buf_set_total, "total number of buffer sets"); +SYSCTL_COUNTER_U(hold_array_dbuf_errors, "Dbuf errors in hold_array"); +SYSCTL_COUNTER_U(hold_array_zio_wait_errors, "zio_wait errors in hold_array"); SYSCTL_REFCOUNT(dmu_ctx_in_flight, "number of DMU contexts in flight"); SYSCTL_REFCOUNT(buf_set_in_flight, "number of buffer sets in flight"); +uint_t dmu_read_using_async; +SYSCTL_INT(_vfs_zfs_dmu, OID_AUTO, dmu_read_using_async, CTLFLAG_RW, + &dmu_read_using_async, 0, "DMU reads always using async version"); + /** * \brief Obtain the DMU buffer from the specified object which contains the * specified offset. @@ -458,6 +463,7 @@ dp->dp_read_overhead += gethrtime() - start; if (err) { dmu_buf_rele_array(dbp, nblks, tag); + atomic_add_64(&hold_array_zio_wait_errors, 1); return (err); } @@ -994,9 +1000,9 @@ } int -dmu_buf_hold_array_by_dnode_cb(dnode_t *dn, uint64_t offset, uint64_t length, - int read, void *tag, dmu_callback_t dmu_cb, void *priv, void *buf, - dmu_context_t **dmu_ctx_p, dmu_buf_set_t **buf_set_p, uint32_t flags) +dmu_buf_hold_array_by_dnode_cb(dnode_t *dn, uint64_t *offset, uint64_t *length, + int read, void *tag, dmu_callback_t dmu_cb, void *priv, void **buf, + dmu_context_t **dmu_ctx_p, uint32_t flags) { dsl_pool_t *dp = NULL; uint64_t blkid, nblks, i, avail_size; @@ -1007,27 +1013,28 @@ zio_t *zio; hrtime_t start; - ASSERT(length <= DMU_MAX_ACCESS); + /* Determine the actual size this I/O set will try to perform. */ + avail_size = MIN(*length, DMU_MAX_ACCESS); dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; - if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) + if (flags & DMU_READ_NO_PREFETCH || avail_size > zfetch_array_rd_sz) dbuf_flags |= DB_RF_NOPREFETCH; /* Figure out the number of blocks needed for the buffer set. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset+length, 1ULL<> blkshift; + nblks = (P2ROUNDUP(*offset+avail_size, 1ULL<> blkshift; } else { - if (offset + length > dn->dn_datablksz) { + if ((*offset + avail_size) > dn->dn_datablksz) { atomic_add_64(&hold_array_early_errors, 1); zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, - (longlong_t)offset, (longlong_t)length); + (longlong_t)*offset, (longlong_t)avail_size); rw_exit(&dn->dn_struct_rwlock); return (EIO); } @@ -1042,7 +1049,7 @@ refcount_acquire(&dmu_ctx_in_flight); atomic_add_64(&dmu_ctx_total, 1); dmu_ctx = *dmu_ctx_p; - *buf_set_p = (dmu_buf_set_t *) + buf_set = (dmu_buf_set_t *) ((char *)*dmu_ctx_p + sizeof(dmu_context_t)); /* Initialize a new DMU context. */ @@ -1058,7 +1065,7 @@ } else { /* Create only the new buffer set. 
*/ dmu_ctx = *dmu_ctx_p; - *buf_set_p = kmem_zalloc( + buf_set = kmem_zalloc( sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *), KM_SLEEP); refcount_acquire(&dmu_ctx->holds); @@ -1067,9 +1074,8 @@ /* Initialize a new buffer set. */ refcount_acquire(&buf_set_in_flight); atomic_add_64(&buf_set_total, 1); - buf_set = *buf_set_p; - buf_set->offset = offset; - buf_set->data_buf = buf; + buf_set->offset = *offset; + buf_set->data_buf = *buf; buf_set->count = nblks; /* Include a refcount for the initiator. */ refcount_init(&buf_set->holds, nblks + 1); @@ -1080,28 +1086,22 @@ if (dp && dsl_pool_sync_context(dp)) start = gethrtime(); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - blkid = dbuf_whichblock(dn, offset); - avail_size = length; + blkid = dbuf_whichblock(dn, *offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); int bufoff, bufsiz; if (db == NULL) { + /* Fix up refcount & count. */ + for (;i < nblks;i++) + refcount_release(&buf_set->holds); + buf_set->count -= (nblks - i); rw_exit(&dn->dn_struct_rwlock); dmu_buf_set_rele(buf_set, NULL, B_TRUE); atomic_add_64(&hold_array_dbuf_hold_errors, 1); zio_nowait(zio); return (EIO); } - /* Calculate the amount of data this buffer contributes. */ - ASSERT(offset >= db->db.db_offset); - bufoff = offset - db->db.db_offset; - bufsiz = (int)MIN(db->db.db_size - bufoff, avail_size); - buf_set->size += bufsiz; - ASSERT(buf_set->size <= length); - offset += bufsiz; - avail_size -= bufsiz; - /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); @@ -1110,6 +1110,17 @@ curthread->td_ru.ru_oublock++; #endif + /* Calculate the amount of data this buffer contributes. */ + ASSERT(*offset >= db->db.db_offset); + bufoff = *offset - db->db.db_offset; + bufsiz = (int)MIN(db->db.db_size - bufoff, avail_size); + buf_set->size += bufsiz; + avail_size -= bufsiz; + /* Update the caller's data to let them know what's next. */ + *offset += bufsiz; + *length -= bufsiz; + *buf = (char *)*buf + bufsiz; + /* Make sure dbufs that don't notify DMU are caught here. */ mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { @@ -1118,8 +1129,7 @@ * for DB_UNCACHED here because that only * indicates an initialized buffer. */ - atomic_add_64(&dbufs_already_cached, 1); - ASSERT(buf_set->holds > 0); + ASSERT(buf_set->holds > 1); refcount_release(&buf_set->holds); } else { /* Let the dbuf know this DMU context needs it. */ @@ -1145,6 +1155,7 @@ dp->dp_read_overhead += gethrtime() - start; if (err) { dmu_buf_set_rele(buf_set, NULL, B_TRUE); + atomic_add_64(&hold_array_zio_wait_errors, 1); return (err); } @@ -1160,6 +1171,8 @@ err = EIO; mutex_exit(&db->db_mtx); if (err) { + atomic_add_64( + &hold_array_dbuf_errors, 1); dmu_buf_set_rele(buf_set, NULL, B_TRUE); return (err); } @@ -1167,6 +1180,9 @@ } } + /* Release the initiator hold. 
*/ + dmu_buf_set_rele(buf_set, NULL, B_FALSE); + return (0); } @@ -1174,14 +1190,13 @@ dmu_read_cb(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags, dmu_callback_t dmu_cb, void *priv) { - dnode_t *dn; + dnode_t *dn = NULL; dmu_context_t *dmu_ctx = NULL; - dmu_buf_set_t *buf_set; int numbufs, err; err = dnode_hold(os, object, FTAG, &dn); if (err) - return (err); + goto out; /* * Deal with odd block sizes, where there can't be data past the first @@ -1194,34 +1209,19 @@ bzero((char *)buf + newsz, size - newsz); size = newsz; } - if (size <= 0) - err = EINVAL; while (size > 0) { - uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); - int i; - /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ - err = dmu_buf_hold_array_by_dnode_cb(dn, offset, mylen, - TRUE, FTAG, dmu_cb, priv, buf, &dmu_ctx, &buf_set, flags); + err = dmu_buf_hold_array_by_dnode_cb(dn, &offset, &size, + TRUE, FTAG, dmu_cb, priv, &buf, &dmu_ctx, flags); if (err) break; + } - offset += buf_set->size; - size -= buf_set->size; - buf = (char *)buf + buf_set->size; - - /* Release initiator hold. */ - dmu_buf_set_rele(buf_set, NULL, B_FALSE); - } - /* - * Either the caller didn't specify a callback, or a context was - * created, or we're returning an error to the caller. - */ - ASSERT(dmu_cb == NULL || dmu_ctx != NULL || err != 0); +out: if (dmu_ctx != NULL) { /* * If an I/O error occurred, tell the callback instead @@ -1231,11 +1231,26 @@ atomic_add_int(&dmu_ctx->err, 1); err = 0; } + ASSERT(dmu_cb != NULL || dmu_ctx->holds == 1); /* Release initiator hold. */ dmu_context_rele(dmu_ctx); + } else if (dmu_cb != NULL) { + dmu_context_t tmp; + + /* + * No context created means no I/O initiated. But the + * callback still expects to be notified. + */ + bzero(&tmp, sizeof(dmu_context_t)); + tmp.dmu_cb_private = priv; + tmp.err = err; + err = 0; + dmu_cb(&tmp); } - dnode_rele(dn, FTAG); + if (dn != NULL) + dnode_rele(dn, FTAG); + return (err); } @@ -1243,7 +1258,63 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { - return (dmu_read_cb(os, object, offset, size, buf, flags, NULL, NULL)); + dnode_t *dn; + int err, numbufs; + dmu_buf_t **dbp; + + if (dmu_read_using_async) + return (dmu_read_cb(os, object, offset, size, buf, flags, + NULL, NULL)); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + /* + * Deal with odd block sizes, where there can't be data past the first + * block. If we ever do the tail block optimization, we will need to + * handle that here as well. + */ + if (dn->dn_maxblkid == 0) { + int newsz = offset > dn->dn_datablksz ? 0 : + MIN(size, dn->dn_datablksz - offset); + bzero((char *)buf + newsz, size - newsz); + size = newsz; + } + + while (size > 0) { + uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); + int i; + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. 
+ */ + err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, + TRUE, FTAG, &numbufs, &dbp, flags); + if (err) + break; + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + bcopy((char *)db->db_data + bufoff, buf, tocpy); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + } + dnode_rele(dn, FTAG); + return (err); } void ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#11 (text) ==== @@ -1319,8 +1319,8 @@ DMU_READ_PREFETCH, zvol_dmu_cb, read_ctx); /* XXX: no DMU context created??? */ #else - error = dmu_read_cb(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH, NULL, NULL); + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); #endif } else { dmu_tx_t *tx = dmu_tx_create(os); @@ -1355,9 +1355,6 @@ zil_commit(zv->zv_zilog, ZVOL_OBJ); g_io_deliver(bp, 0); } else { - /* If an error is returned, no I/O was initiated. */ - if (error) - zvol_dmu_read_release(read_ctx); /* Release initiator hold. */ zvol_dmu_read_release(read_ctx); } Change 522655 by willa@willa_repo on 2012/01/27 10:19:09 Allow sysctl to control whether zvol reads are async, too. Enable by default both async zvol reads and use of the async implementation (with NULL callback/private) for dmu_read() callers. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#30 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#12 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#30 (text) ==== @@ -121,7 +121,7 @@ SYSCTL_REFCOUNT(dmu_ctx_in_flight, "number of DMU contexts in flight"); SYSCTL_REFCOUNT(buf_set_in_flight, "number of buffer sets in flight"); -uint_t dmu_read_using_async; +uint_t dmu_read_using_async = 1; SYSCTL_INT(_vfs_zfs_dmu, OID_AUTO, dmu_read_using_async, CTLFLAG_RW, &dmu_read_using_async, 0, "DMU reads always using async version"); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#12 (text) ==== @@ -105,6 +105,10 @@ SYSCTL_COUNTER_U(read_ctx_total, "total number of read contexts"); SYSCTL_REFCOUNT(read_ctx_in_flight, "number of read contexts in flight"); +uint_t zvol_async_read = 1; +SYSCTL_INT(_vfs_zfs_zvol, OID_AUTO, zvol_async_read, CTLFLAG_RW, + &zvol_async_read, 0, "ZVOL reads use asynchronous DMU calls"); + /** * The zfsdev_state structure is protected by spa_namespace_lock from being * modified while it's being used, e.g. an open that comes in before a @@ -1248,8 +1252,6 @@ zvol_dmu_read_release(read_ctx); } -#define ZVOL_ASYNC_READ 1 - int zvol_strategy(struct bio *bp) { @@ -1297,8 +1299,7 @@ rl = zfs_range_lock(&zv->zv_znode, off, resid, doread ? RL_READER : RL_WRITER); -#if ZVOL_ASYNC_READ - if (doread) { + if (zvol_async_read && doread) { /* XXX yet another malloc ... create a pool of these? 
*/ read_ctx = kmem_zalloc(sizeof(zvol_read_context_t), KM_SLEEP); refcount_acquire(&read_ctx_in_flight); @@ -1308,20 +1309,18 @@ read_ctx->rl = rl; read_ctx->bp = bp; } -#endif while (resid != 0 && off < volsize) { size_t size = MIN(resid, zvol_maxphys); if (doread) { -#if ZVOL_ASYNC_READ - refcount_acquire(&read_ctx->holds); - error = dmu_read_cb(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH, zvol_dmu_cb, read_ctx); - /* XXX: no DMU context created??? */ -#else - error = dmu_read(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH); -#endif + if (zvol_async_read) { + refcount_acquire(&read_ctx->holds); + dmu_read_cb(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH, zvol_dmu_cb, read_ctx); + } else { + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); + } } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); Change 522669 by willa@willa_repo on 2012/01/27 12:35:19 Whitespace: Make these lines 80 characters or less. Create DN_NEXT_LEVEL() macro to wrap some of the checks. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#62 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h#5 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#62 (text) ==== @@ -2019,9 +2019,9 @@ ASSERT3U(dn->dn_nlevels, >, db->db_level); ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || - dn->dn_next_nlevels[txgoff] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); + DN_NEXT_LEVEL(dn, tx->tx_txg) > db->db_level || + DN_NEXT_LEVEL(dn, tx->tx_txg - 1) > db->db_level || + DN_NEXT_LEVEL(dn, tx->tx_txg - 2) > db->db_level); /* * We should only be dirtying in syncing context if it's the @@ -2031,7 +2031,8 @@ * this assertion only if we're not already dirty. */ os = dn->dn_objset; - ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + ASSERT(!dmu_tx_is_syncing(tx) || + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); @@ -2045,7 +2046,7 @@ * (and possibly deadlocking) in bp_get_dsize() while * also holding the db_mtx. * - * XXX Shouldn't this conditional check for SPILL blkid too? + * XXX Shouldn't this conditional ignore SPILL too? */ dnode_willuse_space(dn, db->db.db_size, tx); do_free_accounting = dbuf_block_freeable(db); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h#5 (text) ==== @@ -99,6 +99,10 @@ #define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) /** \} */ +/* Next level for a given dnode and txg */ +#define DN_NEXT_LEVEL(dn, txg) \ + (dn)->dn_next_nlevels[(txg) & TXG_MASK] + /* The +2 here is a cheesy way to round up */ #define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) Change 522722 by willa@willa_repo on 2012/01/27 15:49:57 Add Spectra Logic copyrights to these files. Files that had only comment changes were left out. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#10 edit ...
//depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#63 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#31 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#8 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#9 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#9 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#27 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#14 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h#6 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#9 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#13 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h#3 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c#3 (text) ==== @@ -2,6 +2,8 @@ * Copyright (c) 2009 Pawel Jakub Dawidek * All rights reserved. * + * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#10 (text) ==== @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ /** ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#63 (text) ==== @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ #include ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#31 (text) ==== @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ #include ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#8 (text) ==== @@ -23,6 +23,7 @@ */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ #include ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#9 (text) ==== @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. 
*/ #include ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#9 (text) ==== @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ /** ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h#5 (text) ==== @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ #ifndef _SYS_ARC_H ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#27 (text) ==== @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ #ifndef _SYS_DBUF_H ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#14 (text) ==== @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h#6 (text) ==== @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012, Spectra Logic Corporation. All rights reserved. */ #ifndef _SYS_DNODE_H ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h#3 (text) ==== @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012, Spectra Logic Corporation. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#9 (text) ==== @@ -25,6 +25,7 @@ * Portions Copyright 2011 Martin Matuska * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2011-2012, Spectra Logic Corporation. All rights reserved. */ #include ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#13 (text) ==== @@ -23,6 +23,8 @@ * * Copyright (c) 2006-2010 Pawel Jakub Dawidek * All rights reserved. + * + * Copyright (c) 2011-2012, Spectra Logic Corporation. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h#3 (text) ==== @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved. */ #ifndef _SYS_TASKQ_H Change 523039 by willa@willa_repo on 2012/01/29 16:58:05 Break up dmu_buf_hold_array_by_dnode_cb() into smaller pieces. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#32 edit Differences ... 
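In outline, the refactored function becomes a thin driver over three helpers, one per phase; the condensed sequence below mirrors the diff that follows (all names from the diff; error unwinding elided). Keeping the hold/issue phase separate from the wait phase is what lets asynchronous callers skip the wait entirely:

    /* Phase 1: create or extend the DMU context and its buffer set. */
    dmu_buf_array_init(dmu_ctx_p, &buf_set, buf, offset, tag, nblks,
        dmu_cb, priv, read);

    /* Phase 2: hold each dbuf and issue its read under struct_rwlock. */
    zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
    err = dmu_buf_hold_array_setup_buffers(dmu_ctx, buf_set, dn, nblks,
        offset, length, buf, dbuf_flags, zio, avail_size);
    rw_exit(&dn->dn_struct_rwlock);

    /* Phase 3: wait for sync callers; async callers return at once. */
    err = dmu_buf_hold_array_process_io(buf_set, zio, dp, start);

    /* Release the initiator hold on the buffer set. */
    dmu_buf_set_rele(buf_set, NULL, B_FALSE);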
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#32 (text) ==== @@ -1000,48 +1000,14 @@ } } -int -dmu_buf_hold_array_by_dnode_cb(dnode_t *dn, uint64_t *offset, uint64_t *length, - int read, void *tag, dmu_callback_t dmu_cb, void *priv, void **buf, - dmu_context_t **dmu_ctx_p, uint32_t flags) +static void +dmu_buf_array_init(dmu_context_t **dmu_ctx_p, dmu_buf_set_t **buf_set_p, + void **buf, uint64_t *offset, void *tag, int nblks, dmu_callback_t dmu_cb, + void *priv, int read) { - dsl_pool_t *dp = NULL; - uint64_t blkid, nblks, i, avail_size; - uint32_t dbuf_flags; dmu_context_t *dmu_ctx; dmu_buf_set_t *buf_set; - int err; - zio_t *zio; - hrtime_t start; - - /* Determine the actual size this I/O set will try to perform. */ - avail_size = MIN(*length, DMU_MAX_ACCESS); - dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; - if (flags & DMU_READ_NO_PREFETCH || avail_size > zfetch_array_rd_sz) - dbuf_flags |= DB_RF_NOPREFETCH; - - /* Figure out the number of blocks needed for the buffer set. */ - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_datablkshift) { - int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(*offset+avail_size, 1ULL<> blkshift; - } else { - if ((*offset + avail_size) > dn->dn_datablksz) { - atomic_add_64(&hold_array_early_errors, 1); - zfs_panic_recover("zfs: accessing past end of object " - "%llx/%llx (size=%u access=%llu+%llu)", - (longlong_t)dn->dn_objset-> - os_dsl_dataset->ds_object, - (longlong_t)dn->dn_object, dn->dn_datablksz, - (longlong_t)*offset, (longlong_t)avail_size); - rw_exit(&dn->dn_struct_rwlock); - return (EIO); - } - nblks = 1; - } - if (*dmu_ctx_p == NULL) { /* Create the DMU context AND the buffer set. */ *dmu_ctx_p = kmem_zalloc( @@ -1081,15 +1047,20 @@ /* Include a refcount for the initiator. */ refcount_init(&buf_set->holds, nblks + 1); buf_set->dmu_ctx = dmu_ctx; + *buf_set_p = buf_set; +} - if (dn->dn_objset->os_dsl_dataset) - dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; - if (dp && dsl_pool_sync_context(dp)) - start = gethrtime(); - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); +static int +dmu_buf_hold_array_setup_buffers(dmu_context_t *dmu_ctx, dmu_buf_set_t *buf_set, + dnode_t *dn, uint64_t nblks, uint64_t *offset, uint64_t *length, + void **buf, int dbuf_flags, zio_t *zio, uint64_t avail_size) +{ + uint64_t blkid; + int i; + blkid = dbuf_whichblock(dn, *offset); for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); + dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, dmu_ctx->tag); int bufoff, bufsiz; if (db == NULL) { @@ -1104,7 +1075,18 @@ return (EIO); } /* initiate async i/o */ - if (read) +#if 0 + { + int prefetch = db->db_level == 0 && + db->db_blkid != DMU_BONUS_BLKID && + (dbuf_flags & DB_RF_NOPREFETCH) == 0 && + dn != NULL && DBUF_IS_CACHEABLE(db); + + printf("hold_array read db=%p prefetch=%d\n", + db, prefetch); + } +#endif + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) (void) dbuf_read(db, zio, dbuf_flags); #ifdef _KERNEL else @@ -1140,47 +1122,123 @@ mutex_exit(&db->db_mtx); buf_set->dbp[i] = &db->db; } - rw_exit(&dn->dn_struct_rwlock); + return 0; +} + +static int +dmu_buf_hold_array_process_io(dmu_buf_set_t *buf_set, zio_t *zio, + dsl_pool_t *dp, hrtime_t start) +{ + int err, i; /* * If a callback is specified, issue the I/O's without waiting. * The dbufs will be responsible for cleaning up. 
*/ - if (dmu_cb != NULL) + if (buf_set->dmu_ctx->dmu_cb != NULL) { zio_nowait(zio); - else { - /* wait for async i/o */ - err = zio_wait(zio); - /* track read overhead when we are in sync context */ - if (dp && dsl_pool_sync_context(dp)) - dp->dp_read_overhead += gethrtime() - start; + return (0); + } + + /* wait for async i/o */ + err = zio_wait(zio); + /* track read overhead when we are in sync context */ + if (dp && dsl_pool_sync_context(dp)) + dp->dp_read_overhead += gethrtime() - start; + if (err) { + dmu_buf_set_rele(buf_set, NULL, B_TRUE); + atomic_add_64(&hold_array_zio_wait_errors, 1); + return (err); + } + + if ((buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) + return (0); + + /* wait for other io to complete */ + for (i = 0; i < buf_set->count; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)buf_set->dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state & (DB_READ|DB_FILL)) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = EIO; + mutex_exit(&db->db_mtx); if (err) { + atomic_add_64(&hold_array_dbuf_errors, 1); dmu_buf_set_rele(buf_set, NULL, B_TRUE); - atomic_add_64(&hold_array_zio_wait_errors, 1); return (err); } + } + return (0); +} - /* wait for other io to complete */ - if (read) { - for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = - (dmu_buf_impl_t *)buf_set->dbp[i]; - mutex_enter(&db->db_mtx); - while (db->db_state & (DB_READ|DB_FILL)) - cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_UNCACHED) - err = EIO; - mutex_exit(&db->db_mtx); - if (err) { - atomic_add_64( - &hold_array_dbuf_errors, 1); - dmu_buf_set_rele(buf_set, NULL, B_TRUE); - return (err); - } - } +int +dmu_buf_hold_array_by_dnode_cb(dnode_t *dn, uint64_t *offset, uint64_t *length, + int read, void *tag, dmu_callback_t dmu_cb, void *priv, void **buf, + dmu_context_t **dmu_ctx_p, uint32_t flags) +{ + dsl_pool_t *dp = NULL; + uint64_t blkid, nblks, i, avail_size; + uint32_t dbuf_flags; + dmu_context_t *dmu_ctx; + dmu_buf_set_t *buf_set; + int err; + zio_t *zio; + hrtime_t start; + + /* Determine the actual size this I/O set will try to perform. */ + avail_size = MIN(*length, DMU_MAX_ACCESS); + + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; + if (flags & DMU_READ_NO_PREFETCH || avail_size > zfetch_array_rd_sz) + dbuf_flags |= DB_RF_NOPREFETCH; + + /* Figure out the number of blocks needed for the buffer set. */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(*offset+avail_size, 1ULL<> blkshift; + } else { + if ((*offset + avail_size) > dn->dn_datablksz) { + atomic_add_64(&hold_array_early_errors, 1); + zfs_panic_recover("zfs: accessing past end of object " + "%llx/%llx (size=%u access=%llu+%llu)", + (longlong_t)dn->dn_objset-> + os_dsl_dataset->ds_object, + (longlong_t)dn->dn_object, dn->dn_datablksz, + (longlong_t)*offset, (longlong_t)avail_size); + rw_exit(&dn->dn_struct_rwlock); + return (EIO); } + nblks = 1; } + /* Initialize the buffer set and context, if necessary. */ + dmu_buf_array_init(dmu_ctx_p, &buf_set, buf, offset, tag, nblks, + dmu_cb, priv, read); + dmu_ctx = *dmu_ctx_p; + ASSERT(dmu_ctx != NULL && buf_set != NULL); + + if (dn->dn_objset->os_dsl_dataset) + dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; + if (dp && dsl_pool_sync_context(dp)) + start = gethrtime(); + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + /* Set up the buffers. 
*/ + err = dmu_buf_hold_array_setup_buffers(dmu_ctx, buf_set, dn, nblks, + offset, length, buf, dbuf_flags, zio, avail_size); + if (err) + return (err); + + rw_exit(&dn->dn_struct_rwlock); + + /* Process the I/O requests. */ + err = dmu_buf_hold_array_process_io(buf_set, zio, dp, start); + if (err) + return (err); + /* Release the initiator hold. */ dmu_buf_set_rele(buf_set, NULL, B_FALSE); Change 523995 by willa@willa_repo on 2012/01/30 14:07:45 Explain bio.bio_cmd and bio.bio_flags better in comments. Affected files ... ... //depot/branches/redline/projects/cow/sys/sys/bio.h#2 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/sys/bio.h#2 (text) ==== @@ -41,19 +41,23 @@ #include /* bio_cmd */ -#define BIO_READ 0x01 -#define BIO_WRITE 0x02 -#define BIO_DELETE 0x04 -#define BIO_GETATTR 0x08 -#define BIO_FLUSH 0x10 +#define BIO_READ 0x01 /* Read I/O data */ +#define BIO_WRITE 0x02 /* Write I/O data */ +#define BIO_DELETE 0x04 /* TRIM or free blocks, i.e. mark as unused */ +#define BIO_GETATTR 0x08 /* Get GEOM attributes of object */ +#define BIO_FLUSH 0x10 /* Commit outstanding I/O now */ #define BIO_CMD0 0x20 /* Available for local hacks */ #define BIO_CMD1 0x40 /* Available for local hacks */ #define BIO_CMD2 0x80 /* Available for local hacks */ /* bio_flags */ -#define BIO_ERROR 0x01 -#define BIO_DONE 0x02 -#define BIO_ONQUEUE 0x04 +#define BIO_ERROR 0x01 /* An error occurred processing this bio. */ +#define BIO_DONE 0x02 /* This bio is finished. */ +#define BIO_ONQUEUE 0x04 /* This bio is in a queue & not yet taken. */ +/* + * This bio must be executed after all previous bios in the queue have been + * executed, and before any successive bios can be executed. + */ #define BIO_ORDERED 0x08 #ifdef _KERNEL Change 524137 by willa@willa_repo on 2012/01/31 11:49:18 Extend dmu_buf_set_dispatch() to support writes and UIOs. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#33 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#15 edit Differences ... 
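At its core the generalized transfer is a per-buffer direction switch: reads copy out of db_data into the caller's buffer, while writes must announce the dirty range to the DMU before copying in, then signal fill completion. A condensed sketch of one loop iteration, as it appears in the dmu.c diff below (UIO/xuio handling omitted):

    bufoff = offset - db->db_offset;
    tocpy = (int)MIN(db->db_size - bufoff, size);

    if (dmu_ctx->flags & DMU_CTX_FLAG_READ) {
            src = (char *)db->db_data + bufoff;
            dst = buf;
    } else {
            /* Declare the write before touching db_data. */
            if (tocpy == db->db_size)
                    dmu_buf_will_fill(db, tx);
            else
                    dmu_buf_will_dirty_range(db, tx, bufoff, tocpy);
            src = buf;
            dst = (char *)db->db_data + bufoff;
    }

    bcopy(src, dst, tocpy);

    if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0)
            dmu_buf_fill_done(db, tx);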
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#33 (text) ==== @@ -798,30 +798,111 @@ kmem_free(dcn, sizeof(dmu_context_node_t)); } +static void +dmu_buf_set_transfer_uio(dmu_buf_set_t *buf_set) +{ +#ifdef _KERNEL + uio_t *uio = buf_set->data_buf; + xuio_t *xuio = NULL; + enum uio_rw dir = UIO_WRITE; + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + dmu_tx_t *tx = dmu_ctx->tx; + uint64_t size = buf_set->size; + uint64_t offset = buf_set->offset; + int i; + +#ifdef UIO_XUIO + if (uio->uio_extflg == UIO_XUIO) + xuio = (xuio_t *)uio; +#endif + + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) + dir = UIO_READ; + + for (i = 0; i < buf_set->count; i++) { + int bufoff; + int tocpy; + dmu_buf_t *db = buf_set->dbp[i]; + + bufoff = uio->uio_loffset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + if (dir == UIO_WRITE) { + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); + } + + if (xuio && dir == UIO_READ) { + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); + int err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); + if (!err) { + uio->uio_resid -= tocpy; + uio->uio_loffset += tocpy; + } + + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); + } else { + uiomove((char *)db->db_data + bufoff, tocpy, dir, uio); + } + size -= tocpy; + } +#endif +} + /** * \brief Perform a buffer set read for a char * target buffer. * * \param buf_set Buffer set to read. */ static void -dmu_buf_set_read(dmu_buf_set_t *buf_set) +dmu_buf_set_transfer(dmu_buf_set_t *buf_set) { char *buf = (char *)buf_set->data_buf; uint64_t size = buf_set->size; uint64_t offset = buf_set->offset; + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + dmu_tx_t *tx = dmu_ctx->tx; int i; + if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { + dmu_buf_set_transfer_uio(buf_set); + return; + } + for (i = 0; i < buf_set->count; i++) { int bufoff; int tocpy; dmu_buf_t *db = buf_set->dbp[i]; + char *src, *dst; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - bcopy((char *)db->db_data + bufoff, buf, tocpy); + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) { + src = (char *)db->db_data + bufoff; + dst = buf; + } else { + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); + src = buf; + dst = (char *)db->db_data + bufoff; + } + + bcopy(src, dst, tocpy); + + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) + dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; @@ -865,12 +946,7 @@ /* Only perform I/O if no errors occurred for the buffer set. */ if (buf_set->err == 0) { - /* XXX only supports reads for now */ - ASSERT(buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ); - if (buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ) { - dmu_buf_set_read(buf_set); - /* XXX implement uio version? */ - } + dmu_buf_set_transfer(buf_set); atomic_add_64(&buf_set->dmu_ctx->completed_size, buf_set->size); } else atomic_add_int(&buf_set->dmu_ctx->err, buf_set->err); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#15 (text) ==== @@ -245,10 +245,14 @@ /** Flags for this block. */ int flags; #define DMU_CTX_FLAG_READ (1 << 1) +#define DMU_CTX_FLAG_UIO (1 << 2) /** The number of errors that occurred. */ int err; + /** DMU Transaction, if one applies to this context. 
*/ + dmu_tx_t *tx; + /** Private data for the callback. */ void *dmu_cb_private; Change 524139 by willa@willa_repo on 2012/01/31 11:52:36 Re-nuke the old implementation of dmu_read(). Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#34 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#34 (text) ==== @@ -122,10 +122,6 @@ SYSCTL_REFCOUNT(dmu_ctx_in_flight, "number of DMU contexts in flight"); SYSCTL_REFCOUNT(buf_set_in_flight, "number of buffer sets in flight"); -uint_t dmu_read_using_async = 1; -SYSCTL_INT(_vfs_zfs_dmu, OID_AUTO, dmu_read_using_async, CTLFLAG_RW, - &dmu_read_using_async, 0, "DMU reads always using async version"); - /** * \brief Obtain the DMU buffer from the specified object which contains the * specified offset. @@ -1393,63 +1389,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { - dnode_t *dn; - int err, numbufs; - dmu_buf_t **dbp; - - if (dmu_read_using_async) - return (dmu_read_cb(os, object, offset, size, buf, flags, - NULL, NULL)); - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - /* - * Deal with odd block sizes, where there can't be data past the first - * block. If we ever do the tail block optimization, we will need to - * handle that here as well. - */ - if (dn->dn_maxblkid == 0) { - int newsz = offset > dn->dn_datablksz ? 0 : - MIN(size, dn->dn_datablksz - offset); - bzero((char *)buf + newsz, size - newsz); - size = newsz; - } - - while (size > 0) { - uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); - int i; - - /* - * NB: we could do this block-at-a-time, but it's nice - * to be reading in parallel. - */ - err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp, flags); - if (err) - break; - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - bcopy((char *)db->db_data + bufoff, buf, tocpy); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - } - dnode_rele(dn, FTAG); - return (err); + return (dmu_read_cb(os, object, offset, size, buf, flags, NULL, NULL)); } void Change 524140 by willa@willa_repo on 2012/01/31 11:56:18 Remove some debugging counters and make others ifdef ZFS_DEBUG. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#35 edit Differences ... 
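The convention below is worth noting: each debug-only counter is declared and updated under #ifdef ZFS_DEBUG, so production kernels compile the bookkeeping out of the I/O path entirely. A representative declaration/update pair from the diff that follows:

    #ifdef ZFS_DEBUG
    SYSCTL_COUNTER_U(dmu_ctx_total, "total number of DMU contexts");
    SYSCTL_REFCOUNT(dmu_ctx_in_flight, "number of DMU contexts in flight");
    #endif

    /* ... later, at context creation ... */
    #ifdef ZFS_DEBUG
    refcount_acquire(&dmu_ctx_in_flight);
    atomic_add_64(&dmu_ctx_total, 1);
    #endif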
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#35 (text) ==== @@ -113,14 +113,12 @@ SYSCTL_INT(_vfs_zfs_dmu, OID_AUTO, name, CTLFLAG_RD, \ &name, 0, desc) -SYSCTL_COUNTER_U(hold_array_early_errors, "hold_array early errors"); -SYSCTL_COUNTER_U(hold_array_dbuf_hold_errors, "hold_array dbuf_hold errors"); +#ifdef ZFS_DEBUG SYSCTL_COUNTER_U(dmu_ctx_total, "total number of DMU contexts"); SYSCTL_COUNTER_U(buf_set_total, "total number of buffer sets"); -SYSCTL_COUNTER_U(hold_array_dbuf_errors, "Dbuf errors in hold_array"); -SYSCTL_COUNTER_U(hold_array_zio_wait_errors, "zio_wait errors in hold_array"); SYSCTL_REFCOUNT(dmu_ctx_in_flight, "number of DMU contexts in flight"); SYSCTL_REFCOUNT(buf_set_in_flight, "number of buffer sets in flight"); +#endif /** * \brief Obtain the DMU buffer from the specified object which contains the @@ -460,7 +458,6 @@ dp->dp_read_overhead += gethrtime() - start; if (err) { dmu_buf_rele_array(dbp, nblks, tag); - atomic_add_64(&hold_array_zio_wait_errors, 1); return (err); } @@ -919,8 +916,10 @@ if (!refcount_release(&dmu_ctx->holds)) return; +#ifdef ZFS_DEBUG ASSERT(dmu_ctx_in_flight > 0); refcount_release(&dmu_ctx_in_flight); +#endif /* At this point, there are no buffer sets left. Call back. */ if (dmu_ctx->dmu_cb != NULL) @@ -955,8 +954,10 @@ } } +#ifdef ZFS_DEBUG ASSERT(buf_set_in_flight > 0); refcount_release(&buf_set_in_flight); +#endif /* * Check to see if this buffer set was allocated with the context. @@ -1085,8 +1086,10 @@ *dmu_ctx_p = kmem_zalloc( sizeof(dmu_context_t) + sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *), KM_SLEEP); +#ifdef ZFS_DEBUG refcount_acquire(&dmu_ctx_in_flight); atomic_add_64(&dmu_ctx_total, 1); +#endif dmu_ctx = *dmu_ctx_p; buf_set = (dmu_buf_set_t *) ((char *)*dmu_ctx_p + sizeof(dmu_context_t)); @@ -1111,8 +1114,10 @@ } /* Initialize a new buffer set. */ +#ifdef ZFS_DEBUG refcount_acquire(&buf_set_in_flight); atomic_add_64(&buf_set_total, 1); +#endif buf_set->offset = *offset; buf_set->data_buf = *buf; buf_set->count = nblks; @@ -1142,7 +1147,6 @@ buf_set->count -= (nblks - i); rw_exit(&dn->dn_struct_rwlock); dmu_buf_set_rele(buf_set, NULL, B_TRUE); - atomic_add_64(&hold_array_dbuf_hold_errors, 1); zio_nowait(zio); return (EIO); } @@ -1219,7 +1223,6 @@ dp->dp_read_overhead += gethrtime() - start; if (err) { dmu_buf_set_rele(buf_set, NULL, B_TRUE); - atomic_add_64(&hold_array_zio_wait_errors, 1); return (err); } @@ -1236,7 +1239,6 @@ err = EIO; mutex_exit(&db->db_mtx); if (err) { - atomic_add_64(&hold_array_dbuf_errors, 1); dmu_buf_set_rele(buf_set, NULL, B_TRUE); return (err); } @@ -1273,7 +1275,6 @@ P2ALIGN(*offset, 1ULL<> blkshift; } else { if ((*offset + avail_size) > dn->dn_datablksz) { - atomic_add_64(&hold_array_early_errors, 1); zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> Change 524210 by willa@willa_repo on 2012/01/31 15:02:17 Incorporate char * DMU writes into the callback API. Rename dmu_read_cb() to dmu_issue(), and revise its argument list. Change dmu_read() and dmu_write() so they are consumers of dmu_issue(). Writes are treated essentially the same as reads, except that they pass in a dmu_tx and no flags. They also do not place any holds on the overall context for a given buffer set, because the initiator must perform the I/O. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#36 edit ... 
//depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#16 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#14 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#36 (text) ==== @@ -864,6 +864,9 @@ dmu_tx_t *tx = dmu_ctx->tx; int i; + /* Having a transaction and being a reader is not supported. */ + ASSERT(tx != NULL || (dmu_ctx->flags & DMU_CTX_FLAG_READ)); + if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { dmu_buf_set_transfer_uio(buf_set); return; @@ -1076,7 +1079,7 @@ static void dmu_buf_array_init(dmu_context_t **dmu_ctx_p, dmu_buf_set_t **buf_set_p, void **buf, uint64_t *offset, void *tag, int nblks, dmu_callback_t dmu_cb, - void *priv, int read) + void *priv, int read, dmu_tx_t *tx) { dmu_context_t *dmu_ctx; dmu_buf_set_t *buf_set; @@ -1100,6 +1103,7 @@ dmu_ctx->dmu_cb = dmu_cb; dmu_ctx->dmu_cb_private = priv; dmu_ctx->tag = tag; + dmu_ctx->tx = tx; /* Include a refcount for the initiator & own buf_set. */ refcount_init(&dmu_ctx->holds, 2); if (read) @@ -1122,7 +1126,11 @@ buf_set->data_buf = *buf; buf_set->count = nblks; /* Include a refcount for the initiator. */ - refcount_init(&buf_set->holds, nblks + 1); + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) + refcount_init(&buf_set->holds, nblks + 1); + else + /* For writes, dbufs never need to call us back. */ + refcount_init(&buf_set->holds, 1); buf_set->dmu_ctx = dmu_ctx; *buf_set_p = buf_set; } @@ -1151,17 +1159,6 @@ return (EIO); } /* initiate async i/o */ -#if 0 - { - int prefetch = db->db_level == 0 && - db->db_blkid != DMU_BONUS_BLKID && - (dbuf_flags & DB_RF_NOPREFETCH) == 0 && - dn != NULL && DBUF_IS_CACHEABLE(db); - - printf("hold_array read db=%p prefetch=%d\n", - db, prefetch); - } -#endif if (dmu_ctx->flags & DMU_CTX_FLAG_READ) (void) dbuf_read(db, zio, dbuf_flags); #ifdef _KERNEL @@ -1181,21 +1178,23 @@ *buf = (char *)*buf + bufsiz; /* Make sure dbufs that don't notify DMU are caught here. */ - mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { - /* - * This buffer's already done. Don't check - * for DB_UNCACHED here because that only - * indicates an initialized buffer. - */ - ASSERT(buf_set->holds > 1); - refcount_release(&buf_set->holds); - } else { - /* Let the dbuf know this DMU context needs it. */ - dmu_context_node_add(&db->db_dmu_contexts, buf_set); + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) { + mutex_enter(&db->db_mtx); + if (db->db_state == DB_CACHED) { + /* + * This buffer's already done. Don't check + * for DB_UNCACHED here because that only + * indicates an initialized buffer. + */ + ASSERT(buf_set->holds > 1); + refcount_release(&buf_set->holds); + } else { + /* Let the dbuf know this DMU context needs it. */ + dmu_context_node_add(&db->db_dmu_contexts, buf_set); + } + /* NB: all dbufs may have completed at this point! */ + mutex_exit(&db->db_mtx); } - /* NB: all dbufs may have completed at this point! */ - mutex_exit(&db->db_mtx); buf_set->dbp[i] = &db->db; } return 0; @@ -1210,8 +1209,11 @@ /* * If a callback is specified, issue the I/O's without waiting. * The dbufs will be responsible for cleaning up. + * + * Or, if this is a write, we're done. 
*/ - if (buf_set->dmu_ctx->dmu_cb != NULL) { + if (buf_set->dmu_ctx->dmu_cb != NULL || + (buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) { zio_nowait(zio); return (0); } @@ -1226,9 +1228,6 @@ return (err); } - if ((buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) - return (0); - /* wait for other io to complete */ for (i = 0; i < buf_set->count; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)buf_set->dbp[i]; @@ -1249,7 +1248,7 @@ int dmu_buf_hold_array_by_dnode_cb(dnode_t *dn, uint64_t *offset, uint64_t *length, int read, void *tag, dmu_callback_t dmu_cb, void *priv, void **buf, - dmu_context_t **dmu_ctx_p, uint32_t flags) + dmu_context_t **dmu_ctx_p, uint32_t flags, dmu_tx_t *tx) { dsl_pool_t *dp = NULL; uint64_t blkid, nblks, i, avail_size; @@ -1289,7 +1288,7 @@ /* Initialize the buffer set and context, if necessary. */ dmu_buf_array_init(dmu_ctx_p, &buf_set, buf, offset, tag, nblks, - dmu_cb, priv, read); + dmu_cb, priv, read, tx); dmu_ctx = *dmu_ctx_p; ASSERT(dmu_ctx != NULL && buf_set != NULL); @@ -1319,12 +1318,16 @@ } int -dmu_read_cb(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags, dmu_callback_t dmu_cb, void *priv) +dmu_issue(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags, dmu_callback_t dmu_cb, void *priv, int read, + dmu_tx_t *tx) { dnode_t *dn = NULL; dmu_context_t *dmu_ctx = NULL; - int numbufs, err; + int numbufs, err = 0; + + if (size == 0) + goto out; err = dnode_hold(os, object, FTAG, &dn); if (err) @@ -1335,7 +1338,7 @@ * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ - if (dn->dn_maxblkid == 0) { + if (read && dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); @@ -1347,7 +1350,7 @@ * to be reading in parallel. 
*/ err = dmu_buf_hold_array_by_dnode_cb(dn, &offset, &size, - TRUE, FTAG, dmu_cb, priv, &buf, &dmu_ctx, flags); + read, FTAG, dmu_cb, priv, &buf, &dmu_ctx, flags, tx); if (err) break; @@ -1390,48 +1393,16 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { - return (dmu_read_cb(os, object, offset, size, buf, flags, NULL, NULL)); + return (dmu_issue(os, object, offset, size, buf, flags, + /*dmu_cb*/NULL, /*priv*/NULL, /*read*/TRUE, /*tx*/NULL)); } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { - dmu_buf_t **dbp; - int numbufs, i; - - if (size == 0) - return; - - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); - - bcopy(buf, (char *)db->db_data + bufoff, tocpy); - - dmu_buf_fill_done(db, tx); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); + (void) dmu_issue(os, object, offset, size, (void *)(uintptr_t)(buf), 0, + /*dmu_cb*/NULL, /*priv*/NULL, /*read*/FALSE, /*tx*/tx); } void ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#16 (text) ==== @@ -424,8 +424,9 @@ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); -int dmu_read_cb(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags, dmu_callback_t dmu_cb, void *priv); +int dmu_issue(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags, dmu_callback_t dmu_cb, void *priv, int read, + dmu_tx_t *tx); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#14 (text) ==== @@ -1317,8 +1317,9 @@ if (doread) { if (zvol_async_read) { refcount_acquire(&read_ctx->holds); - dmu_read_cb(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH, zvol_dmu_cb, read_ctx); + dmu_issue(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH, zvol_dmu_cb, read_ctx, + /*read*/TRUE, /*tx*/NULL); } else { error = dmu_read(os, ZVOL_OBJ, off, size, addr, DMU_READ_PREFETCH); Change 524359 by willa@willa_repo on 2012/02/02 11:38:01 Bubble up the DMU context structure to the top of the DMU API. Synchronous callers (i.e. dmu_read() and dmu_write()) place it on their stack. Asynchronous callers place it on the heap, and are responsible for freeing it, either directly or in the callback they specify. This also makes it easier to turn the DMU UIO calls into wrappers of the new API. fs/zfs/sys/dmu.h: - Change dmu_context_t to accommodate these changes: - Move the held dnode pointer here. - Move the buffer state data here. - Allow callbacks to specify up to 4 private pointers. - Fix dmu_buf_set_t so it accounts for the error case properly. In that scenario, buf_set->count is not the correct number of buffers to free.
- Add dmu_context_setup() and dmu_context_rele() prototypes. fs/zfs/dmu.c: - dmu_read() and dmu_write(): Change these to initialize a DMU context on their stack, then call dmu_issue() as done previously. - Add dmu_context_setup_dnode(), which initializes a DMU context. Move some logic from dmu_issue() here. - Add dmu_context_setup(): This function wraps dmu_context_setup_dnode() and takes an <objset, object> pair that is used to look up and hold the dnode. This can return an error. - Simplify dmu_issue() into a function that executes chunks, returns an error to the callback if necessary, then releases the DMU context. - Change dmu_buf_hold_array_by_dnode_cb() to dmu_ctx_execute_chunk(): - Clean up the logic a bit and move more pieces to its subordinates. - Change the contract with the subordinates so that this function is always the one that drops struct_rwlock, and always the one that releases the buffer set. - Rename dmu_buf_hold_array_setup_buffers to dmu_buf_set_setup_buffers. - Rename dmu_buf_hold_array_process_io to dmu_buf_set_process_io. - Change dmu_buf_array_init to dmu_buf_set_init: - Don't initialize a DMU context any longer. - Change dmu_buf_set_dispatch: - Since buf_set->count reflects the number of buffers actually held, remove code that checked dbp[i] != NULL. - Since DMU contexts are no longer allocated alongside a buffer set, remove code that freed it differently for that scenario. - Change dmu_context_rele so that it releases the held dnode. - Change dmu_buf_set_transfer so it calculates the starting offset into the destination buffer. Previously these were simply assigned to the buffer set, but that was incompatible with UIOs. fs/zfs/zvol.c: - Now that dmu_write() is fully incorporated into the callback API, significantly simplify zvol_strategy() by having it just call dmu_context_setup() and dmu_issue(). If an error occurs setting up the DMU context, call dmu_context_rele() instead. - ZVOL no longer chunks; that is now done by dmu_issue(). - Rototill zvol_dmu_read_release() into zvol_dmu_cb(). - Change zvol_dmu_cb() so it now handles ALL I/O completion code that formerly lived in zvol_strategy(). Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#37 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#17 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#15 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#37 (text) ==== @@ -795,10 +795,10 @@ dmu_buf_set_transfer_uio(dmu_buf_set_t *buf_set) { #ifdef _KERNEL - uio_t *uio = buf_set->data_buf; + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + uio_t *uio = dmu_ctx->data_buf; xuio_t *xuio = NULL; enum uio_rw dir = UIO_WRITE; - dmu_context_t *dmu_ctx = buf_set->dmu_ctx; dmu_tx_t *tx = dmu_ctx->tx; uint64_t size = buf_set->size; uint64_t offset = buf_set->offset; @@ -857,9 +857,8 @@ static void dmu_buf_set_transfer(dmu_buf_set_t *buf_set) { - char *buf = (char *)buf_set->data_buf; - uint64_t size = buf_set->size; - uint64_t offset = buf_set->offset; + char *data; + uint64_t offset, size; dmu_context_t *dmu_ctx = buf_set->dmu_ctx; dmu_tx_t *tx = dmu_ctx->tx; int i; @@ -872,6 +871,12 @@ return; } + /* Initialize the state for this I/O. */ + data = (char *)dmu_ctx->data_buf + buf_set->offset - dmu_ctx->start; + size = buf_set->size; + offset = buf_set->offset; + + /* Perform the I/O copy, one buffer at a time. 
*/ for (i = 0; i < buf_set->count; i++) { int bufoff; int tocpy; @@ -885,13 +890,13 @@ if (dmu_ctx->flags & DMU_CTX_FLAG_READ) { src = (char *)db->db_data + bufoff; - dst = buf; + dst = data; } else { if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); - src = buf; + src = data; dst = (char *)db->db_data + bufoff; } @@ -902,7 +907,7 @@ offset += tocpy; size -= tocpy; - buf = (char *)buf + tocpy; + data = (char *)data + tocpy; } } @@ -924,12 +929,11 @@ refcount_release(&dmu_ctx_in_flight); #endif + dnode_rele(dmu_ctx->dn, dmu_ctx->tag); + /* At this point, there are no buffer sets left. Call back. */ if (dmu_ctx->dmu_cb != NULL) dmu_ctx->dmu_cb(dmu_ctx); - - kmem_free(dmu_ctx, sizeof(dmu_context_t) + sizeof(dmu_buf_set_t) + - buf_set->count * sizeof(dmu_buf_t *)); } /** @@ -940,21 +944,20 @@ void dmu_buf_set_dispatch(dmu_buf_set_t *buf_set) { - int child, i; + int i; + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; /* Only perform I/O if no errors occurred for the buffer set. */ if (buf_set->err == 0) { dmu_buf_set_transfer(buf_set); - atomic_add_64(&buf_set->dmu_ctx->completed_size, buf_set->size); + atomic_add_64(&dmu_ctx->completed_size, buf_set->size); } else - atomic_add_int(&buf_set->dmu_ctx->err, buf_set->err); + atomic_add_int(&dmu_ctx->err, buf_set->err); for (i = 0; i < buf_set->count; i++) { - /* May be called from error case, where dbp[i] may be NULL. */ - if (buf_set->dbp[i]) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)buf_set->dbp[i]; - dbuf_rele(db, buf_set->dmu_ctx->tag); - } + dmu_buf_impl_t *db = (dmu_buf_impl_t *)buf_set->dbp[i]; + ASSERT(db != NULL); + dbuf_rele(db, dmu_ctx->tag); } #ifdef ZFS_DEBUG @@ -962,19 +965,9 @@ refcount_release(&buf_set_in_flight); #endif - /* - * Check to see if this buffer set was allocated with the context. - * If it was, it will be freed by dmu_context_rele(). This ensures - * that someone cleans up after the master regardless of whether its - * buffer set is the last to finish. - */ - child = ((char *)buf_set != - ((char *)buf_set->dmu_ctx + sizeof(dmu_context_t))); - dmu_context_rele(buf_set->dmu_ctx); - if (child) { - kmem_free(buf_set, sizeof(dmu_buf_set_t) + - buf_set->count * sizeof(dmu_buf_t *)); - } + kmem_free(buf_set, sizeof(dmu_buf_set_t) + + buf_set->blocks_allocated * sizeof(dmu_buf_t *)); + dmu_context_rele(dmu_ctx); } int @@ -1077,54 +1070,27 @@ } static void -dmu_buf_array_init(dmu_context_t **dmu_ctx_p, dmu_buf_set_t **buf_set_p, - void **buf, uint64_t *offset, void *tag, int nblks, dmu_callback_t dmu_cb, - void *priv, int read, dmu_tx_t *tx) +dmu_buf_set_init(dmu_context_t *dmu_ctx, dmu_buf_set_t **buf_set_p, int nblks) { - dmu_context_t *dmu_ctx; dmu_buf_set_t *buf_set; + size_t set_size; - if (*dmu_ctx_p == NULL) { - /* Create the DMU context AND the buffer set. */ - *dmu_ctx_p = kmem_zalloc( - sizeof(dmu_context_t) + sizeof(dmu_buf_set_t) + - nblks * sizeof(dmu_buf_t *), KM_SLEEP); -#ifdef ZFS_DEBUG - refcount_acquire(&dmu_ctx_in_flight); - atomic_add_64(&dmu_ctx_total, 1); -#endif - dmu_ctx = *dmu_ctx_p; - buf_set = (dmu_buf_set_t *) - ((char *)*dmu_ctx_p + sizeof(dmu_context_t)); + ASSERT(dmu_ctx != NULL); + ASSERT(dmu_ctx->holds > 0); - /* Initialize a new DMU context. */ - /* XXX do something more intelligent about state matching? */ - dmu_ctx->db_states = (DB_UNCACHED|DB_CACHED); - dmu_ctx->dmu_cb = dmu_cb; - dmu_ctx->dmu_cb_private = priv; - dmu_ctx->tag = tag; - dmu_ctx->tx = tx; - /* Include a refcount for the initiator & own buf_set. 
*/ - refcount_init(&dmu_ctx->holds, 2); - if (read) - dmu_ctx->flags |= DMU_CTX_FLAG_READ; - } else { - /* Create only the new buffer set. */ - dmu_ctx = *dmu_ctx_p; - buf_set = kmem_zalloc( - sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *), - KM_SLEEP); - refcount_acquire(&dmu_ctx->holds); - } + /* Create the new buffer set. */ + set_size = sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *); + buf_set = kmem_zalloc(set_size, KM_SLEEP); + refcount_acquire(&dmu_ctx->holds); /* Initialize a new buffer set. */ #ifdef ZFS_DEBUG refcount_acquire(&buf_set_in_flight); atomic_add_64(&buf_set_total, 1); #endif - buf_set->offset = *offset; - buf_set->data_buf = *buf; + buf_set->offset = dmu_ctx->offset; buf_set->count = nblks; + buf_set->blocks_allocated = nblks; /* Include a refcount for the initiator. */ if (dmu_ctx->flags & DMU_CTX_FLAG_READ) refcount_init(&buf_set->holds, nblks + 1); @@ -1135,26 +1101,42 @@ *buf_set_p = buf_set; } +/** + * \brief Set up the buffers for a given set. + * + * \param buf_set Buffer set to set up buffers for. + * \param zio Parent ZIO to issue I/O's with, as needed. + * \param io_size Total I/O size for this buffer set. + * + * \retval EIO If any buffer could not be held for this buffer set. + * \retval 0 Success. + */ static int -dmu_buf_hold_array_setup_buffers(dmu_context_t *dmu_ctx, dmu_buf_set_t *buf_set, - dnode_t *dn, uint64_t nblks, uint64_t *offset, uint64_t *length, - void **buf, int dbuf_flags, zio_t *zio, uint64_t avail_size) +dmu_buf_set_setup_buffers(dmu_buf_set_t *buf_set, zio_t *zio, + uint64_t io_size) { + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + dnode_t *dn = dmu_ctx->dn; uint64_t blkid; + int dbuf_flags; int i; - blkid = dbuf_whichblock(dn, *offset); - for (i = 0; i < nblks; i++) { + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; + if (dmu_ctx->flags & DMU_CTX_FLAG_PREFETCH || + io_size > zfetch_array_rd_sz) + dbuf_flags |= DB_RF_NOPREFETCH; + + blkid = dbuf_whichblock(dn, dmu_ctx->offset); + for (i = 0; i < buf_set->count; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, dmu_ctx->tag); int bufoff, bufsiz; if (db == NULL) { + int blocks_held = i; /* Fix up refcount & count. */ - for (;i < nblks;i++) + for (;i < buf_set->count;i++) refcount_release(&buf_set->holds); - buf_set->count -= (nblks - i); - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_set_rele(buf_set, NULL, B_TRUE); + buf_set->count = blocks_held; zio_nowait(zio); return (EIO); } @@ -1167,15 +1149,14 @@ #endif /* Calculate the amount of data this buffer contributes. */ - ASSERT(*offset >= db->db.db_offset); - bufoff = *offset - db->db.db_offset; - bufsiz = (int)MIN(db->db.db_size - bufoff, avail_size); + ASSERT(dmu_ctx->offset >= db->db.db_offset); + bufoff = dmu_ctx->offset - db->db.db_offset; + bufsiz = (int)MIN(db->db.db_size - bufoff, io_size); buf_set->size += bufsiz; - avail_size -= bufsiz; + io_size -= bufsiz; /* Update the caller's data to let them know what's next. */ - *offset += bufsiz; - *length -= bufsiz; - *buf = (char *)*buf + bufsiz; + dmu_ctx->offset += bufsiz; + dmu_ctx->size -= bufsiz; /* Make sure dbufs that don't notify DMU are caught here. */ if (dmu_ctx->flags & DMU_CTX_FLAG_READ) { @@ -1189,22 +1170,35 @@ ASSERT(buf_set->holds > 1); refcount_release(&buf_set->holds); } else { - /* Let the dbuf know this DMU context needs it. */ - dmu_context_node_add(&db->db_dmu_contexts, buf_set); + /* Let the dbuf know this context needs it. */ + dmu_context_node_add(&db->db_dmu_contexts, + buf_set); } /* NB: all dbufs may have completed at this point! 
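 * That is, every per-buffer hold may already have been released by
 * I/O completion; it is the initiator hold taken in dmu_buf_set_init()
 * that keeps buf_set itself valid until the caller's final
 * dmu_buf_set_rele().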
*/ mutex_exit(&db->db_mtx); } buf_set->dbp[i] = &db->db; } - return 0; + return (0); } +/** + * \brief Process the I/Os queued for a given buffer set. + * + * \param buf_set Buffer set to process I/Os for. + * \param zio Parent ZIO to watch. + * + * \retval errno Errors from zio_wait or a buffer went UNCACHED. + * \retval 0 Success. + */ static int -dmu_buf_hold_array_process_io(dmu_buf_set_t *buf_set, zio_t *zio, - dsl_pool_t *dp, hrtime_t start) +dmu_buf_set_process_io(dmu_buf_set_t *buf_set, zio_t *zio) { - int err, i; + int err, i, syncing; + dsl_pool_t *dp = NULL; + hrtime_t start = 0; + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + dnode_t *dn = dmu_ctx->dn; /* * If a callback is specified, issue the I/O's without waiting. @@ -1212,21 +1206,26 @@ * * Or, if this is a write, we're done. */ - if (buf_set->dmu_ctx->dmu_cb != NULL || - (buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) { + if (dmu_ctx->dmu_cb != NULL || + (dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) { zio_nowait(zio); return (0); } - /* wait for async i/o */ + /* Time accounting for sync context. */ + if (dn->dn_objset->os_dsl_dataset) + dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; + if (dp && dsl_pool_sync_context(dp)) + start = gethrtime(); + + /* Wait for async i/o. */ err = zio_wait(zio); - /* track read overhead when we are in sync context */ - if (dp && dsl_pool_sync_context(dp)) + + /* Track read overhead when we are in sync context. */ + if (start) dp->dp_read_overhead += gethrtime() - start; - if (err) { - dmu_buf_set_rele(buf_set, NULL, B_TRUE); + if (err) return (err); - } /* wait for other io to complete */ for (i = 0; i < buf_set->count; i++) { @@ -1237,172 +1236,249 @@ if (db->db_state == DB_UNCACHED) err = EIO; mutex_exit(&db->db_mtx); - if (err) { - dmu_buf_set_rele(buf_set, NULL, B_TRUE); + if (err) return (err); - } } return (0); } +/** + * \brief Execute the next I/O chunk for the given DMU context. + * + * \param dmu_ctx The DMU context. + * + * \retval EIO Tried to access blocks beyond the end of the dnode. + * \retval errno Various other errors, primarily ZIO calls. + * \retval 0 Success. + */ int -dmu_buf_hold_array_by_dnode_cb(dnode_t *dn, uint64_t *offset, uint64_t *length, - int read, void *tag, dmu_callback_t dmu_cb, void *priv, void **buf, - dmu_context_t **dmu_ctx_p, uint32_t flags, dmu_tx_t *tx) +dmu_ctx_execute_chunk(dmu_context_t *dmu_ctx) { - dsl_pool_t *dp = NULL; - uint64_t blkid, nblks, i, avail_size; - uint32_t dbuf_flags; - dmu_context_t *dmu_ctx; - dmu_buf_set_t *buf_set; + uint64_t io_size, nblks; + dmu_buf_set_t *buf_set = NULL; int err; zio_t *zio; hrtime_t start; + dnode_t *dn = dmu_ctx->dn; /* Determine the actual size this I/O set will try to perform. */ - avail_size = MIN(*length, DMU_MAX_ACCESS); - - dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; - if (flags & DMU_READ_NO_PREFETCH || avail_size > zfetch_array_rd_sz) - dbuf_flags |= DB_RF_NOPREFETCH; + io_size = MIN(dmu_ctx->size, DMU_MAX_ACCESS); /* Figure out the number of blocks needed for the buffer set. 
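 * For example, with 128K data blocks (dn_datablkshift == 17),
 * offset = 100K and io_size = 200K give P2ROUNDUP(300K, 128K) = 384K
 * and P2ALIGN(100K, 128K) = 0, so nblks = 384K >> 17 = 3 (blocks 0
 * through 2).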
 */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
+		int shift = 1ULL << dn->dn_datablkshift;
		int blkshift = dn->dn_datablkshift;
-		nblks = (P2ROUNDUP(*offset+avail_size, 1ULL<<blkshift) -
-		    P2ALIGN(*offset, 1ULL<<blkshift)) >> blkshift;
+		nblks = P2ROUNDUP(dmu_ctx->offset + io_size, shift);
+		nblks -= P2ALIGN(dmu_ctx->offset, shift);
+		nblks >>= dn->dn_datablkshift;
	} else {
-		if ((*offset + avail_size) > dn->dn_datablksz) {
+		if ((dmu_ctx->offset + io_size) > dn->dn_datablksz) {
			zfs_panic_recover("zfs: accessing past end of object "
			    "%llx/%llx (size=%u access=%llu+%llu)",
			    (longlong_t)dn->dn_objset->
			    os_dsl_dataset->ds_object,
			    (longlong_t)dn->dn_object, dn->dn_datablksz,
-			    (longlong_t)*offset, (longlong_t)avail_size);
-			rw_exit(&dn->dn_struct_rwlock);
-			return (EIO);
+			    (longlong_t)dmu_ctx->offset,
+			    (longlong_t)io_size);
+			err = EIO;
+			goto out;
		}
		nblks = 1;
	}

-	/* Initialize the buffer set and context, if necessary. */
-	dmu_buf_array_init(dmu_ctx_p, &buf_set, buf, offset, tag, nblks,
-	    dmu_cb, priv, read, tx);
-	dmu_ctx = *dmu_ctx_p;
-	ASSERT(dmu_ctx != NULL && buf_set != NULL);
+	/* Now that the block count is known, initialize the buffer set. */
+	dmu_buf_set_init(dmu_ctx, &buf_set, nblks);

-	if (dn->dn_objset->os_dsl_dataset)
-		dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
-	if (dp && dsl_pool_sync_context(dp))
-		start = gethrtime();
	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	/* Set up the buffers. */
-	err = dmu_buf_hold_array_setup_buffers(dmu_ctx, buf_set, dn, nblks,
-	    offset, length, buf, dbuf_flags, zio, avail_size);
-	if (err)
-		return (err);
+	err = dmu_buf_set_setup_buffers(buf_set, zio, io_size);
+out:
	rw_exit(&dn->dn_struct_rwlock);

-	/* Process the I/O requests. */
-	err = dmu_buf_hold_array_process_io(buf_set, zio, dp, start);
-	if (err)
-		return (err);
+	/* Process the I/O requests, if no errors have occurred yet. */
+	if (err == 0)
+		err = dmu_buf_set_process_io(buf_set, zio);

-	/* Release the initiator hold. */
-	dmu_buf_set_rele(buf_set, NULL, B_FALSE);
+	/*
+	 * Release the initiator hold on the buffer.
+	 * NB: This must occur after struct_rwlock is dropped, otherwise a
+	 * deadlock may occur if someone needs new blocks from the dnode.
+	 */
+	if (buf_set != NULL)
+		dmu_buf_set_rele(buf_set, NULL, err ? B_TRUE : B_FALSE);

-	return (0);
+	return (err);
}

+/**
+ * \brief Issue the I/O specified in the given DMU context.
+ *
+ * \param dmu_ctx	The DMU context.
+ *
+ * \return errno	Errors executing I/O chunks.
+ * \return 0		If a DMU callback is specified; the callback
+ *			receives any errors.
+ * \return 0		If no DMU callback is specified: Success.
+ */
int
-dmu_issue(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
-    void *buf, uint32_t flags, dmu_callback_t dmu_cb, void *priv, int read,
-    dmu_tx_t *tx)
+dmu_issue(dmu_context_t *dmu_ctx)
+{
+	int err = 0;
+
+	/* While there is work left to do, execute the next chunk. */
+	while (dmu_ctx->size > 0 && err == 0)
+		err = dmu_ctx_execute_chunk(dmu_ctx);
+
+	/* If a callback is specified, forward any error to it. */
+	if (err && dmu_ctx->dmu_cb != NULL) {
+		atomic_add_int(&dmu_ctx->err, 1);
+		err = 0;
+	}
+	ASSERT(dmu_ctx->dmu_cb != NULL || dmu_ctx->holds == 1);
+	/* Release initiator hold. */
+	dmu_context_rele(dmu_ctx);
+
+	return (err);
+}
+
+/**
+ * \brief Set up a DMU context.
+ *
+ * \param dmu_ctx	The DMU context.
+ * \param dn		The held dnode to associate with the context.
+ * \param size		Size of the I/O to be performed.
+ * \param offset	Offset into the dnode to perform the I/O.
+ * \param dmu_cb Function to call back on completion; may be NULL. + * \param data_buf Data buffer to perform I/O transfers with. + * \param tx DMU transaction to use, if applicable. + * \param tag Hold tag to use. + * \param flags DMU context flags. + * + * \note The dnode must not be NULL, and must be held. + * \note The context may not specify a read and a transaction. + */ +void +dmu_context_setup_dnode(dmu_context_t *dmu_ctx, struct dnode *dn, uint64_t size, + uint64_t offset, dmu_callback_t dmu_cb, void *data_buf, dmu_tx_t *tx, + void *tag, uint32_t flags) { - dnode_t *dn = NULL; - dmu_context_t *dmu_ctx = NULL; - int numbufs, err = 0; +#ifdef _KERNEL + uio_t *uio; +#endif + +#ifdef ZFS_DEBUG + refcount_acquire(&dmu_ctx_in_flight); + atomic_add_64(&dmu_ctx_total, 1); +#endif - if (size == 0) - goto out; + /* Make sure the dnode passed in is valid. */ + ASSERT(dn != NULL); + ASSERT(!refcount_is_zero(&dn->dn_holds)); + /* Reads and DMU transactions are (currently) mutually exclusive. */ + ASSERT(((flags & DMU_CTX_FLAG_READ) == 0) ^ (tx == NULL)); - err = dnode_hold(os, object, FTAG, &dn); - if (err) - goto out; + bzero(dmu_ctx, sizeof(dmu_context_t)); + dmu_ctx->dn = dn; + dmu_ctx->dmu_cb = dmu_cb; + dmu_ctx->flags = flags; + dmu_ctx->data_buf = data_buf; + dmu_ctx->tx = tx; + dmu_ctx->tag = tag; - /* - * Deal with odd block sizes, where there can't be data past the first - * block. If we ever do the tail block optimization, we will need to - * handle that here as well. - */ - if (read && dn->dn_maxblkid == 0) { - int newsz = offset > dn->dn_datablksz ? 0 : - MIN(size, dn->dn_datablksz - offset); - bzero((char *)buf + newsz, size - newsz); - size = newsz; - } - while (size > 0) { - /* - * NB: we could do this block-at-a-time, but it's nice - * to be reading in parallel. - */ - err = dmu_buf_hold_array_by_dnode_cb(dn, &offset, &size, - read, FTAG, dmu_cb, priv, &buf, &dmu_ctx, flags, tx); + /* Initialize to a refcount for the initiator. */ + refcount_init(&dmu_ctx->holds, 1); + /* XXX do something more intelligent about state matching? */ + dmu_ctx->db_states = (DB_UNCACHED|DB_CACHED); - if (err) - break; + /* The initial offset is sourced differently for UIO. */ + if ((flags & DMU_CTX_FLAG_UIO) == 0) + dmu_ctx->offset = offset; +#ifdef _KERNEL + else { + uio = (uio_t *)data_buf; + dmu_ctx->offset = uio->uio_loffset; } +#endif + dmu_ctx->start = dmu_ctx->offset; -out: - if (dmu_ctx != NULL) { + if ((flags & DMU_CTX_FLAG_READ) && dn->dn_maxblkid == 0) { /* - * If an I/O error occurred, tell the callback instead - * of caller. + * Deal with odd block sizes, where there can't be data past + * the first block. If we ever do the tail block optimization, + * we will need to handle that here as well. */ - if (err && dmu_cb != NULL) { - atomic_add_int(&dmu_ctx->err, 1); - err = 0; - } - ASSERT(dmu_cb != NULL || dmu_ctx->holds == 1); - /* Release initiator hold. */ - dmu_context_rele(dmu_ctx); - } else if (dmu_cb != NULL) { - dmu_context_t tmp; + int newsz = offset > dn->dn_datablksz ? 0 : + MIN(size, dn->dn_datablksz - offset); + bzero((char *)data_buf + newsz, size - newsz); + size = newsz; + } + /* Set the final size now that it is known. */ + dmu_ctx->size = size; +} - /* - * No context created means no I/O initiated. But the - * callback still expects to be notified. - */ - bzero(&tmp, sizeof(dmu_context_t)); - tmp.dmu_cb_private = priv; - tmp.err = err; - err = 0; - dmu_cb(&tmp); - } +/** + * \brief Set up a DMU context. + * + * \param os Object set to associate with the DMU context. 
+ * \param object Object ID to associate with the DMU context. + * + * \note See dmu_context_setup_dnode about the other parameters. + * \note This function wraps dmu_context_setup_dnode, and its + * purpose is to hide the dnode from the caller; it + * also makes it possible to avoid lookups. + * + * \retval errno Could not hold the dnode. + * \retval 0 Success. + */ +int +dmu_context_setup(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, + uint64_t size, uint64_t offset, void *data_buf, dmu_callback_t dmu_cb, + dmu_tx_t *tx, void *tag, uint32_t flags) +{ + dnode_t *dn = NULL; + int err; - if (dn != NULL) - dnode_rele(dn, FTAG); + err = dnode_hold(os, object, tag, &dn); + if (err) + return (err); - return (err); + dmu_context_setup_dnode(dmu_ctx, dn, size, offset, dmu_cb, data_buf, + tx, tag, flags); + return (0); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags) + void *data_buf, uint32_t flags) { - return (dmu_issue(os, object, offset, size, buf, flags, - /*dmu_cb*/NULL, /*priv*/NULL, /*read*/TRUE, /*tx*/NULL)); + int err; + dmu_context_t dmu_ctx; + /* XXX Fix up the callers to move this into the API. */ + uint32_t dmu_flags = DMU_CTX_FLAG_READ; + + if (flags & DMU_READ_PREFETCH) + dmu_flags |= DMU_CTX_FLAG_PREFETCH; + + dmu_context_setup(&dmu_ctx, os, object, size, offset, data_buf, + /*dmu_cb*/NULL, /*tx*/NULL, FTAG, dmu_flags); + + return (dmu_issue(&dmu_ctx)); } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) + const void *data_buf, dmu_tx_t *tx) { - (void) dmu_issue(os, object, offset, size, (void *)(uintptr_t)(buf), 0, - /*dmu_cb*/NULL, /*priv*/NULL, /*read*/FALSE, /*tx*/tx); + void *data_bufp = (void *)(uintptr_t)data_buf; + dmu_context_t dmu_ctx; + + dmu_context_setup(&dmu_ctx, os, object, size, offset, data_bufp, + /*dmu_cb*/NULL, tx, FTAG, /*flags*/0); + + (void) dmu_issue(&dmu_ctx); } void ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#17 (text) ==== @@ -227,6 +227,17 @@ typedef struct dmu_context { + /** The primary data associated with this context. */ + uint64_t size; /**< Remaining bytes to process. */ + uint64_t start; /**< Starting block offset. */ + uint64_t offset; /**< Current block offset. */ + uint32_t read_flags; /**< For reads only */ + dmu_tx_t *tx; /**< For writes only */ + void *data_buf; /**< UIO or char pointer */ + + /** The dnode held in association with this context. */ + struct dnode *dn; + /** Number of buffer sets left to complete. */ int holds; @@ -243,18 +254,16 @@ int db_states; /** Flags for this block. */ - int flags; + uint32_t flags; #define DMU_CTX_FLAG_READ (1 << 1) #define DMU_CTX_FLAG_UIO (1 << 2) +#define DMU_CTX_FLAG_PREFETCH (1 << 3) /** The number of errors that occurred. */ int err; - /** DMU Transaction, if one applies to this context. */ - dmu_tx_t *tx; - /** Private data for the callback. */ - void *dmu_cb_private; + void *callback_private[4]; } dmu_context_t; @@ -263,17 +272,18 @@ /** The DMU context that this buffer set is associated with. */ dmu_context_t *dmu_ctx; - /** The number of buffers associated with this context. */ + /** Number of buffers associated with this context. */ int count; + /** Number of buffers space has been allocated for. */ + int blocks_allocated; + /** Number of buffers left to complete. */ int holds; - /** Pointer to the data buffer. */ - void *data_buf; - - /** The size and offset of this array of buffers. 
*/ + /* The offset (into the data buffer) of this array of buffers. */ uint64_t offset; + /** The size of the I/O. */ uint64_t size; /** The number of errors that occurred. */ @@ -285,6 +295,10 @@ } dmu_buf_set_t; void dmu_buf_set_rele(dmu_buf_set_t *buf_set, dmu_buf_t *vdb, boolean_t err); +int dmu_context_setup(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, + uint64_t size, uint64_t offset, void *data_buf, dmu_callback_t dmu_cb, + dmu_tx_t *tx, void *tag, uint32_t flags); +void dmu_context_rele(dmu_context_t *dmu_ctx); /* DMU thread context handlers. */ int dmu_thread_context_create(void); @@ -420,13 +434,12 @@ * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ +// XXX REMOVE THESE IN FAVOR OF DMU_CTX_FLAG_PREFETCH #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); -int dmu_issue(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags, dmu_callback_t dmu_cb, void *priv, int read, - dmu_tx_t *tx); +int dmu_issue(dmu_context_t *dmu_ctx); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#15 (text) ==== @@ -92,25 +92,6 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zvol, CTLFLAG_RW, 0, "ZFS ZVOL"); -#define SYSCTL_COUNTER_U(name, desc) \ - uint64_t name; \ - SYSCTL_QUAD(_vfs_zfs_zvol, OID_AUTO, name, CTLFLAG_RD, \ - &name, 0, desc); - -#define SYSCTL_REFCOUNT(name, desc) \ - uint_t name; \ - SYSCTL_INT(_vfs_zfs_zvol, OID_AUTO, name, CTLFLAG_RD, \ - &name, 0, desc); - -SYSCTL_COUNTER_U(read_ctx_total, "total number of read contexts"); -SYSCTL_REFCOUNT(read_ctx_in_flight, "number of read contexts in flight"); - -uint_t zvol_async_read = 1; -SYSCTL_INT(_vfs_zfs_zvol, OID_AUTO, zvol_async_read, CTLFLAG_RW, - &zvol_async_read, 0, "ZVOL reads use asynchronous DMU calls"); - /** * The zfsdev_state structure is protected by spa_namespace_lock from being * modified while it's being used, e.g. an open that comes in before a @@ -1219,54 +1200,46 @@ } #endif /* sun */ -typedef struct zvol_read_context { - uint_t holds; - struct bio *bp; - rl_t *rl; - int err; -} zvol_read_context_t; - static void -zvol_dmu_read_release(zvol_read_context_t *read_ctx) +zvol_dmu_cb(dmu_context_t *dmu_ctx) { - int err = (read_ctx->err == 0) ? 
0 : EIO; + struct bio *bp = dmu_ctx->callback_private[0]; + rl_t *rl = dmu_ctx->callback_private[1]; + zvol_state_t *zv = dmu_ctx->callback_private[2]; + dmu_tx_t *tx = dmu_ctx->callback_private[3]; + int sync = (bp->bio_cmd != BIO_READ && + zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); - if (refcount_release(&read_ctx->holds)) { - zfs_range_unlock(read_ctx->rl); - read_ctx->bp->bio_error = err; - g_io_deliver(read_ctx->bp, 0); - kmem_free(read_ctx, sizeof(zvol_read_context_t)); - refcount_release(&read_ctx_in_flight); + bp->bio_completed = dmu_ctx->completed_size; + if (bp->bio_cmd != BIO_READ) { + ASSERT(zv != NULL && tx != NULL); + zvol_log_write(zv, tx, bp->bio_offset, bp->bio_completed, sync); + dmu_tx_commit(tx); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); } -} - -static void -zvol_dmu_cb(dmu_context_t *dmu_ctx) -{ - zvol_read_context_t *read_ctx; - - read_ctx = (zvol_read_context_t *)dmu_ctx->dmu_cb_private; - if (dmu_ctx->err != 0) - atomic_add_int(&read_ctx->err, dmu_ctx->err); - else - atomic_add_64(&read_ctx->bp->bio_completed, - dmu_ctx->completed_size); - zvol_dmu_read_release(read_ctx); + if (bp->bio_completed < bp->bio_length) { + if (dmu_ctx->offset > zv->zv_volsize) + bp->bio_error = EINVAL; + } else + bp->bio_error = (dmu_ctx->err == 0) ? 0 : EIO; + zfs_range_unlock(rl); + g_io_deliver(bp, 0); + kmem_free(dmu_ctx, sizeof(dmu_context_t)); } int zvol_strategy(struct bio *bp) { zvol_state_t *zv = bp->bio_to->private; - uint64_t off, volsize; - size_t resid; - char *addr; + uint64_t off; objset_t *os; rl_t *rl; int error = 0; boolean_t doread = (bp->bio_cmd == BIO_READ); - boolean_t sync; - zvol_read_context_t *read_ctx = NULL; + dmu_tx_t *tx = NULL; + dmu_context_t *dmu_ctx = NULL; + uint32_t dmu_flags = DMU_CTX_FLAG_PREFETCH; if (zv == NULL) { g_io_deliver(bp, ENXIO); @@ -1279,86 +1252,45 @@ } off = bp->bio_offset; - volsize = zv->zv_volsize; + ASSERT(zv->zv_objset != NULL); - os = zv->zv_objset; - ASSERT(os != NULL); - - addr = bp->bio_data; - resid = bp->bio_length; - - if (resid > 0 && (off < 0 || off >= volsize)) { + if (bp->bio_length > 0 && (off < 0 || off >= zv->zv_volsize)) { g_io_deliver(bp, EIO); return (0); } - sync = !doread && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; - /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. */ - rl = zfs_range_lock(&zv->zv_znode, off, resid, + rl = zfs_range_lock(&zv->zv_znode, off, bp->bio_length, doread ? RL_READER : RL_WRITER); - if (zvol_async_read && doread) { - /* XXX yet another malloc ... create a pool of these? */ - read_ctx = kmem_zalloc(sizeof(zvol_read_context_t), KM_SLEEP); - refcount_acquire(&read_ctx_in_flight); - atomic_add_64(&read_ctx_total, 1); - /* Initialize holds to include initiator. 
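-	 * (This per-read context disappears entirely in the rewrite: the
-	 * new zvol_strategy() below stashes bp, rl, zv and tx in
-	 * dmu_ctx->callback_private[0..3], and zvol_dmu_cb() above unpacks
-	 * them in the same order -- an implicit layout contract between
-	 * the two functions.)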
*/ - refcount_init(&read_ctx->holds, 1); - read_ctx->rl = rl; - read_ctx->bp = bp; + if (doread) + dmu_flags |= DMU_CTX_FLAG_READ; + else { + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, bp->bio_length); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) + dmu_tx_abort(tx); } - while (resid != 0 && off < volsize) { - size_t size = MIN(resid, zvol_maxphys); - if (doread) { - if (zvol_async_read) { - refcount_acquire(&read_ctx->holds); - dmu_issue(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH, zvol_dmu_cb, read_ctx, - /*read*/TRUE, /*tx*/NULL); - } else { - error = dmu_read(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH); - } - } else { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size, sync); - dmu_tx_commit(tx); - } - } - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = EIO; - break; - } - off += size; - addr += size; - resid -= size; - } - if (read_ctx == NULL) { - zfs_range_unlock(rl); - - bp->bio_completed = bp->bio_length - resid; - if (bp->bio_completed < bp->bio_length) - bp->bio_error = (off > volsize ? EINVAL : error); - - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - g_io_deliver(bp, 0); + dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); + error = dmu_context_setup(dmu_ctx, zv->zv_objset, ZVOL_OBJ, + bp->bio_length, off, bp->bio_data, zvol_dmu_cb, tx, FTAG, + dmu_flags); + /* All error handling is done in the callback. */ + dmu_ctx->callback_private[0] = bp; + dmu_ctx->callback_private[1] = rl; + dmu_ctx->callback_private[2] = zv; + dmu_ctx->callback_private[3] = tx; + if (error == 0) { + /* Pump primed, issue the I/O to the DMU. */ + (void) dmu_issue(dmu_ctx); } else { - /* Release initiator hold. */ - zvol_dmu_read_release(read_ctx); + dmu_ctx->err = error; + dmu_context_rele(dmu_ctx); } return (0); Change 524360 by willa@willa_repo on 2012/02/02 12:59:54 Change the UIO DMU APIs to wrap around dmu_context_setup/dmu_issue. Slight change to the dmu_context_setup routines so they merely, in kernel debug mode, assert that offset == uio->uio_loffset. In rewriting the actual calls, I realized it made more sense to have them simply pass uio->uio_loffset as the offset argument. So this is just a guard against future calls doing something different. Change FTAG so it discards any constness of __func__, necessary since existing APIs expect it passed as void *. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#38 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h#3 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#38 (text) ==== @@ -1365,9 +1365,6 @@ uint64_t offset, dmu_callback_t dmu_cb, void *data_buf, dmu_tx_t *tx, void *tag, uint32_t flags) { -#ifdef _KERNEL - uio_t *uio; -#endif #ifdef ZFS_DEBUG refcount_acquire(&dmu_ctx_in_flight); @@ -1387,23 +1384,22 @@ dmu_ctx->data_buf = data_buf; dmu_ctx->tx = tx; dmu_ctx->tag = tag; + dmu_ctx->offset = offset; + dmu_ctx->start = offset; + /* Make sure UIO callers pass in the correct offset. 
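+	 * (The UIO wrappers converted below -- dmu_read_uio(),
+	 * dmu_write_uio() and dmu_write_uio_dbuf() -- now pass
+	 * uio->uio_loffset as the offset argument themselves, so this
+	 * assert only catches future callers that diverge.)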
*/ +#if defined(_KERNEL) && defined(ZFS_DEBUG) + if (flags & DMU_CTX_FLAG_UIO) { + uio_t *uio = (uio_t *)data_buf; + ASSERT(uio->uio_loffset == offset); + } +#endif + /* Initialize to a refcount for the initiator. */ refcount_init(&dmu_ctx->holds, 1); /* XXX do something more intelligent about state matching? */ dmu_ctx->db_states = (DB_UNCACHED|DB_CACHED); - /* The initial offset is sourced differently for UIO. */ - if ((flags & DMU_CTX_FLAG_UIO) == 0) - dmu_ctx->offset = offset; -#ifdef _KERNEL - else { - uio = (uio_t *)data_buf; - dmu_ctx->offset = uio->uio_loffset; - } -#endif - dmu_ctx->start = dmu_ctx->offset; - if ((flags & DMU_CTX_FLAG_READ) && dn->dn_maxblkid == 0) { /* * Deal with odd block sizes, where there can't be data past @@ -1629,111 +1625,16 @@ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { - dmu_buf_t **dbp; - int numbufs, i, err; - xuio_t *xuio = NULL; + dmu_context_t dmu_ctx; + uint32_t dmu_flags = DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO; + int err; - /* - * NB: we could do this block-at-a-time, but it's nice - * to be reading in parallel. - */ - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, - &numbufs, &dbp); + err = dmu_context_setup(&dmu_ctx, os, object, size, uio->uio_loffset, + uio, /*dmu_cb*/NULL, /*tx*/NULL, FTAG, dmu_flags); if (err) return (err); -#ifdef UIO_XUIO - if (uio->uio_extflg == UIO_XUIO) - xuio = (xuio_t *)uio; -#endif - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - if (xuio) { - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - arc_buf_t *dbuf_abuf = dbi->db_buf; - arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); - if (!err) { - uio->uio_resid -= tocpy; - uio->uio_loffset += tocpy; - } - - if (abuf == dbuf_abuf) - XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); - else - XUIOSTAT_BUMP(xuiostat_rbuf_copied); - } else { - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); - } - if (err) - break; - - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - - return (err); -} - -static int -dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs; - int err = 0; - int i; - - err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); - - /* - * XXX uiomove could block forever (eg. nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that uiomove won't - * block. 
- */ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); - - dmu_buf_fill_done(db, tx); - - if (err) - break; - - size -= tocpy; - } - - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (err); + return (dmu_issue(&dmu_ctx)); } int @@ -1742,6 +1643,7 @@ { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; + dmu_context_t dmu_ctx; int err; if (size == 0) @@ -1749,7 +1651,9 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - err = dmu_write_uio_dnode(dn, uio, size, tx); + dmu_context_setup_dnode(&dmu_ctx, dn, size, uio->uio_loffset, + /*dmu_cb*/NULL, uio, tx, FTAG, DMU_CTX_FLAG_UIO); + err = dmu_issue(&dmu_ctx); DB_DNODE_EXIT(db); return (err); @@ -1759,21 +1663,19 @@ dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, dmu_tx_t *tx) { - dnode_t *dn; + dmu_context_t dmu_ctx; + uint32_t dmu_flags = DMU_CTX_FLAG_UIO; int err; if (size == 0) return (0); - err = dnode_hold(os, object, FTAG, &dn); + err = dmu_context_setup(&dmu_ctx, os, object, size, uio->uio_loffset, + uio, /*dmu_cb*/NULL, tx, FTAG, dmu_flags); if (err) return (err); - err = dmu_write_uio_dnode(dn, uio, size, tx); - - dnode_rele(dn, FTAG); - - return (err); + return (dmu_issue(&dmu_ctx)); } #ifdef sun ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h#3 (text) ==== @@ -40,7 +40,7 @@ * particular object, use FTAG (which is a string) for the holder_tag. * Otherwise, use the object that holds the reference. */ -#define FTAG ((char *)__func__) +#define FTAG ((char *)(uintptr_t)__func__) #ifdef ZFS_DEBUG typedef struct reference { Change 524433 by willa@willa_repo on 2012/02/02 14:32:35 Change the interface of dmu_read() to take context flags. DMU_READ_PREFETCH/DMU_READ_NO_PREFETCH are still used in other places, specifically for direct callers of dmu_buf_hold() etc. But since DMU contexts use other flags, it makes sense that these callers play along. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#39 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#10 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c#6 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c#6 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#18 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c#6 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#8 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#16 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#39 (text) ==== @@ -1452,14 +1452,9 @@ { int err; dmu_context_t dmu_ctx; - /* XXX Fix up the callers to move this into the API. 
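-	 * (With this change the shim is gone: call sites pass DMU context
-	 * flags directly, DMU_READ_PREFETCH becoming DMU_CTX_FLAG_PREFETCH
-	 * and DMU_READ_NO_PREFETCH becoming plain 0, as the per-file diffs
-	 * below show.)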
*/ - uint32_t dmu_flags = DMU_CTX_FLAG_READ; - if (flags & DMU_READ_PREFETCH) - dmu_flags |= DMU_CTX_FLAG_PREFETCH; - dmu_context_setup(&dmu_ctx, os, object, size, offset, data_buf, - /*dmu_cb*/NULL, /*tx*/NULL, FTAG, dmu_flags); + /*dmu_cb*/NULL, /*tx*/NULL, FTAG, flags|DMU_CTX_FLAG_READ); return (dmu_issue(&dmu_ctx)); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#10 (text) ==== @@ -1387,7 +1387,7 @@ packed = kmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c#6 (text) ==== @@ -128,12 +128,12 @@ firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf, DMU_READ_PREFETCH)) != 0) + buf, DMU_CTX_FLAG_PREFETCH)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread, DMU_READ_PREFETCH)) != 0) + buf + firstread, DMU_CTX_FLAG_PREFETCH)) != 0) return (err); } @@ -415,10 +415,10 @@ } err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); if (leftover && err == 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len, DMU_READ_PREFETCH); + leftover, buf + read_len, DMU_CTX_FLAG_PREFETCH); } mutex_exit(&spa->spa_history_lock); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c#6 (text) ==== @@ -338,7 +338,7 @@ mutex_exit(sm->sm_lock); error = dmu_read(os, smo->smo_object, offset, size, entry_map, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); mutex_enter(sm->sm_lock); if (error != 0) break; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#18 (text) ==== @@ -231,7 +231,6 @@ uint64_t size; /**< Remaining bytes to process. */ uint64_t start; /**< Starting block offset. */ uint64_t offset; /**< Current block offset. 
*/ - uint32_t read_flags; /**< For reads only */ dmu_tx_t *tx; /**< For writes only */ void *data_buf; /**< UIO or char pointer */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c#6 (text) ==== @@ -861,7 +861,7 @@ uint64_t object = 0; error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); if (error) return (error); if (object != 0) { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c#5 (text) ==== @@ -1118,7 +1118,7 @@ if (znode_acl.z_acl_extern_obj) { error = dmu_read(zp->z_zfsvfs->z_os, znode_acl.z_acl_extern_obj, 0, aclnode->z_size, - aclnode->z_acldata, DMU_READ_PREFETCH); + aclnode->z_acldata, DMU_CTX_FLAG_PREFETCH); } else { bcopy(znode_acl.z_ace_data, aclnode->z_acldata, aclnode->z_size); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c#5 (text) ==== @@ -134,7 +134,7 @@ packed = kmem_alloc(fuid_size, KM_SLEEP); VERIFY(dmu_read(os, fuid_obj, 0, - fuid_size, packed, DMU_READ_PREFETCH) == 0); + fuid_size, packed, DMU_CTX_FLAG_PREFETCH) == 0); VERIFY(nvlist_unpack(packed, fuid_size, &nvp, 0) == 0); VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c#5 (text) ==== @@ -503,7 +503,7 @@ (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, - zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zp->z_id, off, len, lr + 1, /*flags*/0) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#8 (text) ==== @@ -405,7 +405,7 @@ va+off, tx); } else { (void) dmu_read(os, oid, start+off, nbytes, - va+off, DMU_READ_PREFETCH); + va+off, DMU_CTX_FLAG_PREFETCH); } zfs_unmap_page(sf); VM_OBJECT_LOCK(obj); @@ -457,7 +457,7 @@ VM_OBJECT_UNLOCK(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); @@ -1144,7 +1144,7 @@ error = ENOENT; } else { error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); + /*flags*/0); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ @@ -4616,7 +4616,7 @@ ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ @@ -5520,7 +5520,7 @@ VM_OBJECT_UNLOCK(object); va = zfs_map_page(mreq, &sf); error = dmu_read(os, zp->z_id, IDX_TO_OFF(mreq->pindex), - size, va, DMU_READ_PREFETCH); + size, va, DMU_CTX_FLAG_PREFETCH); if (size != PAGE_SIZE) bzero(va + size, PAGE_SIZE - size); zfs_unmap_page(sf); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#16 (text) ==== @@ -1014,7 +1014,7 @@ */ if (buf != NULL) { /* immediate write */ error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); + /*flags*/0); } else { size = zv->zv_volblocksize; offset = P2ALIGN(offset, size); @@ -1091,7 +1091,7 @@ (write_state == WR_COPIED ? 
len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zv->zv_objset, - ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + ZVOL_OBJ, off, len, lr + 1, /*flags*/0) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; Change 524438 by willa@willa_repo on 2012/02/02 16:09:19 Fix a few bugs introduced recently for UIO calls. It turns out that having a separate function for performing UIO transfers was perhaps not the best approach, because that one left out the dmu_buf_fill_done() call required after finishing a write, which led to a reader waiting for a fill_done call that never came. Merge UIO handling into the main dmu_buf_set_transfer() function, and only pull out the actual UIO transfer into a separate function, which seems to keep the main function relatively easy to read, and avoids duplication-related mistakes like these. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#40 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#40 (text) ==== @@ -791,63 +791,36 @@ kmem_free(dcn, sizeof(dmu_context_node_t)); } +#ifdef _KERNEL static void -dmu_buf_set_transfer_uio(dmu_buf_set_t *buf_set) +dmu_buf_move_uio(uio_t *uio, enum uio_rw dir, int offset, int size, + dmu_buf_t *db) { -#ifdef _KERNEL - dmu_context_t *dmu_ctx = buf_set->dmu_ctx; - uio_t *uio = dmu_ctx->data_buf; xuio_t *xuio = NULL; - enum uio_rw dir = UIO_WRITE; - dmu_tx_t *tx = dmu_ctx->tx; - uint64_t size = buf_set->size; - uint64_t offset = buf_set->offset; - int i; #ifdef UIO_XUIO if (uio->uio_extflg == UIO_XUIO) xuio = (xuio_t *)uio; #endif - if (dmu_ctx->flags & DMU_CTX_FLAG_READ) - dir = UIO_READ; - - for (i = 0; i < buf_set->count; i++) { - int bufoff; - int tocpy; - dmu_buf_t *db = buf_set->dbp[i]; - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - if (dir == UIO_WRITE) { - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); + if (xuio && dir == UIO_READ) { + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); + int err = dmu_xuio_add(xuio, abuf, offset, size); + if (!err) { + uio->uio_resid -= size; + uio->uio_loffset += size; } - if (xuio && dir == UIO_READ) { - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - arc_buf_t *dbuf_abuf = dbi->db_buf; - arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - int err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); - if (!err) { - uio->uio_resid -= tocpy; - uio->uio_loffset += tocpy; - } - - if (abuf == dbuf_abuf) - XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); - else - XUIOSTAT_BUMP(xuiostat_rbuf_copied); - } else { - uiomove((char *)db->db_data + bufoff, tocpy, dir, uio); - } - size -= tocpy; - } + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); + } else + uiomove((char *)db->db_data + offset, size, dir, uio); +} #endif -} /** * \brief Perform a buffer set read for a char * target buffer. @@ -862,52 +835,62 @@ dmu_context_t *dmu_ctx = buf_set->dmu_ctx; dmu_tx_t *tx = dmu_ctx->tx; int i; +#ifdef _KERNEL + uio_t *uio; + enum uio_rw dir = UIO_WRITE; - /* Having a transaction and being a reader is not supported. */ - ASSERT(tx != NULL || (dmu_ctx->flags & DMU_CTX_FLAG_READ)); - + /* Initialize the state for this I/O. 
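+	 * (Context for the bug fixed here: dmu_buf_will_fill() and
+	 * dmu_buf_will_dirty_range() move the dbuf into DB_FILL, and a
+	 * concurrent reader cv_wait()s on db_changed until
+	 * dmu_buf_fill_done() clears that state -- so a write path that
+	 * skips fill_done leaves such a reader blocked forever.)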
*/ if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { - dmu_buf_set_transfer_uio(buf_set); - return; + uio = (uio_t *)dmu_ctx->data_buf; + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) + dir = UIO_READ; + } else +#endif + { + data = (char *)dmu_ctx->data_buf + buf_set->offset - + dmu_ctx->start; } - - /* Initialize the state for this I/O. */ - data = (char *)dmu_ctx->data_buf + buf_set->offset - dmu_ctx->start; size = buf_set->size; offset = buf_set->offset; + /* Having a transaction and being a reader is not supported. */ + ASSERT(tx != NULL || (dmu_ctx->flags & DMU_CTX_FLAG_READ)); + /* Perform the I/O copy, one buffer at a time. */ for (i = 0; i < buf_set->count; i++) { - int bufoff; - int tocpy; + int off, sz; dmu_buf_t *db = buf_set->dbp[i]; - char *src, *dst; ASSERT(size > 0); - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); + off = offset - db->db_offset; + sz = (int)MIN(db->db_size - off, size); - if (dmu_ctx->flags & DMU_CTX_FLAG_READ) { - src = (char *)db->db_data + bufoff; - dst = data; - } else { - if (tocpy == db->db_size) + /* Write case: Notify dbuf that it will be dirtied. */ + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) { + if (sz == db->db_size) dmu_buf_will_fill(db, tx); else - dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); - src = data; - dst = (char *)db->db_data + bufoff; + dmu_buf_will_dirty_range(db, tx, off, sz); } - bcopy(src, dst, tocpy); + if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { +#ifdef _KERNEL + dmu_buf_move_uio(uio, dir, off, sz, db); +#endif + } else { + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) + bcopy((char *)db->db_data + off, data, sz); + else + bcopy(data, (char *)db->db_data + off, sz); + data = (char *)data + sz; + } if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) dmu_buf_fill_done(db, tx); - offset += tocpy; - size -= tocpy; - data = (char *)data + tocpy; + offset += sz; + size -= sz; } } @@ -1122,7 +1105,7 @@ int i; dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; - if (dmu_ctx->flags & DMU_CTX_FLAG_PREFETCH || + if ((dmu_ctx->flags & DMU_CTX_FLAG_PREFETCH) == 0 || io_size > zfetch_array_rd_sz) dbuf_flags |= DB_RF_NOPREFETCH; Change 524439 by willa@willa_repo on 2012/02/02 16:32:37 Fix another bug where a particular DMU context may not hold the dnode. This is primarily for DB_DNODE_ENTER() callers, who inadvertently decrement the dnode hold count. They use a different refcount. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#41 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#19 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#41 (text) ==== @@ -912,7 +912,8 @@ refcount_release(&dmu_ctx_in_flight); #endif - dnode_rele(dmu_ctx->dn, dmu_ctx->tag); + if ((dmu_ctx->flags & DMU_CTX_FLAG_NO_HOLD) == 0) + dnode_rele(dmu_ctx->dn, dmu_ctx->tag); /* At this point, there are no buffer sets left. Call back. 
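 * (The DMU_CTX_FLAG_NO_HOLD test above exists for
 * DB_DNODE_ENTER()-style callers such as dmu_write_uio_dbuf() below:
 * their dnode reference comes from the dbuf's dnode handle rather than
 * from dnode_hold(), so a dnode_rele() here would unbalance the wrong
 * refcount.)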
 */
	if (dmu_ctx->dmu_cb != NULL)

@@ -1623,6 +1624,7 @@
	dnode_t *dn;
	dmu_context_t dmu_ctx;
	int err;
+	uint32_t flags = DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_NO_HOLD;

	if (size == 0)

@@ -1630,7 +1632,7 @@
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dmu_context_setup_dnode(&dmu_ctx, dn, size, uio->uio_loffset,
-	    /*dmu_cb*/NULL, uio, tx, FTAG, DMU_CTX_FLAG_UIO);
+	    /*dmu_cb*/NULL, uio, tx, FTAG, flags);
	err = dmu_issue(&dmu_ctx);

	DB_DNODE_EXIT(db);

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#19 (text) ====

@@ -257,6 +257,7 @@
#define DMU_CTX_FLAG_READ	(1 << 1)
#define DMU_CTX_FLAG_UIO	(1 << 2)
#define DMU_CTX_FLAG_PREFETCH	(1 << 3)
+#define DMU_CTX_FLAG_NO_HOLD	(1 << 4)

	/** The number of errors that occurred. */
	int err;

Change 524441 by willa@willa_repo on 2012/02/02 17:15:56

	Refactor two more DMU pieces: Sun pages & prealloc.

	Both of these only apply to Solaris, although dmu_prealloc() is
	visible in FreeBSD.  However, it is only used by ZVOL on Solaris.

	Incorporating them into the new callback API allows getting rid of
	the last remaining vestiges of the old dmu_buf_*_array*
	implementation, and of course simplifies their implementations too.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#42 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#20 edit

Differences ...

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#42 (text) ====

@@ -383,169 +383,6 @@
}

/**
- * \note longer-term, we should modify all of the dmu_buf_*() interfaces
- * to take a held dnode rather than <os, object> -- the lookup is wasteful,
- * and can induce severe lock contention when writing to several files
- * whose dnodes are in the same block.
- */
-static int
-dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
-    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
-{
-	dsl_pool_t *dp = NULL;
-	dmu_buf_t **dbp;
-	uint64_t blkid, nblks, i;
-	uint32_t dbuf_flags;
-	int err;
-	zio_t *zio;
-	hrtime_t start;
-
-	ASSERT(length <= DMU_MAX_ACCESS);
-
-	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
-	if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
-		dbuf_flags |= DB_RF_NOPREFETCH;
-
-	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	if (dn->dn_datablkshift) {
-		int blkshift = dn->dn_datablkshift;
-		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
-		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
-	} else {
-		if (offset + length > dn->dn_datablksz) {
-			zfs_panic_recover("zfs: accessing past end of object "
-			    "%llx/%llx (size=%u access=%llu+%llu)",
-			    (longlong_t)dn->dn_objset->
-			    os_dsl_dataset->ds_object,
-			    (longlong_t)dn->dn_object, dn->dn_datablksz,
-			    (longlong_t)offset, (longlong_t)length);
-			rw_exit(&dn->dn_struct_rwlock);
-			return (EIO);
-		}
-		nblks = 1;
-	}
-	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
-
-	if (dn->dn_objset->os_dsl_dataset)
-		dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
-	if (dp && dsl_pool_sync_context(dp))
-		start = gethrtime();
-	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-	blkid = dbuf_whichblock(dn, offset);
-	for (i = 0; i < nblks; i++) {
-		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
-		if (db == NULL) {
-			rw_exit(&dn->dn_struct_rwlock);
-			dmu_buf_rele_array(dbp, nblks, tag);
-			zio_nowait(zio);
-			return (EIO);
-		}
-		/* initiate async i/o */
-		if (read)
-			(void) dbuf_read(db, zio, dbuf_flags);
-#ifdef _KERNEL
-		else
-			curthread->td_ru.ru_oublock++;
-#endif
-		dbp[i] = &db->db;
-	}
-	rw_exit(&dn->dn_struct_rwlock);
-
-	/* wait for async i/o */
-	err = zio_wait(zio);
-	/* track read overhead when we are in sync context */
-	if (dp && dsl_pool_sync_context(dp))
-		dp->dp_read_overhead += gethrtime() - start;
-	if (err) {
-		dmu_buf_rele_array(dbp, nblks, tag);
-		return (err);
-	}
-
-	/* wait for other io to complete */
-	if (read) {
-		for (i = 0; i < nblks; i++) {
-			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
-			mutex_enter(&db->db_mtx);
-			while (db->db_state & (DB_READ|DB_FILL))
-				cv_wait(&db->db_changed, &db->db_mtx);
-			if (db->db_state == DB_UNCACHED)
-				err = EIO;
-			mutex_exit(&db->db_mtx);
-			if (err) {
-				dmu_buf_rele_array(dbp, nblks, tag);
-				return (err);
-			}
-		}
-	}
-
-	*numbufsp = nblks;
-	*dbpp = dbp;
-	return (0);
-}
-
-/**
- * Holds the DMU buffers which contain all bytes in a range of an object.  A
- * pointer to an array of dmu_buf_t*'s is returned (in *dbpp).
- */ -static int -dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); - - dnode_rele(dn, FTAG); - - return (err); -} - -int -dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - int err; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); - DB_DNODE_EXIT(db); - - return (err); -} - -/** - * Releases the hold on an array of dmu_buf_t*'s, and frees the array. The - * hold on the array of buffers MUST be released with dmu_buf_rele_array. You - * can NOT release the hold on each buffer individually with dmu_buf_rele. - */ -void -dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) -{ - int i; - dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; - - if (numbufs == 0) - return; - - for (i = 0; i < numbufs; i++) { - if (dbp[i]) - dbuf_rele(dbp[i], tag); - } - - kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); -} - -/** * \brief Asynchronously try to read in the data. */ void @@ -791,12 +628,17 @@ kmem_free(dcn, sizeof(dmu_context_node_t)); } -#ifdef _KERNEL static void -dmu_buf_move_uio(uio_t *uio, enum uio_rw dir, int offset, int size, - dmu_buf_t *db) +dmu_buf_move_uio(dmu_context_t *dmu_ctx, dmu_buf_t *db, int off, int sz) { +#ifdef _KERNEL xuio_t *xuio = NULL; + uio_t *uio; + enum uio_rw dir; + + /* Initialize the state for this I/O. */ + uio = (uio_t *)dmu_ctx->data_buf; + dir = (dmu_ctx->flags & DMU_CTX_FLAG_READ) ? UIO_READ : UIO_WRITE; #ifdef UIO_XUIO if (uio->uio_extflg == UIO_XUIO) @@ -807,10 +649,10 @@ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; arc_buf_t *dbuf_abuf = dbi->db_buf; arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - int err = dmu_xuio_add(xuio, abuf, offset, size); + int err = dmu_xuio_add(xuio, abuf, off, sz); if (!err) { - uio->uio_resid -= size; - uio->uio_loffset += size; + uio->uio_resid -= sz; + uio->uio_loffset += sz; } if (abuf == dbuf_abuf) @@ -818,9 +660,31 @@ else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else - uiomove((char *)db->db_data + offset, size, dir, uio); + uiomove((char *)db->db_data + off, sz, dir, uio); +#endif } + +static void +dmu_buf_move_pages(dmu_context_t *dmu_ctx, dmu_buf_t *db, int off, int sz) +{ +#ifdef sun + int copied; + page_t *pp = dmu_context->data_buf; + + for (copied = 0; copied < sz; copied += PAGESIZE) { + caddr_t va; + int thiscpy; + + ASSERT3U(pp->p_offset, ==, db->db_offset + off); + thiscpy = MIN(PAGESIZE, sz - copied); + va = zfs_map_page(pp, S_READ); + bcopy(va, (char *)db->db_data + off, thiscpy); + zfs_unmap_page(pp, va); + pp = pp->p_next; + off += PAGESIZE; + } #endif +} /** * \brief Perform a buffer set read for a char * target buffer. @@ -835,32 +699,29 @@ dmu_context_t *dmu_ctx = buf_set->dmu_ctx; dmu_tx_t *tx = dmu_ctx->tx; int i; -#ifdef _KERNEL - uio_t *uio; - enum uio_rw dir = UIO_WRITE; + + /* Having a transaction and being a reader is not supported. */ + ASSERT(tx != NULL || (dmu_ctx->flags & DMU_CTX_FLAG_READ)); - /* Initialize the state for this I/O. 
*/ - if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { - uio = (uio_t *)dmu_ctx->data_buf; - if (dmu_ctx->flags & DMU_CTX_FLAG_READ) - dir = UIO_READ; - } else -#endif - { - data = (char *)dmu_ctx->data_buf + buf_set->offset - - dmu_ctx->start; - } + /* + * Initialize the current state. Note that for special (non-char *) + * data pointers, 'data' is not used. + */ + data = (char *)dmu_ctx->data_buf + buf_set->offset - dmu_ctx->start; size = buf_set->size; offset = buf_set->offset; - /* Having a transaction and being a reader is not supported. */ - ASSERT(tx != NULL || (dmu_ctx->flags & DMU_CTX_FLAG_READ)); - /* Perform the I/O copy, one buffer at a time. */ for (i = 0; i < buf_set->count; i++) { int off, sz; dmu_buf_t *db = buf_set->dbp[i]; + if (dmu_ctx->flags & DMU_CTX_FLAG_NOFILL) { + dmu_buf_will_not_fill(db, tx); + /* No need to do any more here. */ + continue; + } + ASSERT(size > 0); off = offset - db->db_offset; @@ -874,11 +735,11 @@ dmu_buf_will_dirty_range(db, tx, off, sz); } - if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { -#ifdef _KERNEL - dmu_buf_move_uio(uio, dir, off, sz, db); -#endif - } else { + if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) + dmu_buf_move_uio(dmu_ctx, db, off, sz); + else if (dmu_ctx->flags & DMU_CTX_FLAG_SUN_PAGES) + dmu_buf_move_pages(dmu_ctx, db, off, sz); + else { if (dmu_ctx->flags & DMU_CTX_FLAG_READ) bcopy((char *)db->db_data + off, data, sz); else @@ -1360,6 +1221,9 @@ ASSERT(!refcount_is_zero(&dn->dn_holds)); /* Reads and DMU transactions are (currently) mutually exclusive. */ ASSERT(((flags & DMU_CTX_FLAG_READ) == 0) ^ (tx == NULL)); + /* Sun pages and NOFILL are only for writes. */ + ASSERT((flags & (DMU_CTX_FLAG_SUN_PAGES|DMU_CTX_FLAG_NOFILL) == 0) || + (flags & DMU_CTX_FLAG_READ) == 0); bzero(dmu_ctx, sizeof(dmu_context_t)); dmu_ctx->dn = dn; @@ -1460,21 +1324,19 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { - dmu_buf_t **dbp; - int numbufs, i; + uint32_t flags = DMU_CTX_FLAG_NOFILL; + dmu_context_t dmu_ctx; + int err; if (size == 0) return; - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + err = dmu_context_setup(&dmu_ctx, os, object, size, offset, + /*data_buf*/NULL, /*dmu_cb*/NULL, tx, FTAG, flags); + /* XXX the caller should really check... 
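+	 * (dmu_prealloc() returns void, so a dnode_hold() failure from
+	 * dmu_context_setup() has nowhere to be reported; the VERIFY
+	 * below keeps the old VERIFY(0 == dmu_buf_hold_array(...))
+	 * behaviour of panicking instead.)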
*/ + VERIFY(err == 0); - for (i = 0; i < numbufs; i++) { - dmu_buf_t *db = dbp[i]; - - dmu_buf_will_not_fill(db, tx); - } - dmu_buf_rele_array(dbp, numbufs, FTAG); + (void) dmu_issue(&dmu_ctx); } /** @@ -1663,54 +1525,19 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) { - dmu_buf_t **dbp; - int numbufs, i; + dmu_context_t dmu_ctx; + uint32_t dmu_flags = DMU_CTX_FLAG_SUN_PAGES; int err; if (size == 0) return (0); - err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp); + err = dmu_context_setup(&dmu_ctx, os, object, size, offset, pp, + /*dmu_cb*/NULL, tx, FTAG, dmu_flags); if (err) return (err); - for (i = 0; i < numbufs; i++) { - int tocpy, copied, thiscpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - caddr_t va; - - ASSERT(size > 0); - ASSERT3U(db->db_size, >=, PAGESIZE); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty_range(db, tx, bufoff, tocpy); - - for (copied = 0; copied < tocpy; copied += PAGESIZE) { - ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); - thiscpy = MIN(PAGESIZE, tocpy - copied); - va = zfs_map_page(pp, S_READ); - bcopy(va, (char *)db->db_data + bufoff, thiscpy); - zfs_unmap_page(pp, va); - pp = pp->p_next; - bufoff += PAGESIZE; - } - - dmu_buf_fill_done(db, tx); - - offset += tocpy; - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (err); + return (dmu_issue(&dmu_ctx)); } #endif /* sun */ #endif ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#20 (text) ==== @@ -258,6 +258,8 @@ #define DMU_CTX_FLAG_UIO (1 << 2) #define DMU_CTX_FLAG_PREFETCH (1 << 3) #define DMU_CTX_FLAG_NO_HOLD (1 << 4) +#define DMU_CTX_FLAG_SUN_PAGES (1 << 5) +#define DMU_CTX_FLAG_NOFILL (1 << 6) /** The number of errors that occurred. */ int err; Change 524449 by willa@willa_repo on 2012/02/02 18:15:13 A few tweaks to the DMU context API. Rename dmu_context_setup*() to dmu_context_init*() to be more in line with local naming schemes. Harmonize these APIs so they pass arguments in the same order. Make dmu_context_init_dnode() cleaner and stricter about flags. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#43 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#21 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#17 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#43 (text) ==== @@ -1206,10 +1206,11 @@ * \note The context may not specify a read and a transaction. */ void -dmu_context_setup_dnode(dmu_context_t *dmu_ctx, struct dnode *dn, uint64_t size, - uint64_t offset, dmu_callback_t dmu_cb, void *data_buf, dmu_tx_t *tx, - void *tag, uint32_t flags) +dmu_context_init_dnode(dmu_context_t *dmu_ctx, struct dnode *dn, + uint64_t offset, uint64_t size, void *data_buf, dmu_callback_t dmu_cb, + dmu_tx_t *tx, void *tag, uint32_t flags) { + boolean_t reader = (flags & DMU_CTX_FLAG_READ) != 0; #ifdef ZFS_DEBUG refcount_acquire(&dmu_ctx_in_flight); @@ -1218,22 +1219,13 @@ /* Make sure the dnode passed in is valid. 
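 * (With DMU_CTX_FLAG_NO_HOLD the dnode may legitimately show zero
 * dn_holds here, since the caller's reference lives elsewhere; that is
 * why the refcount assert below accepts that flag.)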
*/ ASSERT(dn != NULL); - ASSERT(!refcount_is_zero(&dn->dn_holds)); + ASSERT(!refcount_is_zero(&dn->dn_holds) || + (flags & DMU_CTX_FLAG_NO_HOLD)); /* Reads and DMU transactions are (currently) mutually exclusive. */ - ASSERT(((flags & DMU_CTX_FLAG_READ) == 0) ^ (tx == NULL)); - /* Sun pages and NOFILL are only for writes. */ - ASSERT((flags & (DMU_CTX_FLAG_SUN_PAGES|DMU_CTX_FLAG_NOFILL) == 0) || - (flags & DMU_CTX_FLAG_READ) == 0); - - bzero(dmu_ctx, sizeof(dmu_context_t)); - dmu_ctx->dn = dn; - dmu_ctx->dmu_cb = dmu_cb; - dmu_ctx->flags = flags; - dmu_ctx->data_buf = data_buf; - dmu_ctx->tx = tx; - dmu_ctx->tag = tag; - dmu_ctx->offset = offset; - dmu_ctx->start = offset; + ASSERT(!reader ^ (tx == NULL)); + /* Make sure the flags are compatible with the I/O type. */ + ASSERT(reader || ((flags & DMU_CTX_READER_FLAGS) == 0)); + ASSERT(!reader || ((flags & DMU_CTX_WRITER_FLAGS) == 0)); /* Make sure UIO callers pass in the correct offset. */ #if defined(_KERNEL) && defined(ZFS_DEBUG) @@ -1243,24 +1235,34 @@ } #endif - /* Initialize to a refcount for the initiator. */ - refcount_init(&dmu_ctx->holds, 1); - /* XXX do something more intelligent about state matching? */ - dmu_ctx->db_states = (DB_UNCACHED|DB_CACHED); - - if ((flags & DMU_CTX_FLAG_READ) && dn->dn_maxblkid == 0) { - /* - * Deal with odd block sizes, where there can't be data past - * the first block. If we ever do the tail block optimization, - * we will need to handle that here as well. - */ + /* + * Deal with odd block sizes, where there can't be data past + * the first block. If we ever do the tail block optimization, + * we will need to handle that here as well. + */ + if (reader && dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)data_buf + newsz, size - newsz); size = newsz; } - /* Set the final size now that it is known. */ + + /* All set, actually initialize the context! */ + bzero(dmu_ctx, sizeof(dmu_context_t)); + dmu_ctx->dn = dn; + dmu_ctx->offset = offset; + dmu_ctx->start = offset; dmu_ctx->size = size; + dmu_ctx->data_buf = data_buf; + dmu_ctx->dmu_cb = dmu_cb; + dmu_ctx->tx = tx; + dmu_ctx->tag = tag; + dmu_ctx->flags = flags; + + /* Initialize including a refcount for the initiator. */ + refcount_init(&dmu_ctx->holds, 1); + /* XXX do something more intelligent about state matching? */ + dmu_ctx->db_states = (DB_UNCACHED|DB_CACHED); } /** @@ -1269,8 +1271,8 @@ * \param os Object set to associate with the DMU context. * \param object Object ID to associate with the DMU context. * - * \note See dmu_context_setup_dnode about the other parameters. - * \note This function wraps dmu_context_setup_dnode, and its + * \note See dmu_context_init_dnode about the other parameters. + * \note This function wraps dmu_context_init_dnode, and its * purpose is to hide the dnode from the caller; it * also makes it possible to avoid lookups. * @@ -1278,8 +1280,8 @@ * \retval 0 Success. 
*/ int -dmu_context_setup(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, - uint64_t size, uint64_t offset, void *data_buf, dmu_callback_t dmu_cb, +dmu_context_init(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, + uint64_t offset, uint64_t size, void *data_buf, dmu_callback_t dmu_cb, dmu_tx_t *tx, void *tag, uint32_t flags) { dnode_t *dn = NULL; @@ -1289,7 +1291,7 @@ if (err) return (err); - dmu_context_setup_dnode(dmu_ctx, dn, size, offset, dmu_cb, data_buf, + dmu_context_init_dnode(dmu_ctx, dn, size, offset, data_buf, dmu_cb, tx, tag, flags); return (0); } @@ -1301,7 +1303,7 @@ int err; dmu_context_t dmu_ctx; - dmu_context_setup(&dmu_ctx, os, object, size, offset, data_buf, + dmu_context_init(&dmu_ctx, os, object, offset, size, data_buf, /*dmu_cb*/NULL, /*tx*/NULL, FTAG, flags|DMU_CTX_FLAG_READ); return (dmu_issue(&dmu_ctx)); @@ -1314,7 +1316,7 @@ void *data_bufp = (void *)(uintptr_t)data_buf; dmu_context_t dmu_ctx; - dmu_context_setup(&dmu_ctx, os, object, size, offset, data_bufp, + dmu_context_init(&dmu_ctx, os, object, offset, size, data_bufp, /*dmu_cb*/NULL, tx, FTAG, /*flags*/0); (void) dmu_issue(&dmu_ctx); @@ -1331,7 +1333,7 @@ if (size == 0) return; - err = dmu_context_setup(&dmu_ctx, os, object, size, offset, + err = dmu_context_init(&dmu_ctx, os, object, offset, size, /*data_buf*/NULL, /*dmu_cb*/NULL, tx, FTAG, flags); /* XXX the caller should really check... */ VERIFY(err == 0); @@ -1470,7 +1472,7 @@ uint32_t dmu_flags = DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO; int err; - err = dmu_context_setup(&dmu_ctx, os, object, size, uio->uio_loffset, + err = dmu_context_init(&dmu_ctx, os, object, uio->uio_loffset, size, uio, /*dmu_cb*/NULL, /*tx*/NULL, FTAG, dmu_flags); if (err) return (err); @@ -1493,8 +1495,8 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - dmu_context_setup_dnode(&dmu_ctx, dn, size, uio->uio_loffset, - /*dmu_cb*/NULL, uio, tx, FTAG, flags); + dmu_context_init_dnode(&dmu_ctx, dn, uio->uio_loffset, size, + uio, /*dmu_cb*/NULL, tx, FTAG, flags); err = dmu_issue(&dmu_ctx); DB_DNODE_EXIT(db); @@ -1512,7 +1514,7 @@ if (size == 0) return (0); - err = dmu_context_setup(&dmu_ctx, os, object, size, uio->uio_loffset, + err = dmu_context_init(&dmu_ctx, os, object, uio->uio_loffset, size, uio, /*dmu_cb*/NULL, tx, FTAG, dmu_flags); if (err) return (err); @@ -1532,7 +1534,7 @@ if (size == 0) return (0); - err = dmu_context_setup(&dmu_ctx, os, object, size, offset, pp, + err = dmu_context_init(&dmu_ctx, os, object, offset, size, pp, /*dmu_cb*/NULL, tx, FTAG, dmu_flags); if (err) return (err); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#21 (text) ==== @@ -261,6 +261,9 @@ #define DMU_CTX_FLAG_SUN_PAGES (1 << 5) #define DMU_CTX_FLAG_NOFILL (1 << 6) +#define DMU_CTX_WRITER_FLAGS (DMU_CTX_FLAG_SUN_PAGES|DMU_CTX_FLAG_NOFILL) +#define DMU_CTX_READER_FLAGS (DMU_CTX_FLAG_PREFETCH) + /** The number of errors that occurred. 
*/ int err; @@ -297,8 +300,8 @@ } dmu_buf_set_t; void dmu_buf_set_rele(dmu_buf_set_t *buf_set, dmu_buf_t *vdb, boolean_t err); -int dmu_context_setup(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, - uint64_t size, uint64_t offset, void *data_buf, dmu_callback_t dmu_cb, +int dmu_context_init(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, + uint64_t offset, uint64_t size, void *data_buf, dmu_callback_t dmu_cb, dmu_tx_t *tx, void *tag, uint32_t flags); void dmu_context_rele(dmu_context_t *dmu_ctx); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#17 (text) ==== @@ -1277,9 +1277,8 @@ } dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); - error = dmu_context_setup(dmu_ctx, zv->zv_objset, ZVOL_OBJ, - bp->bio_length, off, bp->bio_data, zvol_dmu_cb, tx, FTAG, - dmu_flags); + error = dmu_context_init(dmu_ctx, zv->zv_objset, ZVOL_OBJ, off, + bp->bio_length, bp->bio_data, zvol_dmu_cb, tx, FTAG, dmu_flags); /* All error handling is done in the callback. */ dmu_ctx->callback_private[0] = bp; dmu_ctx->callback_private[1] = rl; Change 524450 by willa@willa_repo on 2012/02/02 18:29:29 Fix the argument order of one of the dmu_context_init_dnode() calls. In dmu_read(), return error if dmu_context_init() fails, i.e. could not place a hold on the dnode. Make sure no non-Sun OS tries to call dmu_context_init_dnode with DMU_CTX_FLAG_SUN_PAGES flag bit set. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#44 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#44 (text) ==== @@ -1216,6 +1216,9 @@ refcount_acquire(&dmu_ctx_in_flight); atomic_add_64(&dmu_ctx_total, 1); #endif +#ifndef sun + ASSERT((flags & DMU_CTX_FLAG_SUN_PAGES) == 0); +#endif /* Make sure the dnode passed in is valid. */ ASSERT(dn != NULL); @@ -1291,7 +1294,7 @@ if (err) return (err); - dmu_context_init_dnode(dmu_ctx, dn, size, offset, data_buf, dmu_cb, + dmu_context_init_dnode(dmu_ctx, dn, offset, size, data_buf, dmu_cb, tx, tag, flags); return (0); } @@ -1303,8 +1306,10 @@ int err; dmu_context_t dmu_ctx; - dmu_context_init(&dmu_ctx, os, object, offset, size, data_buf, + err = dmu_context_init(&dmu_ctx, os, object, offset, size, data_buf, /*dmu_cb*/NULL, /*tx*/NULL, FTAG, flags|DMU_CTX_FLAG_READ); + if (err) + return (err); return (dmu_issue(&dmu_ctx)); } Change 524451 by willa@willa_repo on 2012/02/02 18:55:18 Take a stab at the memory leak warnings when ZFS unloads. The DMU context list nodes (which are malloc'd) seem ok, but just in case, add a ZFS_DEBUG-only refcount for them. The leaks always seem to be N*32 bytes. 32 bytes happens to be the size of a list_t (2 size_t's plus 2 pointers), which happens to be what we stick in the TSD. Just in case dmu_thread_context_destroy() doesn't actually get called when passed as a destructor to tsd_create(), make sure FreeBSD taskqueue calls it when a taskqueue thread exits. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#45 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#11 edit Differences ... 
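For reference, the N*32 figure checks out against the stock list types. The sketch below restates the OpenSolaris sys/list_impl.h layout purely to make the arithmetic concrete; it is not part of this change, and the field comments are mine:

	/*
	 * On LP64 every field here is 8 bytes, so sizeof(list_t) is
	 * 4 * 8 = 32 bytes -- the same granularity as the reported
	 * N*32 byte leaks.  A dmu_cb_state_t is little more than an
	 * embedded list_t, which is what lives in the TSD slot.
	 */
	struct list_node {
		struct list_node *list_next;
		struct list_node *list_prev;
	};

	typedef struct list {
		size_t list_size;		/* size of each list node */
		size_t list_offset;		/* offset of list_node_t in node */
		struct list_node list_head;	/* embedded sentinel node */
	} list_t;
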
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#45 (text) ==== @@ -114,6 +114,7 @@ &name, 0, desc) #ifdef ZFS_DEBUG +SYSCTL_REFCOUNT(dcn_in_flight, "DMU context nodes in flight"); SYSCTL_COUNTER_U(dmu_ctx_total, "total number of DMU contexts"); SYSCTL_COUNTER_U(buf_set_total, "total number of buffer sets"); SYSCTL_REFCOUNT(dmu_ctx_in_flight, "number of DMU contexts in flight"); @@ -619,6 +620,9 @@ KM_SLEEP); dcn->buf_set = buf_set; list_insert_tail(list, dcn); +#ifdef ZFS_DEBUG + refcount_acquire(&dcn_in_flight); +#endif } void @@ -626,6 +630,10 @@ { list_remove(list, dcn); kmem_free(dcn, sizeof(dmu_context_node_t)); +#ifdef ZFS_DEBUG + ASSERT(dcn_in_flight > 0); + refcount_release(&dcn_in_flight); +#endif } static void ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#11 (text) ==== @@ -697,11 +697,18 @@ offsetof(spa_error_entry_t, se_avl)); } -void +static void spa_zio_thread_init(void *context __unused) { - dmu_thread_context_create(); + VERIFY(0 == dmu_thread_context_create()); +} + +static void +spa_zio_thread_destroy(void *context) +{ + + dmu_thread_context_destroy(context/*NOTUSED*/); } static taskq_t * @@ -747,7 +754,7 @@ } #endif return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, - spa->spa_proc, flags, spa_zio_thread_init, NULL)); + spa->spa_proc, flags, spa_zio_thread_init, spa_zio_thread_destroy)); } static void Change 524452 by willa@willa_repo on 2012/02/02 21:22:22 Remove the commented assign_arcbuf prototype; it's in the wrong place. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#18 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#18 (text) ==== @@ -1425,49 +1425,6 @@ uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio->uio_loffset; - /* XXX - * zfs_write() uses dbuf_assign_arcbuf() in conjunction with - * uiocopy() for recordsize write segments to avoid holding - * up transactions on page faults (see the XXX comment in - * dmu_write_uio_dnode()). This function should do - * the same thing. Prototype follows: - */ -#if 0 - arc_buf_t *buf = NULL; - - bytes = MIN(uio->uio_resid, zv->zv_volblocksize); - if (bytes > volsize - off) /* don't write past the end */ - bytes = volsize - off; - - if (bytes >= zv->zv_volblocksize && - P2PHASE(off, zv->zv_volblocksize) == 0) { - buf = dmu_request_arcbuf(zv->zv_dbuf, bytes); - if (error = uiocopy(buf->b_data, bytes, UIO_WRITE, - uio, &cbytes)) { - dmu_return_arcbuf(buf); - break; - } - } - - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - if (buf != NULL) - dbuf_return_arcbuf(buf); - break; - } - if (buf != NULL) { - dmu_assign_arcbuf(zv->zv_dbuf, off, buf, tx); - error = 0; - } else { - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); - } - if (error == 0) - zvol_log_write(zv, tx, off, bytes, sync); - dmu_tx_commit(tx); -#else dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ @@ -1486,7 +1443,6 @@ if (error) break; -#endif } zfs_range_unlock(rl); if (sync) Change 524453 by willa@willa_repo on 2012/02/02 21:28:36 Change dmu_prealloc()'s function signature to return errors. Check in zvol_prealloc() and ztest_prealloc(). Affected files ... ... 
//depot/branches/redline/projects/cow/cddl/contrib/opensolaris/cmd/ztest/ztest.c#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#46 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#22 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#19 edit Differences ... ==== //depot/branches/redline/projects/cow/cddl/contrib/opensolaris/cmd/ztest/ztest.c#3 (text) ==== @@ -1942,7 +1942,7 @@ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg != 0) { - dmu_prealloc(os, object, offset, size, tx); + (void) dmu_prealloc(os, object, offset, size, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); } else { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#46 (text) ==== @@ -1335,7 +1335,7 @@ (void) dmu_issue(&dmu_ctx); } -void +int dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { @@ -1344,14 +1344,14 @@ int err; if (size == 0) - return; + return (0); err = dmu_context_init(&dmu_ctx, os, object, offset, size, /*data_buf*/NULL, /*dmu_cb*/NULL, tx, FTAG, flags); - /* XXX the caller should really check... */ - VERIFY(err == 0); + if (err) + return (err); - (void) dmu_issue(&dmu_ctx); + return (dmu_issue(&dmu_ctx)); } /** ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#22 (text) ==== @@ -447,7 +447,7 @@ int dmu_issue(dmu_context_t *dmu_ctx); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); -void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, +int dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#19 (text) ==== @@ -712,12 +712,13 @@ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); + if (error == 0) + error = dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx); if (error) { dmu_tx_abort(tx); (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off); return (error); } - dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx); dmu_tx_commit(tx); off += bytes; resid -= bytes; Change 524454 by willa@willa_repo on 2012/02/02 21:47:10 Require the NOFILL flag if data_buf is NULL. Fix zvol_strategy so it follows the dmu_context_init rules. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#47 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#20 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#47 (text) ==== @@ -1210,7 +1210,9 @@ * \param tag Hold tag to use. * \param flags DMU context flags. * - * \note The dnode must not be NULL, and must be held. + * \note The dnode must not be NULL. + * \note The dnode must be held, unless the DMU_CTX_FLAG_NO_HOLD + * flag is specified. * \note The context may not specify a read and a transaction. */ void @@ -1237,6 +1239,8 @@ /* Make sure the flags are compatible with the I/O type. 
*/ ASSERT(reader || ((flags & DMU_CTX_READER_FLAGS) == 0)); ASSERT(!reader || ((flags & DMU_CTX_WRITER_FLAGS) == 0)); + /* The NOFILL flag and a NULL data_buf go hand in hand. */ + ASSERT((flags & DMU_CTX_FLAG_NOFILL) ^ (data_buf != NULL)); /* Make sure UIO callers pass in the correct offset. */ #if defined(_KERNEL) && defined(ZFS_DEBUG) ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#20 (text) ==== @@ -1240,7 +1240,7 @@ boolean_t doread = (bp->bio_cmd == BIO_READ); dmu_tx_t *tx = NULL; dmu_context_t *dmu_ctx = NULL; - uint32_t dmu_flags = DMU_CTX_FLAG_PREFETCH; + uint32_t dmu_flags = 0; if (zv == NULL) { g_io_deliver(bp, ENXIO); @@ -1268,7 +1268,7 @@ doread ? RL_READER : RL_WRITER); if (doread) - dmu_flags |= DMU_CTX_FLAG_READ; + dmu_flags = (DMU_CTX_FLAG_READ|DMU_CTX_FLAG_PREFETCH); else { tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_write(tx, ZVOL_OBJ, off, bp->bio_length); Change 524455 by willa@willa_repo on 2012/02/02 23:00:36 Push the DMU context up into the ZFS layer, in zfs_read(). This function implements the read(2) syscall for ZFS, and uses UIOs. Add a new API, dmu_context_seek(), which allows callers to update the context's offset, size, and target data buffer. It seems better to incorporate this as a separate API, since many DMU consumers may only call dmu_issue() once. Require all callers of dmu_context_init*() to call dmu_context_rele(). This makes it possible to hold the affected dnode only once and just keep updating and re-invoking dmu_issue(). It also makes sense that the caller of dmu_context_init*() is in fact the initiator, and should be the one that releases it. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#48 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#23 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#9 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#21 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#48 (text) ==== @@ -1104,8 +1104,8 @@ * \retval errno Various other errors, primarily ZIO calls. * \retval 0 Success. */ -int -dmu_ctx_execute_chunk(dmu_context_t *dmu_ctx) +static int +dmu_context_execute_chunk(dmu_context_t *dmu_ctx) { uint64_t io_size, nblks; dmu_buf_set_t *buf_set = NULL; @@ -1183,7 +1183,7 @@ /* While there is work left to do, execute the next chunk. */ while (dmu_ctx->size > 0 && err == 0) - err = dmu_ctx_execute_chunk(dmu_ctx); + err = dmu_context_execute_chunk(dmu_ctx); /* If a callback is specified, forward any error to it. */ if (err && dmu_ctx->dmu_cb != NULL) { @@ -1191,8 +1191,6 @@ err = 0; } ASSERT(dmu_ctx->dmu_cb != NULL || dmu_ctx->holds == 1); - /* Release initiator hold. */ - dmu_context_rele(dmu_ctx); return (err); } @@ -1242,33 +1240,10 @@ /* The NOFILL flag and a NULL data_buf go hand in hand. */ ASSERT((flags & DMU_CTX_FLAG_NOFILL) ^ (data_buf != NULL)); - /* Make sure UIO callers pass in the correct offset. */ -#if defined(_KERNEL) && defined(ZFS_DEBUG) - if (flags & DMU_CTX_FLAG_UIO) { - uio_t *uio = (uio_t *)data_buf; - ASSERT(uio->uio_loffset == offset); - } -#endif - - /* - * Deal with odd block sizes, where there can't be data past - * the first block. If we ever do the tail block optimization, - * we will need to handle that here as well. 
- */ - if (reader && dn->dn_maxblkid == 0) { - int newsz = offset > dn->dn_datablksz ? 0 : - MIN(size, dn->dn_datablksz - offset); - bzero((char *)data_buf + newsz, size - newsz); - size = newsz; - } - /* All set, actually initialize the context! */ bzero(dmu_ctx, sizeof(dmu_context_t)); dmu_ctx->dn = dn; - dmu_ctx->offset = offset; - dmu_ctx->start = offset; - dmu_ctx->size = size; - dmu_ctx->data_buf = data_buf; + dmu_context_seek(dmu_ctx, offset, size, data_buf); dmu_ctx->dmu_cb = dmu_cb; dmu_ctx->tx = tx; dmu_ctx->tag = tag; @@ -1311,6 +1286,45 @@ return (0); } +/** + * \brief Update a DMU context for the next call. + * + * \param dmu_ctx The DMU context. + * \param data_buf The updated destination data buffer. + * \param offset The offset into the dnode. + * \param size The size of the next call. + */ +void +dmu_context_seek(dmu_context_t *dmu_ctx, uint64_t offset, uint64_t size, + void *data_buf) +{ + dnode_t *dn = dmu_ctx->dn; + + /* Make sure UIO callers pass in the correct offset. */ +#if defined(_KERNEL) && defined(ZFS_DEBUG) + if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { + uio_t *uio = (uio_t *)data_buf; + ASSERT(uio->uio_loffset == offset); + } +#endif + + /* + * Deal with odd block sizes, where there can't be data past + * the first block. If we ever do the tail block optimization, + * we will need to handle that here as well. + */ + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) && dn->dn_maxblkid == 0) { + int newsz = offset > dn->dn_datablksz ? 0 : + MIN(size, dn->dn_datablksz - offset); + bzero((char *)data_buf + newsz, size - newsz); + size = newsz; + } + dmu_ctx->offset = offset; + dmu_ctx->start = offset; + dmu_ctx->size = size; + dmu_ctx->data_buf = data_buf; +} + int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *data_buf, uint32_t flags) @@ -1323,7 +1337,10 @@ if (err) return (err); - return (dmu_issue(&dmu_ctx)); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); + + return (err); } void @@ -1337,6 +1354,7 @@ /*dmu_cb*/NULL, tx, FTAG, /*flags*/0); (void) dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); } int @@ -1355,7 +1373,10 @@ if (err) return (err); - return (dmu_issue(&dmu_ctx)); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); + + return (err); } /** @@ -1494,7 +1515,9 @@ if (err) return (err); - return (dmu_issue(&dmu_ctx)); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); + return (err); } int @@ -1515,6 +1538,7 @@ dmu_context_init_dnode(&dmu_ctx, dn, uio->uio_loffset, size, uio, /*dmu_cb*/NULL, tx, FTAG, flags); err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); DB_DNODE_EXIT(db); return (err); @@ -1536,7 +1560,9 @@ if (err) return (err); - return (dmu_issue(&dmu_ctx)); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); + return (err); } #ifdef sun @@ -1556,7 +1582,9 @@ if (err) return (err); - return (dmu_issue(&dmu_ctx)); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); + return (err); } #endif /* sun */ #endif ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#23 (text) ==== @@ -303,6 +303,8 @@ int dmu_context_init(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *data_buf, dmu_callback_t dmu_cb, dmu_tx_t *tx, void *tag, uint32_t flags); +void dmu_context_seek(dmu_context_t *dmu_ctx, uint64_t offset, uint64_t size, + void *data_buf); void dmu_context_rele(dmu_context_t *dmu_ctx); /* DMU thread context handlers. 
*/ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#9 (text) ==== @@ -561,10 +561,11 @@ znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; - ssize_t n, nbytes; + ssize_t n; int error; rl_t *rl; xuio_t *xuio = NULL; + dmu_context_t dmu_ctx; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -657,19 +658,27 @@ } #endif /* sun */ + error = dmu_context_init(&dmu_ctx, os, zp->z_id, uio->uio_loffset, n, + uio, /*dmu_cb*/NULL, /*tx*/NULL, FTAG, + DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO); + if (error) + goto out; + while (n > 0) { - nbytes = MIN(n, zfs_read_chunk_size - + ssize_t sz = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); #ifdef __FreeBSD__ if (uio->uio_segflg == UIO_NOCOPY) - error = mappedread_sf(vp, nbytes, uio); + error = mappedread_sf(vp, sz, uio); else #endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) - error = mappedread(vp, nbytes, uio); - else - error = dmu_read_uio(os, zp->z_id, uio, nbytes); + error = mappedread(vp, sz, uio); + else { + dmu_context_seek(&dmu_ctx, uio->uio_loffset, sz, uio); + error = dmu_issue(&dmu_ctx); + } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -677,8 +686,9 @@ break; } - n -= nbytes; + n -= sz; } + dmu_context_rele(&dmu_ctx); out: zfs_range_unlock(rl); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#21 (text) ==== @@ -1288,10 +1288,9 @@ if (error == 0) { /* Pump primed, issue the I/O to the DMU. */ (void) dmu_issue(dmu_ctx); - } else { + } else dmu_ctx->err = error; - dmu_context_rele(dmu_ctx); - } + dmu_context_rele(dmu_ctx); return (0); } Change 525783 by kenm@ken.spectrabsd5 on 2012/02/07 10:58:28 Checkpoint working dual-mode zvols. zvols can now be accessed via /dev/gzvol/... which is the GEOM path, or /dev/zvol/... which is the non-GEOM path. This still needs more cleanup and testing, but the two paths seem to work for reading and writing. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#22 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#22 (text) ==== @@ -90,6 +90,21 @@ .version = G_VERSION, }; +static d_open_t zvol_open; +static d_close_t zvol_close; +static d_strategy_t zvol_strategy; + +struct cdevsw zfs_zvol_cdevsw = { + .d_version = D_VERSION, + .d_flags = 0, + .d_name = "zvol", + .d_open = zvol_open, + .d_close = zvol_close, + .d_strategy = zvol_strategy, + .d_read = physread, + .d_write = physwrite, +}; + DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); /** @@ -119,6 +134,7 @@ uint64_t zv_volsize; /**< amount of space we advertise */ uint64_t zv_volblocksize; /**< volume block size */ struct g_provider *zv_provider; /**< GEOM provider */ + struct cdev *zv_dev; /**< DEVFS device */ uint8_t zv_min_bs; /**< minimum addressable block shift */ uint8_t zv_flags; /**< readonly, dumpified, etc. */ objset_t *zv_objset; /**< objset handle */ @@ -873,29 +889,14 @@ return (error); } -/*ARGSUSED*/ +/* + * Assumptions: zv != NULL and spa_namespace_lock is held. + */ static int -zvol_open(struct g_provider *pp, int flag, int count) +zvol_common_open(zvol_state_t *zv, int flag) { - zvol_state_t *zv; - int err = 0; + int err; - if (MUTEX_HELD(&spa_namespace_lock)) { - /* - * If the spa_namespace_lock is being held, it means that ZFS - * is trying to open ZVOL as its VDEV. This is not supported. 
- */ - return (EOPNOTSUPP); - } - - mutex_enter(&spa_namespace_lock); - - zv = pp->private; - if (zv == NULL) { - mutex_exit(&spa_namespace_lock); - return (ENXIO); - } - if (zv->zv_total_opens == 0) err = zvol_first_open(zv); if (err) { @@ -920,7 +921,8 @@ } #endif - zv->zv_total_opens += count; + /* XXX KDM need to figure out whether to track the count */ + zv->zv_total_opens++; mutex_exit(&spa_namespace_lock); return (err); @@ -929,14 +931,23 @@ zvol_last_close(zv); mutex_exit(&spa_namespace_lock); return (err); + } /*ARGSUSED*/ static int -zvol_close(struct g_provider *pp, int flag, int count) +zvol_geom_open(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; - int error = 0; + int err = 0; + + if (MUTEX_HELD(&spa_namespace_lock)) { + /* + * If the spa_namespace_lock is being held, it means that ZFS + * is trying to open ZVOL as its VDEV. This is not supported. + */ + return (EOPNOTSUPP); + } mutex_enter(&spa_namespace_lock); @@ -946,6 +957,38 @@ return (ENXIO); } + return (zvol_common_open(zv, flag)); +} + +static int +zvol_open(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv; + int err = 0; + + if (MUTEX_HELD(&spa_namespace_lock)) { + /* + * If the spa_namespace_lock is being held, it means that ZFS + * is trying to open ZVOL as its VDEV. This is not supported. + */ + return (EOPNOTSUPP); + } + + mutex_enter(&spa_namespace_lock); + + zv = (zvol_state_t *)dev->si_drv1; + if (zv == NULL) { + mutex_exit(&spa_namespace_lock); + return (ENXIO); + } + + return (zvol_common_open(zv, flags)); +} + +static int +zvol_common_close(zvol_state_t *zv) +{ + if (zv->zv_flags & ZVOL_EXCL) { ASSERT(zv->zv_total_opens == 1); zv->zv_flags &= ~ZVOL_EXCL; @@ -957,16 +1000,50 @@ */ ASSERT(zv->zv_total_opens != 0); + zv->zv_total_opens = 0; + /* * You may get multiple opens, but only one close. */ - zv->zv_total_opens -= count; + zvol_last_close(zv); + + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +zvol_geom_close(struct g_provider *pp, int flag, int count) +{ + zvol_state_t *zv; + + mutex_enter(&spa_namespace_lock); + + zv = pp->private; + if (zv == NULL) { + mutex_exit(&spa_namespace_lock); + /* XXX KDM should we just return 0 instead? */ + return (ENXIO); + } + return (zvol_common_close(zv)); +} + +static int +zvol_close(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv; + + mutex_enter(&spa_namespace_lock); - if (zv->zv_total_opens == 0) - zvol_last_close(zv); + zv = (zvol_state_t *)dev->si_drv1; + if (zv == NULL) { + mutex_exit(&spa_namespace_lock); + /* XXX KDM should we just return 0 instead? */ + return (ENXIO); + } - mutex_exit(&spa_namespace_lock); - return (error); + return (zvol_common_close(zv)); } static void @@ -1225,14 +1302,17 @@ } else bp->bio_error = (dmu_ctx->err == 0) ? 0 : EIO; zfs_range_unlock(rl); - g_io_deliver(bp, 0); + if (bp->bio_to != NULL) + g_io_deliver(bp, 0); + else + bp->bio_done(bp); kmem_free(dmu_ctx, sizeof(dmu_context_t)); } -int -zvol_strategy(struct bio *bp) +static void +zvol_common_strategy(struct bio *bp, int geom_mode) { - zvol_state_t *zv = bp->bio_to->private; + zvol_state_t *zv; uint64_t off; objset_t *os; rl_t *rl; @@ -1242,22 +1322,51 @@ dmu_context_t *dmu_ctx = NULL; uint32_t dmu_flags = 0; + /* XXX KDM may be able to consolidate this into the non-GEOM case. 
*/ + if (geom_mode != 0) + zv = bp->bio_to->private; + else { + zv = bp->bio_dev->si_drv1; + bp->bio_to = NULL; + } + if (zv == NULL) { - g_io_deliver(bp, ENXIO); - return (0); + error = ENXIO; + + if (geom_mode != 0) + g_io_deliver(bp, error); + else { + bp->bio_error = error; + bp->bio_done(bp); + } + return; } if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) { - g_io_deliver(bp, EROFS); - return (0); + error = EROFS; + + if (geom_mode != 0) + g_io_deliver(bp, error); + else { + bp->bio_error = error; + bp->bio_done(bp); + } + return; } off = bp->bio_offset; ASSERT(zv->zv_objset != NULL); if (bp->bio_length > 0 && (off < 0 || off >= zv->zv_volsize)) { - g_io_deliver(bp, EIO); - return (0); + error = EIO; + + if (geom_mode != 0) + g_io_deliver(bp, error); + else { + bp->bio_error = error; + bp->bio_done(bp); + } + return; } /* @@ -1291,8 +1400,18 @@ } else dmu_ctx->err = error; dmu_context_rele(dmu_ctx); +} + +static void +zvol_strategy(struct bio *bp) +{ + zvol_common_strategy(bp, /*geom_mode*/ 0); +} - return (0); +static void +zvol_geom_strategy(struct bio *bp) +{ + zvol_common_strategy(bp, /*geom_mode*/ 1); } #ifdef sun @@ -1982,7 +2101,7 @@ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); gp->start = zvol_geom_start; gp->access = zvol_geom_access; - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); + pp = g_new_providerf(gp, "g%s/%s", ZVOL_DRIVER, name); pp->sectorsize = DEV_BSIZE; zv = kmem_zalloc(sizeof(*zv), KM_SLEEP); @@ -1993,6 +2112,10 @@ pp->private = zv; + zv->zv_dev = make_dev(&zfs_zvol_cdevsw, /*unit*/ 0, UID_ROOT, + GID_OPERATOR, 0600, "%s/%s", ZVOL_DRIVER, name); + zv->zv_dev->si_drv1 = zv; + return (zv); } @@ -2027,6 +2150,8 @@ pp->private = NULL; g_wither_geom(pp->geom, ENXIO); + destroy_dev_sched(zv->zv_dev); + kmem_free(zv, sizeof(*zv)); } @@ -2073,9 +2198,9 @@ g_topology_unlock(); if (count > 0) - error = zvol_open(pp, flags, count); + error = zvol_geom_open(pp, flags, count); else - error = zvol_close(pp, flags, -count); + error = zvol_geom_close(pp, flags, -count); g_topology_lock(); return (error); } @@ -2140,7 +2265,7 @@ break; case BIO_READ: case BIO_WRITE: - zvol_strategy(bp); + zvol_geom_strategy(bp); break; } } @@ -2252,6 +2377,11 @@ return (0); } +/* + * XXX KDM + * These are used by ZFS to rename devices. Need to port them to straight + * devfs routines. + */ static void zvol_rename_minor(struct g_geom *gp, const char *newname) { @@ -2269,7 +2399,7 @@ zv->zv_provider = NULL; g_wither_provider(pp, ENXIO); - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); + pp = g_new_providerf(gp, "g%s/%s", ZVOL_DRIVER, newname); pp->sectorsize = DEV_BSIZE; pp->mediasize = zv->zv_volsize; pp->private = zv; Change 525786 by willa@willa_repo on 2012/02/07 12:07:41 Clean up the DMU a bit more, upon Justin's review. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c: - Change dbuf_hold_impl() to accept another argument which is a dmu_buf_set_t associated with the hold action. If one is specified and it is a reader, then either: - If the dbuf is cached, release the buffer set. - Otherwise, add the buffer set to the dbuf's list. - This is done so that dmu_buf_set_setup_buffers() can pass the buffer set to dbuf_hold_impl() to have it do this while it is holding the dbuf mutex, instead of re-acquiring/re-dropping. 
- While I'm here, replace some calls of dbuf_hold_impl() with dbuf_hold() or dbuf_hold_level() as appropriate. - Update other dbuf_hold_impl() direct calls to pass in a NULL buf_set. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: - Replace the block inside dmu_buf_set_transfer() which handles the data move, with a new function, dmu_buf_move(). - Remove the dbuf argument for dmu_buf_set_rele(). Previously this function handled cleaning up the dbufs for them, but now that is handled by the dbuf layer before calling into here. - Rename dmu_buf_set_dispatch() to dmu_buf_set_complete(). - Make dmu_buf_set_init() call dmu_buf_set_setup_buffers() instead of having them be two separate steps. dmu_buf_set_init() now returns errors from that function. - After combining those two functions, it no longer made sense to have dmu_context_execute_chunk() be a separate function. Pull it into dmu_issue() and nuke it. - Since dmu_issue() always calls dmu_buf_set_rele() for each call to dmu_buf_set_init(), it is no longer necessary to explicitly report errors to the context here. - Fold a buffer set's zio and chunk size into its structure. - Fix dmu_thread_context_create() so that it calls malloc(9) with M_NOWAIT; this is needed since the taskqueue mutex is held when this function gets executed. Reported by Ken. - In dmu_buf_set_setup_buffers(), because the function is serialized, no parallelism occurs relative to the buffer set passed in, and therefore atomics are not needed when operating on the buffer set. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h: - Remove the db_states DMU context variable & replace it with a static check for DB_UNCACHED|DB_CACHED, which is basically already hardcoded into the behavior of the callback API implementation. - Instead of checking for associated buffer sets to process on every state change, check in two places: at the end of dbuf_read_complete() or in the error case of dbuf_read_done(). sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h: - Rename dmu_buf_impl_t.db_dmu_contexts to db_dmu_buf_sets to more accurately reflect the list's contents. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c: - Make it clearer when dmu_thread_context_process() gets called. Make sure it gets called in the other exit path, just in case any buffer sets are placed on the thread's list before the exit occurs. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#64 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#49 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#9 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#10 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#8 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#28 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#24 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c#5 edit Differences ... 
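For reference, the hold-time registration this change adds boils down to the following (condensed from the dbuf_hold_impl() hunk below; the rest of the hold logic and error paths are omitted):

	/* With db->db_mtx held, just before dbuf_hold_impl() returns: */
	if (buf_set != NULL &&
	    (buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ)) {
		if (db->db_state == DB_CACHED) {
			/* Already valid; drop this dbuf's hold on the set. */
			dmu_buf_set_rele(buf_set, B_FALSE);
		} else {
			/*
			 * Defer until the read resolves; dbuf_read_complete()
			 * or dbuf_read_done() walks db_dmu_buf_sets and
			 * releases the set on the state change.
			 */
			dmu_context_node_add(&db->db_dmu_buf_sets, buf_set);
		}
	}

Writers never take this path: for writes the buffer set carries only the initiator's hold, so there is nothing for the dbuf to call back.
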
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#64 (text) ==== @@ -92,8 +92,6 @@ SYSCTL_COUNTER_U(dirty_ranges_total, "number of total dirty ranges"); SYSCTL_COUNTER_U(dirty_writes_lost, "dirty writes lost"); -SYSCTL_COUNTER_U(dirty_buffers_already_cached, "dirty buffers already cached"); -SYSCTL_COUNTER_U(dmu_notifications, "DMU notifications"); static uint64_t dbuf_hash_count; @@ -116,29 +114,9 @@ return (crc); } -static void -dbuf_process_dmu_contexts(dmu_buf_impl_t *db, boolean_t err) -{ - dmu_context_node_t *dcn, *next; - - for (dcn = list_head(&db->db_dmu_contexts); dcn != NULL; dcn = next) { - next = list_next(&db->db_dmu_contexts, dcn); - if (dcn->buf_set->dmu_ctx->db_states & db->db_state) { - atomic_add_64(&dmu_notifications, 1); - dmu_buf_set_rele(dcn->buf_set, &db->db, err); - dmu_context_node_remove(&db->db_dmu_contexts, dcn); - } - } -} - -#define DBUF_STATE_CHANGE_COMMON(db, op, state, err, why) \ +#ifdef ZFS_DEBUG +#define DBUF_STATE_CHANGE(db, op, state, why) do { \ (db)->db_state op state; \ - if (!list_is_empty(&(db)->db_dmu_contexts)) \ - dbuf_process_dmu_contexts(db, err) - -#ifdef ZFS_DEBUG -#define DBUF_STATE_CHANGE(db, op, state, err, why) do { \ - DBUF_STATE_CHANGE_COMMON(db, op, state, err, why); \ if (zfs_flags & ZFS_DEBUG_DBUF_STATE) { \ uint64_t __db_obj = (db)->db.db_object; \ char __db_buf[32]; \ @@ -155,8 +133,8 @@ } \ } while(0) #else -#define DBUF_STATE_CHANGE(db, op, state, err, why) do { \ - DBUF_STATE_CHANGE_COMMON(db, op, state, err, why); \ +#define DBUF_STATE_CHANGE(db, op, state, why) do { \ + (db)->db_state op state; \ } while(0) #endif @@ -521,8 +499,7 @@ dbuf_evict_user(db); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, B_FALSE, - "set data"); + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "set data"); } } @@ -728,6 +705,22 @@ } static void +dbuf_process_buf_sets(dmu_buf_impl_t *db, boolean_t err) +{ + dmu_context_node_t *dcn, *next; + + for (dcn = list_head(&db->db_dmu_buf_sets); dcn != NULL; dcn = next) { + next = list_next(&db->db_dmu_buf_sets, dcn); + dmu_buf_set_rele(dcn->buf_set, err); + dmu_context_node_remove(&db->db_dmu_buf_sets, dcn); + } +} +#define DBUF_PROCESS_BUF_SETS(db, err) do { \ + if (!list_is_empty(&(db)->db_dmu_buf_sets)) \ + dbuf_process_buf_sets(db, err); \ +} while (0) + +static void dbuf_read_complete(dmu_buf_impl_t *db, arc_buf_t *buf) { if (db->db_level == 0 && db->db_dirtycnt > 0) { @@ -750,7 +743,7 @@ * to cached. */ ASSERT(db->db_buf != NULL); - DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, + DBUF_STATE_CHANGE(db, =, DB_CACHED, "resolve of records in READ state"); } @@ -771,7 +764,7 @@ * read and transition to DB_CACHED. 
*/ dbuf_set_data(db, buf); - DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, + DBUF_STATE_CHANGE(db, =, DB_CACHED, "read completed with no dirty records"); } else { /* @@ -783,6 +776,7 @@ arc_release(buf, db); VERIFY(arc_buf_remove_ref(buf, db) == 1); } + DBUF_PROCESS_BUF_SETS(db, B_FALSE); } static void @@ -826,8 +820,8 @@ } else { ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, - B_TRUE, "read failed"); + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "read failed"); + DBUF_PROCESS_BUF_SETS(db, B_TRUE); } VERIFY(arc_buf_remove_ref(buf, db) == 1); } @@ -859,7 +853,7 @@ if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); dbuf_update_data(db); - DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, "bonus buffer filled"); + DBUF_STATE_CHANGE(db, =, DB_CACHED, "bonus buffer filled"); return (TRUE); } @@ -899,8 +893,7 @@ buf = arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, DBUF_GET_BUFC_TYPE(db)); bzero(buf->b_data, db->db.db_size); - DBUF_STATE_CHANGE(db, =, DB_READ, B_FALSE, - "hole read satisfied"); + DBUF_STATE_CHANGE(db, =, DB_READ, "hole read satisfied"); dbuf_read_complete(db, buf); return (TRUE); } @@ -971,7 +964,7 @@ DB_DNODE_EXIT(db); - DBUF_STATE_CHANGE(db, =, DB_READ, B_FALSE, "read issued"); + DBUF_STATE_CHANGE(db, =, DB_READ, "read issued"); mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) @@ -1386,7 +1379,7 @@ /* Now clear the contents. */ bzero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); - DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, + DBUF_STATE_CHANGE(db, =, DB_CACHED, "dbuf has been freed"); } @@ -1915,8 +1908,7 @@ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_state & (DB_UNCACHED|DB_NOFILL|DB_CACHED)); dbuf_set_data(db, NULL); - DBUF_STATE_CHANGE(db, =, DB_NOFILL, - B_FALSE, "allocating NOFILL buffer"); + DBUF_STATE_CHANGE(db, =, DB_NOFILL, "allocating NOFILL buffer"); } else if (how == DB_FILL) { if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); @@ -1943,17 +1935,15 @@ dbuf_set_data(db, fill_buf); if (size != db->db.db_size) DBUF_STATE_CHANGE(db, =, - (DB_PARTIAL|DB_FILL), B_FALSE, + (DB_PARTIAL|DB_FILL), "notifying of an initial " "partial fill"); else DBUF_STATE_CHANGE(db, =, DB_FILL, - B_FALSE, "notifying of a complete fill"); - } else - atomic_add_64(&dirty_buffers_already_cached, 1); + } } else if (db->db_state & (DB_READ|DB_PARTIAL)) { - DBUF_STATE_CHANGE(db, |=, DB_FILL, B_FALSE, + DBUF_STATE_CHANGE(db, |=, DB_FILL, "notifying of a followup partial fill"); } else { /* No wait on FILL is done for indirect blocks. */ @@ -2088,7 +2078,7 @@ dbuf_dirty_record_add_range(dr, offset, size); if ((db->db_state & DB_FILL) && list_is_empty(&dr->dt.dl.write_ranges)) - DBUF_STATE_CHANGE(db, =, DB_FILL, B_FALSE, + DBUF_STATE_CHANGE(db, =, DB_FILL, "writer fully filled"); } @@ -2426,7 +2416,7 @@ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; dbuf_dirty_record_cleanup_ranges(dr); - DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, + DBUF_STATE_CHANGE(db, =, DB_CACHED, "fill done handling freed in flight"); } else { /* @@ -2436,10 +2426,10 @@ * FILL bit, so it goes back to the steady state. 
*/ if (db->db_state == DB_FILL) - DBUF_STATE_CHANGE(db, =, DB_CACHED, B_FALSE, + DBUF_STATE_CHANGE(db, =, DB_CACHED, "filler finished, complete buffer"); else { - DBUF_STATE_CHANGE(db, &=, ~DB_FILL, B_FALSE, + DBUF_STATE_CHANGE(db, &=, ~DB_FILL, "filler finished, incomplete buffer"); ASSERT(db->db_state & (DB_PARTIAL|DB_READ)); } @@ -2531,16 +2521,14 @@ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, B_FALSE, - "buffer cleared"); + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "buffer cleared"); } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); ASSERT(list_is_empty(&db->db_dirty_records)); - DBUF_STATE_CHANGE(db, =, DB_EVICTING, B_FALSE, - "buffer eviction started"); + DBUF_STATE_CHANGE(db, =, DB_EVICTING, "buffer eviction started"); db->db_blkptr = NULL; DB_DNODE_ENTER(db); @@ -2619,7 +2607,7 @@ } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err = dbuf_hold_impl(dn, level+1, - blkid >> epbs, fail_sparse, NULL, parentp); + blkid >> epbs, fail_sparse, NULL, parentp, NULL); if (err) return (err); err = dbuf_read(*parentp, NULL, @@ -2661,7 +2649,7 @@ list_create(&db->db_dirty_records, sizeof(dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, db_dirty_record_link)); - list_create(&db->db_dmu_contexts, sizeof(dmu_context_node_t), + list_create(&db->db_dmu_buf_sets, sizeof(dmu_context_node_t), offsetof(dmu_context_node_t, dcn_link)); db->db_objset = os; @@ -2685,8 +2673,7 @@ (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, B_FALSE, - "bonus buffer created"); + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "bonus buffer created"); /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); @@ -2717,8 +2704,7 @@ return (odb); } list_insert_head(&dn->dn_dbufs, db); - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, B_FALSE, - "regular buffer created"); + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "regular buffer created"); mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); @@ -2793,7 +2779,7 @@ db->db_parent = NULL; db->db_buf = NULL; list_destroy(&db->db_dirty_records); - list_destroy(&db->db_dmu_contexts); + list_destroy(&db->db_dmu_buf_sets); ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); @@ -2858,11 +2844,12 @@ /** * \brief Returns with db_holds incremented, and db_mtx not held. * + * \note buf_set may be NULL. * \note dn_struct_rwlock must be held. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, - void *tag, dmu_buf_impl_t **dbp) + void *tag, dmu_buf_impl_t **dbp, dmu_buf_set_t *buf_set) { dmu_buf_impl_t *db, *parent = NULL; @@ -2934,6 +2921,14 @@ (void) refcount_add(&db->db_holds, tag); dbuf_update_data(db); DBUF_VERIFY(db); + /* If a reading buffer set is associated, add the callback now. */ + if (buf_set != NULL && (buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ)) { + if (db->db_state == DB_CACHED) { + /* Dbuf is already at the desired state. 
*/ + dmu_buf_set_rele(buf_set, B_FALSE); + } else + dmu_context_node_add(&db->db_dmu_buf_sets, buf_set); + } mutex_exit(&db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ @@ -2951,16 +2946,15 @@ dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { - dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); - return (err ? NULL : db); + return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); + int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db, + /*buf_set*/NULL); return (err ? NULL : db); } @@ -3256,8 +3250,8 @@ if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); - (void) dbuf_hold_impl(dn, db->db_level+1, - db->db_blkid >> epbs, FALSE, db, &parent); + parent = dbuf_hold_level(dn, db->db_level + 1, + db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#49 (text) ==== @@ -694,6 +694,25 @@ #endif } +static void +dmu_buf_move(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, uint64_t sz) +{ + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + + if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) + dmu_buf_move_uio(dmu_ctx, db, off, sz); + else if (dmu_ctx->flags & DMU_CTX_FLAG_SUN_PAGES) + dmu_buf_move_pages(dmu_ctx, db, off, sz); + else { + uint64_t dataoff = db->db_offset - dmu_ctx->start + off; + char *data = (char *)dmu_ctx->data_buf + dataoff; + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) + bcopy((char *)db->db_data + off, data, sz); + else + bcopy(data, (char *)db->db_data + off, sz); + } +} + /** * \brief Perform a buffer set read for a char * target buffer. * @@ -702,7 +721,6 @@ static void dmu_buf_set_transfer(dmu_buf_set_t *buf_set) { - char *data; uint64_t offset, size; dmu_context_t *dmu_ctx = buf_set->dmu_ctx; dmu_tx_t *tx = dmu_ctx->tx; @@ -711,17 +729,13 @@ /* Having a transaction and being a reader is not supported. */ ASSERT(tx != NULL || (dmu_ctx->flags & DMU_CTX_FLAG_READ)); - /* - * Initialize the current state. Note that for special (non-char *) - * data pointers, 'data' is not used. - */ - data = (char *)dmu_ctx->data_buf + buf_set->offset - dmu_ctx->start; + /* Initialize the current state. */ size = buf_set->size; offset = buf_set->offset; /* Perform the I/O copy, one buffer at a time. */ for (i = 0; i < buf_set->count; i++) { - int off, sz; + uint64_t off, sz; dmu_buf_t *db = buf_set->dbp[i]; if (dmu_ctx->flags & DMU_CTX_FLAG_NOFILL) { @@ -743,17 +757,7 @@ dmu_buf_will_dirty_range(db, tx, off, sz); } - if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) - dmu_buf_move_uio(dmu_ctx, db, off, sz); - else if (dmu_ctx->flags & DMU_CTX_FLAG_SUN_PAGES) - dmu_buf_move_pages(dmu_ctx, db, off, sz); - else { - if (dmu_ctx->flags & DMU_CTX_FLAG_READ) - bcopy((char *)db->db_data + off, data, sz); - else - bcopy(data, (char *)db->db_data + off, sz); - data = (char *)data + sz; - } + dmu_buf_move(buf_set, db, off, sz); if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) dmu_buf_fill_done(db, tx); @@ -794,8 +798,8 @@ * * \param buf_set Buffer set to handle. 
*/ -void -dmu_buf_set_dispatch(dmu_buf_set_t *buf_set) +static void +dmu_buf_set_complete(dmu_buf_set_t *buf_set) { int i; dmu_context_t *dmu_ctx = buf_set->dmu_ctx; @@ -834,7 +838,8 @@ ASSERT(dcs == NULL); #endif - dcs = kmem_zalloc(sizeof(dmu_cb_state_t), KM_SLEEP); + /* Called with taskqueue mutex held. */ + dcs = kmem_zalloc(sizeof(dmu_cb_state_t), KM_NOSLEEP); list_create(&dcs->io_list, sizeof(dmu_context_node_t), offsetof(dmu_context_node_t, dcn_link)); return tsd_set(zfs_async_io_key, dcs); @@ -876,7 +881,7 @@ for (dcn = list_head(&dcs->io_list); dcn != NULL; dcn = next) { next = list_next(&dcs->io_list, dcn); - dmu_buf_set_dispatch(dcn->buf_set); + dmu_buf_set_complete(dcn->buf_set); dmu_context_node_remove(&dcs->io_list, dcn); } } @@ -885,20 +890,14 @@ * \brief Release a buffer set for a given dbuf. * * \param buf_set Buffer set to release. - * \param vdb DMU buffer to release for. * \param err Whether an error occurred. * * \invariant If specified, the dbuf's mutex must be held. */ void -dmu_buf_set_rele(dmu_buf_set_t *buf_set, dmu_buf_t *vdb, boolean_t err) +dmu_buf_set_rele(dmu_buf_set_t *buf_set, boolean_t err) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)vdb; - /* XXX seems like there should be more done with the dbuf here. */ - if (db != NULL) - ASSERT(MUTEX_HELD(&db->db_mtx)); - /* Report an error, if any. */ if (err) atomic_add_int(&buf_set->err, 1); @@ -917,56 +916,21 @@ * registered for this TSD, so it must not handle * queued delivery. Dispatch this set now. */ - dmu_buf_set_dispatch(buf_set); + dmu_buf_set_complete(buf_set); } } } -static void -dmu_buf_set_init(dmu_context_t *dmu_ctx, dmu_buf_set_t **buf_set_p, int nblks) -{ - dmu_buf_set_t *buf_set; - size_t set_size; - - ASSERT(dmu_ctx != NULL); - ASSERT(dmu_ctx->holds > 0); - - /* Create the new buffer set. */ - set_size = sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *); - buf_set = kmem_zalloc(set_size, KM_SLEEP); - refcount_acquire(&dmu_ctx->holds); - - /* Initialize a new buffer set. */ -#ifdef ZFS_DEBUG - refcount_acquire(&buf_set_in_flight); - atomic_add_64(&buf_set_total, 1); -#endif - buf_set->offset = dmu_ctx->offset; - buf_set->count = nblks; - buf_set->blocks_allocated = nblks; - /* Include a refcount for the initiator. */ - if (dmu_ctx->flags & DMU_CTX_FLAG_READ) - refcount_init(&buf_set->holds, nblks + 1); - else - /* For writes, dbufs never need to call us back. */ - refcount_init(&buf_set->holds, 1); - buf_set->dmu_ctx = dmu_ctx; - *buf_set_p = buf_set; -} - /** * \brief Set up the buffers for a given set. * * \param buf_set Buffer set to set up buffers for. - * \param zio Parent ZIO to issue I/O's with, as needed. - * \param io_size Total I/O size for this buffer set. * - * \retval EIO If any buffer could not be held for this buffer set. - * \retval 0 Success. + * \retval errno If any buffer could not be held for this buffer set. + * \retval 0 Success. 
*/ static int -dmu_buf_set_setup_buffers(dmu_buf_set_t *buf_set, zio_t *zio, - uint64_t io_size) +dmu_buf_set_setup_buffers(dmu_buf_set_t *buf_set) { dmu_context_t *dmu_ctx = buf_set->dmu_ctx; dnode_t *dn = dmu_ctx->dn; @@ -976,26 +940,31 @@ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; if ((dmu_ctx->flags & DMU_CTX_FLAG_PREFETCH) == 0 || - io_size > zfetch_array_rd_sz) + buf_set->size > zfetch_array_rd_sz) dbuf_flags |= DB_RF_NOPREFETCH; blkid = dbuf_whichblock(dn, dmu_ctx->offset); + /* + * Note that while this loop is running, any zio's set up for async + * reads are not executing, therefore access to this buf_set is + * serialized within this function; i.e. atomics are not needed here. + */ for (i = 0; i < buf_set->count; i++) { - dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, dmu_ctx->tag); - int bufoff, bufsiz; + dmu_buf_impl_t *db = NULL; + int err = dbuf_hold_impl(dn, /*level*/0, blkid + i, + /*fail_sparse*/FALSE, dmu_ctx->tag, &db, buf_set); + uint64_t bufoff, bufsiz; if (db == NULL) { - int blocks_held = i; - /* Fix up refcount & count. */ - for (;i < buf_set->count;i++) - refcount_release(&buf_set->holds); - buf_set->count = blocks_held; - zio_nowait(zio); - return (EIO); + /* Only include counts for the processed buffers. */ + buf_set->count = i; + buf_set->holds = i + 1 /*initiator*/; + zio_nowait(buf_set->zio); + return (err); } /* initiate async i/o */ if (dmu_ctx->flags & DMU_CTX_FLAG_READ) - (void) dbuf_read(db, zio, dbuf_flags); + (void) dbuf_read(db, buf_set->zio, dbuf_flags); #ifdef _KERNEL else curthread->td_ru.ru_oublock++; @@ -1004,48 +973,107 @@ /* Calculate the amount of data this buffer contributes. */ ASSERT(dmu_ctx->offset >= db->db.db_offset); bufoff = dmu_ctx->offset - db->db.db_offset; - bufsiz = (int)MIN(db->db.db_size - bufoff, io_size); - buf_set->size += bufsiz; - io_size -= bufsiz; + bufsiz = (int)MIN(db->db.db_size - bufoff, buf_set->resid); + buf_set->resid -= bufsiz; /* Update the caller's data to let them know what's next. */ dmu_ctx->offset += bufsiz; dmu_ctx->size -= bufsiz; + /* Put this dbuf in the buffer set's list. */ + buf_set->dbp[i] = &db->db; + } + return (0); +} + +/** + * \brief Initialize a buffer set of a certain size. + * + * \param dmu_ctx DMU context to associate the buffer set with. + * \param buf_set_p Pointer to set to the new buffer set's address. + * \param size Requested size of the buffer set. + * + * \retval 0 Success. + * \retval EIO I/O error: tried to access past the end of the dnode, + * or dmu_buf_set_setup_buffers() failed. + */ +static int +dmu_buf_set_init(dmu_context_t *dmu_ctx, dmu_buf_set_t **buf_set_p, + uint64_t size) +{ + dmu_buf_set_t *buf_set; + size_t set_size; + int err, nblks; + dnode_t *dn = dmu_ctx->dn; + + ASSERT(dmu_ctx != NULL); + ASSERT(dmu_ctx->holds > 0); - /* Make sure dbufs that don't notify DMU are caught here. */ - if (dmu_ctx->flags & DMU_CTX_FLAG_READ) { - mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { - /* - * This buffer's already done. Don't check - * for DB_UNCACHED here because that only - * indicates an initialized buffer. - */ - ASSERT(buf_set->holds > 1); - refcount_release(&buf_set->holds); - } else { - /* Let the dbuf know this context needs it. */ - dmu_context_node_add(&db->db_dmu_contexts, - buf_set); - } - /* NB: all dbufs may have completed at this point! */ - mutex_exit(&db->db_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + /* Figure out how many blocks are needed for the requested size. 
*/ + if (dn->dn_datablkshift) { + ASSERT3U(dn->dn_datablksz, ==, 1 << dn->dn_datablkshift); + nblks = P2ROUNDUP(dmu_ctx->offset + size, dn->dn_datablksz); + nblks -= P2ALIGN(dmu_ctx->offset, dn->dn_datablksz); + nblks >>= dn->dn_datablkshift; + } else { + if ((dmu_ctx->offset + size) > dn->dn_datablksz) { + zfs_panic_recover("zfs: accessing past end of object " + "%llx/%llx (size=%u access=%llu+%llu)", + (longlong_t)dn->dn_objset-> + os_dsl_dataset->ds_object, + (longlong_t)dn->dn_object, dn->dn_datablksz, + (longlong_t)dmu_ctx->offset, + (longlong_t)size); + err = EIO; + goto out; } - buf_set->dbp[i] = &db->db; + nblks = 1; } - return (0); + + /* Create the new buffer set. */ + set_size = sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *); + buf_set = kmem_zalloc(set_size, KM_SLEEP); + refcount_acquire(&dmu_ctx->holds); + + /* Initialize a new buffer set. */ +#ifdef ZFS_DEBUG + refcount_acquire(&buf_set_in_flight); + atomic_add_64(&buf_set_total, 1); +#endif + buf_set->size = size; + buf_set->resid = size; + buf_set->offset = dmu_ctx->offset; + buf_set->count = nblks; + buf_set->blocks_allocated = nblks; + /* Include a refcount for the initiator. */ + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) + refcount_init(&buf_set->holds, nblks + 1); + else + /* For writes, dbufs never need to call us back. */ + refcount_init(&buf_set->holds, 1); + buf_set->dmu_ctx = dmu_ctx; + buf_set->zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + *buf_set_p = buf_set; + + /* Set up the buffers. */ + err = dmu_buf_set_setup_buffers(buf_set); + +out: + rw_exit(&dn->dn_struct_rwlock); + return (err); } /** * \brief Process the I/Os queued for a given buffer set. * * \param buf_set Buffer set to process I/Os for. - * \param zio Parent ZIO to watch. * * \retval errno Errors from zio_wait or a buffer went UNCACHED. * \retval 0 Success. */ static int -dmu_buf_set_process_io(dmu_buf_set_t *buf_set, zio_t *zio) +dmu_buf_set_process_io(dmu_buf_set_t *buf_set) { int err, i, syncing; dsl_pool_t *dp = NULL; @@ -1061,7 +1089,7 @@ */ if (dmu_ctx->dmu_cb != NULL || (dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) { - zio_nowait(zio); + zio_nowait(buf_set->zio); return (0); } @@ -1072,7 +1100,7 @@ start = gethrtime(); /* Wait for async i/o. */ - err = zio_wait(zio); + err = zio_wait(buf_set->zio); /* Track read overhead when we are in sync context. */ if (start) @@ -1096,77 +1124,6 @@ } /** - * \brief Execute the next I/O chunk for the given DMU context. - * - * \param dmu_ctx The DMU context. - * - * \retval EIO Tried to access blocks beyond the end of the dnode. - * \retval errno Various other errors, primarily ZIO calls. - * \retval 0 Success. - */ -static int -dmu_context_execute_chunk(dmu_context_t *dmu_ctx) -{ - uint64_t io_size, nblks; - dmu_buf_set_t *buf_set = NULL; - int err; - zio_t *zio; - hrtime_t start; - dnode_t *dn = dmu_ctx->dn; - - /* Determine the actual size this I/O set will try to perform. */ - io_size = MIN(dmu_ctx->size, DMU_MAX_ACCESS); - - /* Figure out the number of blocks needed for the buffer set. 
*/ - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_datablkshift) { - int shift = 1ULL << dn->dn_datablkshift; - int blkshift = dn->dn_datablkshift; - nblks = P2ROUNDUP(dmu_ctx->offset + io_size, shift); - nblks -= P2ALIGN(dmu_ctx->offset, shift); - nblks >>= dn->dn_datablkshift; - } else { - if ((dmu_ctx->offset + io_size) > dn->dn_datablksz) { - zfs_panic_recover("zfs: accessing past end of object " - "%llx/%llx (size=%u access=%llu+%llu)", - (longlong_t)dn->dn_objset-> - os_dsl_dataset->ds_object, - (longlong_t)dn->dn_object, dn->dn_datablksz, - (longlong_t)dmu_ctx->offset, - (longlong_t)io_size); - err = EIO; - goto out; - } - nblks = 1; - } - - /* Now that the block count is known, initialize the buffer set. */ - dmu_buf_set_init(dmu_ctx, &buf_set, nblks); - - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - - /* Set up the buffers. */ - err = dmu_buf_set_setup_buffers(buf_set, zio, io_size); - -out: - rw_exit(&dn->dn_struct_rwlock); - - /* Process the I/O requests, if no errors have occurred yet. */ - if (err == 0) - err = dmu_buf_set_process_io(buf_set, zio); - - /* - * Release the initiator hold on the buffer. - * NB: This must occur after struct_rwlock is dropped, otherwise a - * deadlock may occur if someone needs new blocks from the dnode. - */ - if (buf_set != NULL) - dmu_buf_set_rele(buf_set, NULL, err ? B_TRUE : B_FALSE); - - return (err); -} - -/** * \brief Issue the I/O specified in the given DMU context. * * \param dmu_ctx The DMU context. @@ -1180,16 +1137,30 @@ dmu_issue(dmu_context_t *dmu_ctx) { int err = 0; + uint64_t io_size; + dmu_buf_set_t *buf_set; /* While there is work left to do, execute the next chunk. */ - while (dmu_ctx->size > 0 && err == 0) - err = dmu_context_execute_chunk(dmu_ctx); + dprintf("%s(%p) -> buf %p\n", __func__, dmu_ctx, dmu_ctx->data_buf); + while (dmu_ctx->size > 0 && err == 0) { + /* Determine this chunk's size. */ + io_size = MIN(dmu_ctx->size, DMU_MAX_ACCESS); + + /* Initialize the buffer set for this chunk. */ + dprintf("%s(%p@%lu+%lu) chunk %lu\n", __func__, dmu_ctx, + dmu_ctx->offset, dmu_ctx->size, io_size); + err = dmu_buf_set_init(dmu_ctx, &buf_set, io_size); + + /* Process the I/O requests, if the initialization passed. */ + if (err == 0) + err = dmu_buf_set_process_io(buf_set); - /* If a callback is specified, forward any error to it. */ - if (err && dmu_ctx->dmu_cb != NULL) { - atomic_add_int(&dmu_ctx->err, 1); - err = 0; + dmu_buf_set_rele(buf_set, err ? B_TRUE : B_FALSE); } + /* + * At this point, either there is a callback, or all buffer sets + * have finished processing. + */ ASSERT(dmu_ctx->dmu_cb != NULL || dmu_ctx->holds == 1); return (err); @@ -1251,8 +1222,6 @@ /* Initialize including a refcount for the initiator. */ refcount_init(&dmu_ctx->holds, 1); - /* XXX do something more intelligent about state matching? 
*/ - dmu_ctx->db_states = (DB_UNCACHED|DB_CACHED); } /** ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#9 (text) ==== @@ -317,7 +317,8 @@ dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db, + /*buf_set*/NULL); rw_exit(&dn->dn_struct_rwlock); if (err) { @@ -530,7 +531,8 @@ blkoff = P2PHASE(blkid, epb); tochk = MIN(epb - blkoff, nblks); - err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); + err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf, + /*buf_set*/NULL); if (err) { txh->txh_tx->tx_err = err; break; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#10 (text) ==== @@ -1354,7 +1354,7 @@ goto fail; /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db, /*buf_set*/NULL); if (err == 0) dbuf_new_size(db, size, tx); else if (err != ENOENT) @@ -1591,7 +1591,7 @@ if (len < head) head = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, - FTAG, &db) == 0) { + FTAG, &db, /*buf_set*/NULL) == 0) { caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ @@ -1629,7 +1629,7 @@ if (len < tail) tail = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { + TRUE, FTAG, &db, /*buf_set*/NULL) == 0) { /* don't dirty if not on disk and not dirty */ if (!list_is_empty(&db->db_dirty_records) || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { @@ -1880,7 +1880,8 @@ data = dn->dn_phys->dn_blkptr; } else { uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); - error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db, + /*buf_set*/NULL); if (error) { if (error != ENOENT) return (error); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#8 (text) ==== @@ -164,7 +164,8 @@ rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + (db->db_blkid << epbs) + i, TRUE, FTAG, &child, + /*buf_set*/NULL); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; @@ -275,7 +276,8 @@ if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); + err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb, + /*buf_set*/NULL); ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); @@ -352,7 +354,8 @@ if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db); + err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db, + /*buf_set*/NULL); ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#28 (text) ==== @@ -396,8 +396,11 @@ /** List of dirty records for the buffer sorted newest to oldest. */ list_t db_dirty_records; - /** List of DMU contexts (see dmu_context_node_t). */ - list_t db_dmu_contexts; + /** + * List of DMU buffer sets dependent on this dbuf. + * See dmu_context_node_t, the indirect list entry structure used. + */ + list_t db_dmu_buf_sets; /** * Our link on the owner dnodes's dn_dbufs list. 
@@ -464,7 +467,7 @@ dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, void *tag); int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, - void *tag, dmu_buf_impl_t **dbp); + void *tag, dmu_buf_impl_t **dbp, dmu_buf_set_t *buf_set); void dbuf_prefetch(struct dnode *dn, uint64_t blkid); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#24 (text) ==== @@ -223,6 +223,7 @@ * callbacks. */ struct dmu_context; +struct zio; typedef void (*dmu_callback_t)(struct dmu_context *); typedef struct dmu_context { @@ -249,9 +250,6 @@ /** Completed size. */ uint64_t completed_size; - /** The dbuf states when a callback may be called. */ - int db_states; - /** Flags for this block. */ uint32_t flags; #define DMU_CTX_FLAG_READ (1 << 1) @@ -290,16 +288,21 @@ uint64_t offset; /** The size of the I/O. */ uint64_t size; + /** The amount of data remaining to process for this buffer set. */ + uint64_t resid; /** The number of errors that occurred. */ int err; + /** The ZIO associated with this context. */ + struct zio *zio; + /** The set of buffers themselves. */ struct dmu_buf *dbp[0]; } dmu_buf_set_t; -void dmu_buf_set_rele(dmu_buf_set_t *buf_set, dmu_buf_t *vdb, boolean_t err); +void dmu_buf_set_rele(dmu_buf_set_t *buf_set, boolean_t err); int dmu_context_init(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *data_buf, dmu_callback_t dmu_cb, dmu_tx_t *tx, void *tag, uint32_t flags); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c#5 (text) ==== @@ -1260,19 +1260,19 @@ boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); - return; + break; } zio->io_stage = stage; rv = zio_pipeline[highbit(stage) - 1](zio); - if (rv == ZIO_PIPELINE_STOP) { - dmu_thread_context_process(); - return; - } + if (rv == ZIO_PIPELINE_STOP) + break; ASSERT(rv == ZIO_PIPELINE_CONTINUE); } + /* Process any deferred events placed on this thread's list. */ + dmu_thread_context_process(); } /** Change 525787 by willa@willa_repo on 2012/02/07 12:25:18 Get rid of the ugly dmu_context_t.callback_private pointers. Instead, a DMU context initiator should create a struct that puts a dmu_context_t at the beginning, then adds whatever it needs after. Then the callback can simply cast the pointer back to its structure. Make it so for ZVOLs. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#25 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#23 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#25 (text) ==== @@ -265,9 +265,6 @@ /** The number of errors that occurred. */ int err; - /** Private data for the callback. */ - void *callback_private[4]; - } dmu_context_t; typedef struct dmu_buf_set { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#23 (text) ==== @@ -1278,35 +1278,46 @@ } #endif /* sun */ +typedef struct zvol_dmu_state { + /** + * The DMU context associated with this DMU state. Note that this + * must be the first entry in order for the callback to be able to + * discover the zvol_dmu_state_t. 
+ */ + dmu_context_t dmu_ctx; + struct bio *bp; + rl_t *rl; + zvol_state_t *zv; + dmu_tx_t *tx; +} zvol_dmu_state_t; + static void zvol_dmu_cb(dmu_context_t *dmu_ctx) { - struct bio *bp = dmu_ctx->callback_private[0]; - rl_t *rl = dmu_ctx->callback_private[1]; - zvol_state_t *zv = dmu_ctx->callback_private[2]; - dmu_tx_t *tx = dmu_ctx->callback_private[3]; - int sync = (bp->bio_cmd != BIO_READ && - zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + zvol_dmu_state_t *zds = (zvol_dmu_state_t *)dmu_ctx; + int sync = (zds->bp->bio_cmd != BIO_READ && + zds->zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); - bp->bio_completed = dmu_ctx->completed_size; - if (bp->bio_cmd != BIO_READ) { - ASSERT(zv != NULL && tx != NULL); - zvol_log_write(zv, tx, bp->bio_offset, bp->bio_completed, sync); - dmu_tx_commit(tx); + zds->bp->bio_completed = dmu_ctx->completed_size; + if (zds->bp->bio_cmd != BIO_READ) { + ASSERT(zds->zv != NULL && zds->tx != NULL); + zvol_log_write(zds->zv, zds->tx, zds->bp->bio_offset, + zds->bp->bio_completed, sync); + dmu_tx_commit(zds->tx); if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); + zil_commit(zds->zv->zv_zilog, ZVOL_OBJ); } - if (bp->bio_completed < bp->bio_length) { - if (dmu_ctx->offset > zv->zv_volsize) - bp->bio_error = EINVAL; + if (zds->bp->bio_completed < zds->bp->bio_length) { + if (dmu_ctx->offset > zds->zv->zv_volsize) + zds->bp->bio_error = EINVAL; } else - bp->bio_error = (dmu_ctx->err == 0) ? 0 : EIO; - zfs_range_unlock(rl); - if (bp->bio_to != NULL) - g_io_deliver(bp, 0); + zds->bp->bio_error = (dmu_ctx->err == 0) ? 0 : EIO; + zfs_range_unlock(zds->rl); + if (zds->bp->bio_to != NULL) + g_io_deliver(zds->bp, 0); else - bp->bio_done(bp); - kmem_free(dmu_ctx, sizeof(dmu_context_t)); + zds->bp->bio_done(zds->bp); + kmem_free(zds, sizeof(zvol_dmu_state_t)); } static void @@ -1319,7 +1330,7 @@ int error = 0; boolean_t doread = (bp->bio_cmd == BIO_READ); dmu_tx_t *tx = NULL; - dmu_context_t *dmu_ctx = NULL; + zvol_dmu_state_t *zds = NULL; uint32_t dmu_flags = 0; /* XXX KDM may be able to consolidate this into the non-GEOM case. */ @@ -1386,20 +1397,20 @@ dmu_tx_abort(tx); } - dmu_ctx = kmem_zalloc(sizeof(dmu_context_t), KM_SLEEP); - error = dmu_context_init(dmu_ctx, zv->zv_objset, ZVOL_OBJ, off, + zds = kmem_zalloc(sizeof(zvol_dmu_state_t), KM_SLEEP); + error = dmu_context_init(&zds->dmu_ctx, zv->zv_objset, ZVOL_OBJ, off, bp->bio_length, bp->bio_data, zvol_dmu_cb, tx, FTAG, dmu_flags); /* All error handling is done in the callback. */ - dmu_ctx->callback_private[0] = bp; - dmu_ctx->callback_private[1] = rl; - dmu_ctx->callback_private[2] = zv; - dmu_ctx->callback_private[3] = tx; + zds->bp = bp; + zds->rl = rl; + zds->zv = zv; + zds->tx = tx; if (error == 0) { /* Pump primed, issue the I/O to the DMU. */ - (void) dmu_issue(dmu_ctx); + (void) dmu_issue(&zds->dmu_ctx); } else - dmu_ctx->err = error; - dmu_context_rele(dmu_ctx); + zds->dmu_ctx.err = error; + dmu_context_rele(&zds->dmu_ctx); } static void Change 525889 by willa@willa_repo on 2012/02/07 14:16:44 Call the taskqueue callbacks outside of the taskqueue critical section. Revert change 525786 such that it now calls malloc M_WAITOK to ensure that the TLS memory gets populated, even if it has to wait a bit. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#50 edit ... //depot/branches/redline/projects/cow/sys/kern/subr_taskqueue.c#4 edit Differences ... 
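For reference, the ordering this change establishes reduces to the following sketch (illustrative code only; "struct queue", "queue_thread" and the callback names are hypothetical, not part of the change). Lifecycle hooks that may sleep must run outside the queue mutex, which is what lets the INIT hook use a waiting allocation:

	#include <pthread.h>

	struct queue {
		pthread_mutex_t	lock;
		int		active;
		void		(*init_cb)(void);	/* may sleep (allocates) */
		void		(*shutdown_cb)(void);	/* may sleep (frees) */
	};

	static void *
	queue_thread(void *arg)
	{
		struct queue *q = arg;

		/* Run the INIT hook before taking the lock, so it may sleep. */
		q->init_cb();
		pthread_mutex_lock(&q->lock);
		while (q->active) {
			/* ... drain queued tasks under the lock ... */
			/* (simplified: the real loop sleeps waiting for work) */
		}
		pthread_mutex_unlock(&q->lock);
		/* Run the SHUTDOWN hook after dropping the lock. */
		q->shutdown_cb();
		return (NULL);
	}

With the hooks moved out of the critical section, the TLS setup in the dmu.c hunk below can safely block in kmem_zalloc(..., KM_SLEEP) instead of using KM_NOSLEEP and risking a NULL return.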
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#50 (text) ==== @@ -839,7 +839,7 @@ #endif /* Called with taskqueue mutex held. */ - dcs = kmem_zalloc(sizeof(dmu_cb_state_t), KM_NOSLEEP); + dcs = kmem_zalloc(sizeof(dmu_cb_state_t), KM_SLEEP); list_create(&dcs->io_list, sizeof(dmu_context_node_t), offsetof(dmu_context_node_t, dcn_link)); return tsd_set(zfs_async_io_key, dcs); ==== //depot/branches/redline/projects/cow/sys/kern/subr_taskqueue.c#4 (text) ==== @@ -515,9 +515,9 @@ tqp = arg; tq = *tqp; + TASKQUEUE_RUN_CALLBACK(tq, TASKQUEUE_CALLBACK_TYPE_INIT); TQ_LOCK(tq); - TASKQUEUE_RUN_CALLBACK(tq, TASKQUEUE_CALLBACK_TYPE_INIT); while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) { taskqueue_run_locked(tq); /* @@ -531,12 +531,12 @@ } taskqueue_run_locked(tq); - TASKQUEUE_RUN_CALLBACK(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); - /* rendezvous with thread that asked us to terminate */ tq->tq_tcount--; wakeup_one(tq->tq_threads); TQ_UNLOCK(tq); + + TASKQUEUE_RUN_CALLBACK(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); kthread_exit(); } Change 525890 by kenm@ken.spectrabsd5 on 2012/02/07 14:40:27 Fix a stack garbage issue that was causing multiple opens to fail. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#24 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#24 (text) ==== @@ -897,6 +897,8 @@ { int err; + err = 0; + if (zv->zv_total_opens == 0) err = zvol_first_open(zv); if (err) { @@ -939,7 +941,6 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; - int err = 0; if (MUTEX_HELD(&spa_namespace_lock)) { /* @@ -964,7 +965,6 @@ zvol_open(struct cdev *dev, int flags, int fmt, struct thread *td) { zvol_state_t *zv; - int err = 0; if (MUTEX_HELD(&spa_namespace_lock)) { /* Change 525892 by willa@willa_repo on 2012/02/07 15:29:45 Fix non-char * buffer handling in dmu_context_seek(). These cases do not require a check for dn->dn_maxblkid. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#51 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#26 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#51 (text) ==== @@ -1282,7 +1282,8 @@ * the first block. If we ever do the tail block optimization, * we will need to handle that here as well. */ - if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) && dn->dn_maxblkid == 0) { + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) && dn->dn_maxblkid == 0 && + DMU_CTX_BUF_IS_CHAR(dmu_ctx)) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)data_buf + newsz, size - newsz); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#26 (text) ==== @@ -262,6 +262,9 @@ #define DMU_CTX_WRITER_FLAGS (DMU_CTX_FLAG_SUN_PAGES|DMU_CTX_FLAG_NOFILL) #define DMU_CTX_READER_FLAGS (DMU_CTX_FLAG_PREFETCH) +#define DMU_CTX_BUF_IS_CHAR(dmu_ctx) \ + (((dmu_ctx)->flags & (DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_SUN_PAGES)) == 0) + /** The number of errors that occurred. */ int err; Change 525894 by kenm@ken.spectrabsd5 on 2012/02/07 16:11:12 Implement zvol device renaming for the standard block device. Set the D_DISK flag in our cdevsw. Affected files ... ... 
//depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#25 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#25 (text) ==== @@ -96,7 +96,7 @@ struct cdevsw zfs_zvol_cdevsw = { .d_version = D_VERSION, - .d_flags = 0, + .d_flags = D_DISK, .d_name = "zvol", .d_open = zvol_open, .d_close = zvol_close, @@ -2388,16 +2388,12 @@ return (0); } -/* - * XXX KDM - * These are used by ZFS to rename devices. Need to port them to straight - * devfs routines. - */ static void zvol_rename_minor(struct g_geom *gp, const char *newname) { struct g_provider *pp; zvol_state_t *zv; + struct cdev *old_dev; ASSERT(MUTEX_HELD(&spa_namespace_lock)); g_topology_assert(); @@ -2417,6 +2413,19 @@ zv->zv_provider = pp; strlcpy(zv->zv_name, newname, sizeof(zv->zv_name)); g_error_provider(pp, 0); + + /* + * We're piggybacking on the GEOM code to rename standard block + * devices as well. + */ + old_dev = zv->zv_dev; + + zv->zv_dev = make_dev(&zfs_zvol_cdevsw, /*unit*/ 0, UID_ROOT, + GID_OPERATOR, 0600, "%s/%s", ZVOL_DRIVER,newname); + zv->zv_dev->si_drv1 = zv; + + destroy_dev_sched(old_dev); + } void Change 525898 by willa@willa_repo on 2012/02/07 19:05:18 Fix a refcounting bug in zvol. Add D_TRACKCLOSE to the zvol cdevsw.d_flags, and decrement zv_total_opens instead of just setting it to 0. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#26 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#26 (text) ==== @@ -96,7 +96,7 @@ struct cdevsw zfs_zvol_cdevsw = { .d_version = D_VERSION, - .d_flags = D_DISK, + .d_flags = D_DISK|D_TRACKCLOSE, .d_name = "zvol", .d_open = zvol_open, .d_close = zvol_close, @@ -923,7 +923,6 @@ } #endif - /* XXX KDM need to figure out whether to track the count */ zv->zv_total_opens++; mutex_exit(&spa_namespace_lock); @@ -1000,12 +999,13 @@ */ ASSERT(zv->zv_total_opens != 0); - zv->zv_total_opens = 0; + zv->zv_total_opens--; /* * You may get multiple opens, but only one close. */ - zvol_last_close(zv); + if (zv->zv_total_opens == 0) + zvol_last_close(zv); mutex_exit(&spa_namespace_lock); Change 525899 by willa@willa_repo on 2012/02/07 19:05:55 Set the PREFETCH flag for zfs_read calls into dmu_issue(). Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#10 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#10 (text) ==== @@ -660,7 +660,7 @@ error = dmu_context_init(&dmu_ctx, os, zp->z_id, uio->uio_loffset, n, uio, /*dmu_cb*/NULL, /*tx*/NULL, FTAG, - DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO); + DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_PREFETCH); if (error) goto out; Change 525900 by kenm@ken.spectrabsd5 on 2012/02/07 22:28:02 Another tweak to the open/close path for zvols. Make sure we track the count from the GEOM case, since we can get multiple opens and closes via the same access call. Specify a count of 1 for opens and closes in the non-GEOM case. This, combined with the previous change, should allow us to use the GEOM and non-GEOM devices simultaneously. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#27 edit Differences ... 
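The counted open/close protocol described above boils down to the following sketch (hypothetical "vol_first_open"/"vol_last_close" helpers, locking omitted; not code from this change). Each open adds "count" references, each close removes them, and only the transitions to and from zero trigger setup and teardown:

	struct vol;				/* opaque for this sketch */
	extern int vol_first_open(struct vol *);
	extern void vol_last_close(struct vol *);

	static int total_opens;			/* assume an external lock */

	static int
	vol_open(struct vol *v, int count)
	{
		int err = 0;

		if (total_opens == 0)
			err = vol_first_open(v);
		if (err == 0)
			total_opens += count;
		return (err);
	}

	static void
	vol_close(struct vol *v, int count)
	{
		total_opens -= count;
		if (total_opens == 0)
			vol_last_close(v);
	}

The diff below applies this shape to zvol_common_open() and zvol_common_close(): the GEOM entry points pass their access count through, while the cdev entry points pass a count of 1.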
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#27 (text) ==== @@ -893,7 +893,7 @@ * Assumptions: zv != NULL and spa_namespace_lock is held. */ static int -zvol_common_open(zvol_state_t *zv, int flag) +zvol_common_open(zvol_state_t *zv, int flag, int count) { int err; @@ -923,7 +923,7 @@ } #endif - zv->zv_total_opens++; + zv->zv_total_opens += count; mutex_exit(&spa_namespace_lock); return (err); @@ -932,7 +932,6 @@ zvol_last_close(zv); mutex_exit(&spa_namespace_lock); return (err); - } /*ARGSUSED*/ @@ -957,7 +956,7 @@ return (ENXIO); } - return (zvol_common_open(zv, flag)); + return (zvol_common_open(zv, flag, count)); } static int @@ -981,11 +980,11 @@ return (ENXIO); } - return (zvol_common_open(zv, flags)); + return (zvol_common_open(zv, flags, /*count*/ 1)); } static int -zvol_common_close(zvol_state_t *zv) +zvol_common_close(zvol_state_t *zv, int count) { if (zv->zv_flags & ZVOL_EXCL) { @@ -999,10 +998,11 @@ */ ASSERT(zv->zv_total_opens != 0); - zv->zv_total_opens--; + zv->zv_total_opens -= count; /* - * You may get multiple opens, but only one close. + * We track closes in the standard and GEOM cases, so we should get + * a close (or close count) for every open. */ if (zv->zv_total_opens == 0) zvol_last_close(zv); @@ -1026,7 +1026,7 @@ /* XXX KDM should we just return 0 instead? */ return (ENXIO); } - return (zvol_common_close(zv)); + return (zvol_common_close(zv, count)); } static int @@ -1043,7 +1043,7 @@ return (ENXIO); } - return (zvol_common_close(zv)); + return (zvol_common_close(zv, /*count*/ 1)); } static void Change 526130 by willa@willa_repo on 2012/02/09 13:17:49 Change zvol block device to use uio calls directly. The goal is to avoid physio overhead, to make its calls more directly comparable to file I/O. This commit, as it stands, panics on writes to zvols, but I am checking it in to avoid losing work. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c: - Change dmu_context_t.dmu_cb to context_cb, and add new callbacks: - buf_set_transfer_cb: Called to transfer a buffer. Used to allow callers to do particular things for every buffer set completion. - buf_transfer_cb: Called to transfer a buffer set block. Used to distinguish between reads and particular write cases. - move_cb: Called to actually copy data. - Refactor most of the DMU buffer transfer API. Split up the different pieces into separate functions which are called via function pointers initialized by dmu_context_init*() or by calling the new dmu_context_set_*() functions: - dmu_context_set_context_cb(): Set context_cb. - dmu_context_set_buf_set_transfer_cb(): Set buf_set_transfer_cb. - dmu_context_set_buf_transfer_cb(): Set buf_transfer_cb. - dmu_context_set_dmu_tx(): Set the transaction to be used for all chunks written. If specified, it is assumed the caller will handle committing the transaction, etc. - Make it possible to have dmu_issue() create new transactions for every buf_set. To do so, the caller simply needs to be a writer and not specify a transaction for the DMU context. - Modify the write cases to ensure that a DMU TX, if created inside dmu_issue(), is handled in all the customary ways. - Remove the tx & dmu_cb arguments from dmu_context_init*(), they will now be settable via the above inline functions. 
- dmu_buf_set_transfer() split into the following functions: - dmu_buf_set_read(): Buffer set read callback. - dmu_buf_set_write_no_tx(): Buffer set write callback, no TX. - dmu_buf_set_write_tx(): Buffer set write callback with TX commit. This simply calls dmu_buf_set_write_no_tx() then dmu_tx_commit(). - dmu_buf_set_nofill(): Just call dmu_buf_will_not_fill(). - dmu_buf_{read,write}_{uio,char}(): Actually copy a buffer to or from a UIO or char*. - dmu_buf_read_xuio(): xuio handler (not used in FreeBSD). - dmu_buf_write_pages(): Sun pages handler (not used in FreeBSD). - Set dmu_context_t.length to the originally requested size, so that a callback can see whether all data was transferred. - Add a new flag, DMU_CTX_FLAG_ASYNC, to dmu_context_t. This flag allows callers to specify a callback even for synchronous callers. In that case, the call will wait for ZIOs to finish executing and dbufs to finish processing before returning, but will still call the callback. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h: - Create new functions, zvol_freebsd_{read,write}, which are now the zvol cdevsw.d_{read,write} callbacks. The originally implemented zvol_{read,write} callbacks only work for Solaris, but the only difference with the FreeBSD callbacks is how they obtain the zvol context. Both provide an UIO and call into the DMU. - Refactor the zvol_common_strategy() call into the DMU to a new function, zvol_dmu_common(), so that it can also be used by the zvol_{,freebsd_}{read,write} calls. All calls into the zvol layer, that involve the DMU, whether they go through GEOM or block, now go through this function. - zvol_dmu_common() specifies zvol_dmu_write_buf() as a buf_transfer_cb, so as to override the default behavior by performing a call to zvol_log_write() just before calling dmu_tx_commit(). In this way, chunking transactions inside the DMU works the same way it did when the zvol layer did the chunking. - zvol_dmu_common() and zvol_dmu_done() use DMU_CTX_FLAG_ASYNC to determine which of them frees the zvol_dmu_state_t. - Misc. cleanup in zvol_common_strategy(). *: Sprinkle in a variety of additional asserts. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#52 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#27 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h#2 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#11 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#28 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#52 (text) ==== @@ -637,47 +637,86 @@ } static void -dmu_buf_move_uio(dmu_context_t *dmu_ctx, dmu_buf_t *db, int off, int sz) +dmu_buf_read_xuio(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) { #ifdef _KERNEL - xuio_t *xuio = NULL; - uio_t *uio; - enum uio_rw dir; + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + uio_t *uio = (uio_t *)dmu_ctx->data_buf; + xuio_t *xuio = (xuio_t *)uio; + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - /* Initialize the state for this I/O. */ - uio = (uio_t *)dmu_ctx->data_buf; - dir = (dmu_ctx->flags & DMU_CTX_FLAG_READ) ? 
UIO_READ : UIO_WRITE; + if (dmu_xuio_add(xuio, abuf, off, sz) == 0) { + uio->uio_resid -= sz; + uio->uio_loffset += sz; + } -#ifdef UIO_XUIO - if (uio->uio_extflg == UIO_XUIO) - xuio = (xuio_t *)uio; + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); #endif +} - if (xuio && dir == UIO_READ) { - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - arc_buf_t *dbuf_abuf = dbi->db_buf; - arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - int err = dmu_xuio_add(xuio, abuf, off, sz); - if (!err) { - uio->uio_resid -= sz; - uio->uio_loffset += sz; - } +static void +dmu_buf_read_uio(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ +#ifdef _KERNEL + uio_t *uio = (uio_t *)buf_set->dmu_ctx->data_buf; + struct iovec *iov = uio->uio_iov; + dprintf("%s: uio iov=%p iovcnt=%d base %p len %lu\n", + __func__, iov, uio->uio_iovcnt, iov->iov_base, + iov->iov_len); + if (uiomove((char *)db->db_data + off, sz, UIO_READ, uio)) + buf_set->err += 1; +#endif +} +static void +dmu_buf_write_uio(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ +#ifdef _KERNEL + uio_t *uio = (uio_t *)buf_set->dmu_ctx->data_buf; + struct iovec *iov = uio->uio_iov; + dprintf("%s: uio iov=%p iovcnt=%d base %p len %lu\n", + __func__, iov, uio->uio_iovcnt, iov->iov_base, + iov->iov_len); + if (uiomove((char *)db->db_data + off, sz, UIO_WRITE, uio)) + buf_set->err += 1; +#endif +} - if (abuf == dbuf_abuf) - XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); - else - XUIOSTAT_BUMP(xuiostat_rbuf_copied); - } else - uiomove((char *)db->db_data + off, sz, dir, uio); -#endif +static void +dmu_buf_read_char(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ + char *data = (char *)buf_set->dmu_ctx->data_buf + db->db_offset - + buf_set->dmu_ctx->start + off; + dprintf("%s(set=%p, db=%p, off=%lu, sz=%lu) db_data=%p data=%p\n", + __func__, buf_set, db, off, sz, db->db_data + off, data); + bcopy((char *)db->db_data + off, data, sz); +} +static void +dmu_buf_write_char(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ + char *data = (char *)buf_set->dmu_ctx->data_buf + db->db_offset - + buf_set->dmu_ctx->start + off; + dprintf("%s(set=%p, db=%p, off=%lu, sz=%lu) data=%p db_data=%p\n", + __func__, buf_set, db, off, sz, data, db->db_data + off); + bcopy(data, (char *)db->db_data + off, sz); } static void -dmu_buf_move_pages(dmu_context_t *dmu_ctx, dmu_buf_t *db, int off, int sz) +dmu_buf_write_pages(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) { #ifdef sun int copied; - page_t *pp = dmu_context->data_buf; + page_t *pp = (page_t *)dmu_context->data_buf; for (copied = 0; copied < sz; copied += PAGESIZE) { caddr_t va; @@ -695,26 +734,45 @@ } static void -dmu_buf_move(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, uint64_t sz) +dmu_buf_set_nofill(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ + dmu_tx_t *tx = DMU_BUF_SET_TX(buf_set); + dmu_buf_will_not_fill(db, tx); + /* No need to do any more here. 
*/ +} + +void +dmu_buf_set_write_no_tx(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ + dmu_tx_t *tx = DMU_BUF_SET_TX(buf_set); + + if (sz == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty_range(db, tx, off, sz); + buf_set->dmu_ctx->move_cb(buf_set, db, off, sz); + dmu_buf_fill_done(db, tx); +} + +static void +dmu_buf_set_write_tx(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) { - dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + dmu_buf_set_write_no_tx(buf_set, db, off, sz); + dmu_tx_commit(buf_set->tx); +} - if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) - dmu_buf_move_uio(dmu_ctx, db, off, sz); - else if (dmu_ctx->flags & DMU_CTX_FLAG_SUN_PAGES) - dmu_buf_move_pages(dmu_ctx, db, off, sz); - else { - uint64_t dataoff = db->db_offset - dmu_ctx->start + off; - char *data = (char *)dmu_ctx->data_buf + dataoff; - if (dmu_ctx->flags & DMU_CTX_FLAG_READ) - bcopy((char *)db->db_data + off, data, sz); - else - bcopy(data, (char *)db->db_data + off, sz); - } +static void +dmu_buf_set_read(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ + buf_set->dmu_ctx->move_cb(buf_set, db, off, sz); } /** - * \brief Perform a buffer set read for a char * target buffer. + * \brief Perform a buffer set I/O transfer. * * \param buf_set Buffer set to read. */ @@ -726,42 +784,18 @@ dmu_tx_t *tx = dmu_ctx->tx; int i; - /* Having a transaction and being a reader is not supported. */ - ASSERT(tx != NULL || (dmu_ctx->flags & DMU_CTX_FLAG_READ)); - /* Initialize the current state. */ size = buf_set->size; offset = buf_set->offset; /* Perform the I/O copy, one buffer at a time. */ for (i = 0; i < buf_set->count; i++) { - uint64_t off, sz; dmu_buf_t *db = buf_set->dbp[i]; + uint64_t off = offset - db->db_offset; + uint64_t sz = MIN(db->db_size - off, size); - if (dmu_ctx->flags & DMU_CTX_FLAG_NOFILL) { - dmu_buf_will_not_fill(db, tx); - /* No need to do any more here. */ - continue; - } - ASSERT(size > 0); - - off = offset - db->db_offset; - sz = (int)MIN(db->db_size - off, size); - - /* Write case: Notify dbuf that it will be dirtied. */ - if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) { - if (sz == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty_range(db, tx, off, sz); - } - - dmu_buf_move(buf_set, db, off, sz); - - if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) - dmu_buf_fill_done(db, tx); - + dmu_ctx->buf_transfer_cb(buf_set, db, off, sz); offset += sz; size -= sz; } @@ -789,8 +823,8 @@ dnode_rele(dmu_ctx->dn, dmu_ctx->tag); /* At this point, there are no buffer sets left. Call back. */ - if (dmu_ctx->dmu_cb != NULL) - dmu_ctx->dmu_cb(dmu_ctx); + if (dmu_ctx->context_cb != NULL) + dmu_ctx->context_cb(dmu_ctx); } /** @@ -806,9 +840,12 @@ /* Only perform I/O if no errors occurred for the buffer set. */ if (buf_set->err == 0) { - dmu_buf_set_transfer(buf_set); - atomic_add_64(&dmu_ctx->completed_size, buf_set->size); - } else + dmu_ctx->buf_set_transfer_cb(buf_set); + if (buf_set->err == 0) + atomic_add_64(&dmu_ctx->completed_size, buf_set->size); + } + /* Check again in case transfer causes errors. */ + if (buf_set->err) atomic_add_int(&dmu_ctx->err, buf_set->err); for (i = 0; i < buf_set->count; i++) { @@ -897,6 +934,7 @@ void dmu_buf_set_rele(dmu_buf_set_t *buf_set, boolean_t err) { + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; /* Report an error, if any. 
*/ if (err) @@ -907,13 +945,12 @@ if (refcount_release(&buf_set->holds)) { dmu_cb_state_t *dcs = tsd_get(zfs_async_io_key); - /* XXX Without a callback, the buffer must be finished now. */ - if (dcs != NULL && buf_set->dmu_ctx->dmu_cb != NULL) { + if (dcs != NULL && (dmu_ctx->flags & DMU_CTX_FLAG_ASYNC)) { dmu_context_node_add(&dcs->io_list, buf_set); } else { /* * The current thread doesn't have anything - * registered for this TSD, so it must not handle + * registered in its TSD, so it must not handle * queued delivery. Dispatch this set now. */ dmu_buf_set_complete(buf_set); @@ -1000,6 +1037,7 @@ uint64_t size) { dmu_buf_set_t *buf_set; + dmu_tx_t *tx = NULL; size_t set_size; int err, nblks; dnode_t *dn = dmu_ctx->dn; @@ -1030,6 +1068,15 @@ nblks = 1; } + /* Create a transaction for writes, if needed. */ + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0 && dmu_ctx->tx == NULL) { + tx = dmu_tx_create(dn->dn_objset); + dmu_tx_hold_write(tx, dn->dn_object, dmu_ctx->offset, size); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + } + /* Create the new buffer set. */ set_size = sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *); buf_set = kmem_zalloc(set_size, KM_SLEEP); @@ -1045,6 +1092,8 @@ buf_set->offset = dmu_ctx->offset; buf_set->count = nblks; buf_set->blocks_allocated = nblks; + buf_set->tx = tx; + /* Include a refcount for the initiator. */ if (dmu_ctx->flags & DMU_CTX_FLAG_READ) refcount_init(&buf_set->holds, nblks + 1); @@ -1052,6 +1101,8 @@ /* For writes, dbufs never need to call us back. */ refcount_init(&buf_set->holds, 1); buf_set->dmu_ctx = dmu_ctx; + /* Either we're a reader or we have a transaction somewhere. */ + ASSERT((dmu_ctx->flags & DMU_CTX_FLAG_READ) || DMU_BUF_SET_TX(buf_set)); buf_set->zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); *buf_set_p = buf_set; @@ -1060,6 +1111,8 @@ err = dmu_buf_set_setup_buffers(buf_set); out: + if (err && tx != NULL) + dmu_tx_abort(tx); rw_exit(&dn->dn_struct_rwlock); return (err); } @@ -1082,12 +1135,10 @@ dnode_t *dn = dmu_ctx->dn; /* - * If a callback is specified, issue the I/O's without waiting. - * The dbufs will be responsible for cleaning up. - * - * Or, if this is a write, we're done. + * If the I/O is asynchronous, issue the I/O's without waiting. + * Writes do not need to wait for any ZIOs. */ - if (dmu_ctx->dmu_cb != NULL || + if ((dmu_ctx->flags & DMU_CTX_FLAG_ASYNC) || (dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) { zio_nowait(buf_set->zio); return (0); @@ -1140,8 +1191,13 @@ uint64_t io_size; dmu_buf_set_t *buf_set; + /* If this context is async, it must have a context callback. */ + ASSERT((dmu_ctx->flags & DMU_CTX_FLAG_ASYNC) == 0 || + dmu_ctx->context_cb != NULL); + /* While there is work left to do, execute the next chunk. */ - dprintf("%s(%p) -> buf %p\n", __func__, dmu_ctx, dmu_ctx->data_buf); + dprintf("%s(%p) -> buf %p off %lu sz %lu\n", __func__, dmu_ctx, + dmu_ctx->data_buf, dmu_ctx->offset, dmu_ctx->size); while (dmu_ctx->size > 0 && err == 0) { /* Determine this chunk's size. */ io_size = MIN(dmu_ctx->size, DMU_MAX_ACCESS); @@ -1158,10 +1214,10 @@ dmu_buf_set_rele(buf_set, err ? B_TRUE : B_FALSE); } /* - * At this point, either there is a callback, or all buffer sets + * At this point, either this I/O is async, or all buffer sets * have finished processing. 
 */
-	ASSERT(dmu_ctx->dmu_cb != NULL || dmu_ctx->holds == 1);
+	ASSERT((dmu_ctx->flags & DMU_CTX_FLAG_ASYNC) || dmu_ctx->holds == 1);
 
 	return (err);
 }
@@ -1173,21 +1229,17 @@
  * \param dn		The held dnode to associate with the context.
  * \param size		Size of the I/O to be performed.
  * \param offset	Offset into the dnode to perform the I/O.
- * \param dmu_cb	Function to call back on completion; may be NULL.
  * \param data_buf	Data buffer to perform I/O transfers with.
- * \param tx		DMU transaction to use, if applicable.
  * \param tag		Hold tag to use.
  * \param flags		DMU context flags.
  *
  * \note	The dnode must not be NULL.
  * \note	The dnode must be held, unless the DMU_CTX_FLAG_NO_HOLD
  *		flag is specified.
- * \note	The context may not specify a read and a transaction.
  */
 void
 dmu_context_init_dnode(dmu_context_t *dmu_ctx, struct dnode *dn,
-    uint64_t offset, uint64_t size, void *data_buf, dmu_callback_t dmu_cb,
-    dmu_tx_t *tx, void *tag, uint32_t flags)
+    uint64_t offset, uint64_t size, void *data_buf, void *tag, uint32_t flags)
 {
 	boolean_t reader = (flags & DMU_CTX_FLAG_READ) != 0;
 
@@ -1203,8 +1255,6 @@
 	ASSERT(dn != NULL);
 	ASSERT(!refcount_is_zero(&dn->dn_holds) ||
 	    (flags & DMU_CTX_FLAG_NO_HOLD));
-	/* Reads and DMU transactions are (currently) mutually exclusive. */
-	ASSERT(!reader ^ (tx == NULL));
 	/* Make sure the flags are compatible with the I/O type. */
 	ASSERT(reader || ((flags & DMU_CTX_READER_FLAGS) == 0));
 	ASSERT(!reader || ((flags & DMU_CTX_WRITER_FLAGS) == 0));
@@ -1214,11 +1264,36 @@
 	/* All set, actually initialize the context! */
 	bzero(dmu_ctx, sizeof(dmu_context_t));
 	dmu_ctx->dn = dn;
+	dmu_ctx->length = size;
 	dmu_context_seek(dmu_ctx, offset, size, data_buf);
-	dmu_ctx->dmu_cb = dmu_cb;
-	dmu_ctx->tx = tx;
 	dmu_ctx->tag = tag;
 	dmu_ctx->flags = flags;
+	/* Initialize default I/O callbacks. */
+	dmu_ctx->buf_set_transfer_cb = dmu_buf_set_transfer;
+	if ((dmu_ctx->flags & DMU_CTX_FLAG_NOFILL) == 0) {
+		dmu_ctx->buf_transfer_cb = reader ? dmu_buf_set_read :
+		    dmu_buf_set_write_tx;
+	} else
+		dmu_ctx->buf_transfer_cb = dmu_buf_set_nofill;
+	if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) {
+#ifdef UIO_XUIO
+		uio_t *uio = (uio_t *)dmu_ctx->data_buf;
+		if (uio->uio_extflg == UIO_XUIO) {
+			ASSERT(reader);
+			dmu_ctx->move_cb = dmu_buf_read_xuio;
+		} else
+#endif
+		{
+			dmu_ctx->move_cb = reader ? dmu_buf_read_uio :
+			    dmu_buf_write_uio;
+		}
+	} else if (dmu_ctx->flags & DMU_CTX_FLAG_SUN_PAGES) {
+		/* implies writer */
+		dmu_ctx->move_cb = dmu_buf_write_pages;
+	} else {
+		dmu_ctx->move_cb = reader ? dmu_buf_read_char :
+		    dmu_buf_write_char;
+	}
 
 	/* Initialize including a refcount for the initiator. */
 	refcount_init(&dmu_ctx->holds, 1);
@@ -1240,8 +1318,7 @@
  */
 int
 dmu_context_init(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object,
-    uint64_t offset, uint64_t size, void *data_buf, dmu_callback_t dmu_cb,
-    dmu_tx_t *tx, void *tag, uint32_t flags)
+    uint64_t offset, uint64_t size, void *data_buf, void *tag, uint32_t flags)
 {
 	dnode_t *dn = NULL;
 	int err;
@@ -1250,8 +1327,7 @@
 	if (err)
 		return (err);
 
-	dmu_context_init_dnode(dmu_ctx, dn, offset, size, data_buf, dmu_cb,
-	    tx, tag, flags);
+	dmu_context_init_dnode(dmu_ctx, dn, offset, size, data_buf, tag, flags);
 
 	return (0);
 }
@@ -1269,13 +1345,19 @@
 {
 	dnode_t *dn = dmu_ctx->dn;
 
-	/* Make sure UIO callers pass in the correct offset.
*/ -#if defined(_KERNEL) && defined(ZFS_DEBUG) +#ifdef ZFS_DEBUG +#ifdef _KERNEL if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { uio_t *uio = (uio_t *)data_buf; + /* Make sure UIO callers pass in the correct offset. */ ASSERT(uio->uio_loffset == offset); } #endif + /* Make sure non-char * pointers stay the same. */ + if (!DMU_CTX_BUF_IS_CHAR(dmu_ctx)) + ASSERT(dmu_ctx->data_buf == NULL || + dmu_ctx->data_buf == data_buf); +#endif /* ZFS_DEBUG */ /* * Deal with odd block sizes, where there can't be data past @@ -1303,7 +1385,7 @@ dmu_context_t dmu_ctx; err = dmu_context_init(&dmu_ctx, os, object, offset, size, data_buf, - /*dmu_cb*/NULL, /*tx*/NULL, FTAG, flags|DMU_CTX_FLAG_READ); + FTAG, flags|DMU_CTX_FLAG_READ); if (err) return (err); @@ -1321,7 +1403,8 @@ dmu_context_t dmu_ctx; dmu_context_init(&dmu_ctx, os, object, offset, size, data_bufp, - /*dmu_cb*/NULL, tx, FTAG, /*flags*/0); + FTAG, /*flags*/0); + dmu_context_set_dmu_tx(&dmu_ctx, tx); (void) dmu_issue(&dmu_ctx); dmu_context_rele(&dmu_ctx); @@ -1339,10 +1422,11 @@ return (0); err = dmu_context_init(&dmu_ctx, os, object, offset, size, - /*data_buf*/NULL, /*dmu_cb*/NULL, tx, FTAG, flags); + /*data_buf*/NULL, FTAG, flags); if (err) return (err); + dmu_context_set_dmu_tx(&dmu_ctx, tx); err = dmu_issue(&dmu_ctx); dmu_context_rele(&dmu_ctx); @@ -1481,7 +1565,7 @@ int err; err = dmu_context_init(&dmu_ctx, os, object, uio->uio_loffset, size, - uio, /*dmu_cb*/NULL, /*tx*/NULL, FTAG, dmu_flags); + uio, FTAG, dmu_flags); if (err) return (err); @@ -1506,7 +1590,8 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_context_init_dnode(&dmu_ctx, dn, uio->uio_loffset, size, - uio, /*dmu_cb*/NULL, tx, FTAG, flags); + uio, FTAG, flags); + dmu_context_set_dmu_tx(&dmu_ctx, tx); err = dmu_issue(&dmu_ctx); dmu_context_rele(&dmu_ctx); DB_DNODE_EXIT(db); @@ -1526,10 +1611,11 @@ return (0); err = dmu_context_init(&dmu_ctx, os, object, uio->uio_loffset, size, - uio, /*dmu_cb*/NULL, tx, FTAG, dmu_flags); + uio, FTAG, dmu_flags); if (err) return (err); + dmu_context_set_dmu_tx(&dmu_ctx, tx); err = dmu_issue(&dmu_ctx); dmu_context_rele(&dmu_ctx); return (err); @@ -1548,10 +1634,11 @@ return (0); err = dmu_context_init(&dmu_ctx, os, object, offset, size, pp, - /*dmu_cb*/NULL, tx, FTAG, dmu_flags); + FTAG, dmu_flags); if (err) return (err); + dmu_context_set_dmu_tx(&dmu_ctx, tx); err = dmu_issue(&dmu_ctx); dmu_context_rele(&dmu_ctx); return (err); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#27 (text) ==== @@ -41,6 +41,7 @@ * dmu_spa.h. */ +#include #include #include #include @@ -223,12 +224,17 @@ * callbacks. */ struct dmu_context; +struct dmu_buf_set; struct zio; -typedef void (*dmu_callback_t)(struct dmu_context *); +typedef void (*dmu_context_callback_t)(struct dmu_context *); +typedef void (*dmu_buf_set_callback_t)(struct dmu_buf_set *); +typedef void (*dmu_buf_transfer_callback_t)(struct dmu_buf_set *, dmu_buf_t *, + uint64_t, uint64_t); typedef struct dmu_context { /** The primary data associated with this context. */ + uint64_t length; /**< Requested total I/O length. */ uint64_t size; /**< Remaining bytes to process. */ uint64_t start; /**< Starting block offset. */ uint64_t offset; /**< Current block offset. */ @@ -244,8 +250,20 @@ /** The tag used for this context. */ void *tag; - /** The callback to call if the conditions are met. */ - dmu_callback_t dmu_cb; + /** The callback to call once an I/O completes entirely. */ + dmu_context_callback_t context_cb; + + /** The callback to call to transfer a buffer set. 
*/ + dmu_buf_set_callback_t buf_set_transfer_cb; + + /** The callback to call to transfer a buffer. */ + dmu_buf_transfer_callback_t buf_transfer_cb; + + /** + * The callback to call to move a specific block's contents. This + * is normally only set by dmu_context_init(). + */ + dmu_buf_transfer_callback_t move_cb; /** Completed size. */ uint64_t completed_size; @@ -258,6 +276,7 @@ #define DMU_CTX_FLAG_NO_HOLD (1 << 4) #define DMU_CTX_FLAG_SUN_PAGES (1 << 5) #define DMU_CTX_FLAG_NOFILL (1 << 6) +#define DMU_CTX_FLAG_ASYNC (1 << 7) #define DMU_CTX_WRITER_FLAGS (DMU_CTX_FLAG_SUN_PAGES|DMU_CTX_FLAG_NOFILL) #define DMU_CTX_READER_FLAGS (DMU_CTX_FLAG_PREFETCH) @@ -291,6 +310,11 @@ /** The amount of data remaining to process for this buffer set. */ uint64_t resid; + /** For writes only, if the context doesn't have a transaction. */ + dmu_tx_t *tx; +#define DMU_BUF_SET_TX(buf_set) \ + ((buf_set)->dmu_ctx->tx ? (buf_set)->dmu_ctx->tx : (buf_set)->tx) + /** The number of errors that occurred. */ int err; @@ -304,11 +328,38 @@ void dmu_buf_set_rele(dmu_buf_set_t *buf_set, boolean_t err); int dmu_context_init(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, - uint64_t offset, uint64_t size, void *data_buf, dmu_callback_t dmu_cb, - dmu_tx_t *tx, void *tag, uint32_t flags); + uint64_t offset, uint64_t size, void *data_buf, void *tag, uint32_t flags); void dmu_context_seek(dmu_context_t *dmu_ctx, uint64_t offset, uint64_t size, void *data_buf); void dmu_context_rele(dmu_context_t *dmu_ctx); +void dmu_buf_set_write_no_tx(dmu_buf_set_t *buf_set, dmu_buf_t *buf, + uint64_t off, uint64_t sz); + +/* Optional context setters; use after calling dmu_context_init*(). */ +static inline void +dmu_context_set_context_cb(dmu_context_t *ctx, dmu_context_callback_t cb) +{ + ctx->context_cb = cb; +} +static inline void +dmu_context_set_buf_set_transfer_cb(dmu_context_t *ctx, + dmu_buf_set_callback_t cb) +{ + ctx->buf_set_transfer_cb = cb; +} +static inline void +dmu_context_set_buf_transfer_cb(dmu_context_t *ctx, + dmu_buf_transfer_callback_t cb) +{ + ctx->buf_transfer_cb = cb; +} +static inline void +dmu_context_set_dmu_tx(dmu_context_t *ctx, dmu_tx_t *tx) +{ + ASSERT(tx != NULL && ((ctx->flags & DMU_CTX_FLAG_READ) == 0)); + dmu_context_set_buf_transfer_cb(ctx, dmu_buf_set_write_no_tx); + ctx->tx = tx; +} /* DMU thread context handlers. 
 */
 int dmu_thread_context_create(void);

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h#2 (text) ====

@@ -55,6 +55,10 @@
 extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
 extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
 #endif	/* sun */
+#ifdef __FreeBSD__
+extern int zvol_freebsd_read(struct cdev *dev, struct uio *uio, int ioflag);
+extern int zvol_freebsd_write(struct cdev *dev, struct uio *uio, int ioflag);
+#endif
 extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
     int *rvalp);
 extern int zvol_busy(void);

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#11 (text) ====

@@ -659,8 +659,7 @@
 #endif	/* sun */
 
 	error = dmu_context_init(&dmu_ctx, os, zp->z_id, uio->uio_loffset, n,
-	    uio, /*dmu_cb*/NULL, /*tx*/NULL, FTAG,
-	    DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_PREFETCH);
+	    uio, FTAG, DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_PREFETCH);
 	if (error)
 		goto out;

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#28 (text) ====

@@ -101,8 +101,8 @@
 	.d_open =	zvol_open,
 	.d_close =	zvol_close,
 	.d_strategy =	zvol_strategy,
-	.d_read =	physread,
-	.d_write =	physwrite,
+	.d_read =	zvol_freebsd_read,
+	.d_write =	zvol_freebsd_write,
 };
 
 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
@@ -1285,53 +1285,138 @@
  * discover the zvol_dmu_state_t.
  */
 	dmu_context_t dmu_ctx;
+	zvol_state_t *zv;
 	struct bio *bp;
 	rl_t *rl;
-	zvol_state_t *zv;
-	dmu_tx_t *tx;
 } zvol_dmu_state_t;
 
 static void
-zvol_dmu_cb(dmu_context_t *dmu_ctx)
+zvol_dmu_write_buf(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off,
+    uint64_t sz)
+{
+	zvol_dmu_state_t *zds = (zvol_dmu_state_t *)buf_set->dmu_ctx;
+	zvol_state_t *zv = zds->zv;
+	boolean_t sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+	dmu_tx_t *tx = DMU_BUF_SET_TX(buf_set);
+
+	dmu_buf_set_write_no_tx(buf_set, db, off, sz);
+
+	/* Log this write. */
+	if ((zv->zv_flags & ZVOL_WCE) == 0 || sync)
+		zvol_log_write(zv, tx, buf_set->offset, buf_set->size, sync);
+	dmu_tx_commit(tx);
+}
+
+static void
+zvol_dmu_done(dmu_context_t *dmu_ctx)
 {
 	zvol_dmu_state_t *zds = (zvol_dmu_state_t *)dmu_ctx;
-	int sync = (zds->bp->bio_cmd != BIO_READ &&
-	    zds->zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+	boolean_t reader = (dmu_ctx->flags & DMU_CTX_FLAG_READ);
+	boolean_t sync_always = zds->zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+	int err = 0;
 
-	zds->bp->bio_completed = dmu_ctx->completed_size;
-	if (zds->bp->bio_cmd != BIO_READ) {
-		ASSERT(zds->zv != NULL && zds->tx != NULL);
-		zvol_log_write(zds->zv, zds->tx, zds->bp->bio_offset,
-		    zds->bp->bio_completed, sync);
-		dmu_tx_commit(zds->tx);
+	if (!reader && sync_always)
+		zil_commit(zds->zv->zv_zilog, ZVOL_OBJ);
+	if (dmu_ctx->completed_size < dmu_ctx->length) {
 		if (dmu_ctx->offset > zds->zv->zv_volsize)
-			zds->bp->bio_error = EINVAL;
+			err = EINVAL;
 	} else
-		zds->bp->bio_error = (dmu_ctx->err == 0) ? 0 : EIO;
+		err = (dmu_ctx->err == 0) ? 0 : EIO;
+	/* Notify synchronous callers of the final errno. */
+	dmu_ctx->err = err;
+
 	zfs_range_unlock(zds->rl);
-	if (zds->bp->bio_to != NULL)
-		g_io_deliver(zds->bp, 0);
+
+	/* Finally, deliver to the caller. UIOs are already done.
+	 */
+	if ((dmu_ctx->flags & DMU_CTX_FLAG_UIO) == 0) {
+		ASSERT(zds->bp != NULL);
+		zds->bp->bio_error = err;
+		zds->bp->bio_completed = dmu_ctx->completed_size;
+		if (zds->bp->bio_to != NULL)
+			g_io_deliver(zds->bp, 0);
+		else
+			zds->bp->bio_done(zds->bp);
+	}
+	if (dmu_ctx->flags & DMU_CTX_FLAG_ASYNC)
+		kmem_free(zds, sizeof(zvol_dmu_state_t));
+}
+
+int
+zvol_dmu_common(zvol_state_t *zv, void *data, uint32_t dmu_flags)
+{
+	zvol_dmu_state_t *zds;
+	int error;
+	uint64_t io_size, off;
+	void *data_buf;
+	boolean_t reader = (dmu_flags & DMU_CTX_FLAG_READ) != 0;
+	struct bio *bp = NULL;
+
+#ifdef _KERNEL
+	if (dmu_flags & DMU_CTX_FLAG_UIO) {
+		uio_t *uio = (uio_t *)data;
+		struct iovec *iov = uio->uio_iov;
+
+		io_size = uio->uio_resid;
+		off = uio->uio_loffset;
+		data_buf = uio;
+		dprintf("%s: uio iov=%p iovcnt=%d base %p len %lu\n",
+		    __func__, iov, uio->uio_iovcnt, iov->iov_base,
+		    iov->iov_len);
+	} else
+#endif
+	{
+		bp = (struct bio *)data;
+		io_size = bp->bio_length;
+		off = bp->bio_offset;
+		data_buf = bp->bio_data;
+	}
+
+	/* Don't allow I/Os past the end of the volume. */
+	if (io_size > zv->zv_volsize - off)
+		io_size = zv->zv_volsize - off;
+
+	if (reader)
+		dmu_flags |= DMU_CTX_FLAG_PREFETCH;
+
+	zds = kmem_zalloc(sizeof(zvol_dmu_state_t), KM_SLEEP);
+	zds->zv = zv;
+	zds->bp = bp;
+
+	/* Set up the DMU context & range lock. */
+	error = dmu_context_init(&zds->dmu_ctx, zv->zv_objset, ZVOL_OBJ,
+	    off, io_size, data_buf, FTAG, dmu_flags);
+	if (error) {
+		kmem_free(zds, sizeof(zvol_dmu_state_t));
+		return (error);
+	}
+	/* Override the writer case to log the writes. */
+	if (!reader)
+		dmu_context_set_buf_transfer_cb(&zds->dmu_ctx,
+		    zvol_dmu_write_buf);
+	dmu_context_set_context_cb(&zds->dmu_ctx, zvol_dmu_done);
+	zds->rl = zfs_range_lock(&zv->zv_znode, off, io_size,
+	    reader ? RL_READER : RL_WRITER);
+
+	/* Issue the I/O to the DMU and release the initiator hold. */
+	dmu_issue(&zds->dmu_ctx);
+	if ((dmu_flags & DMU_CTX_FLAG_ASYNC) == 0) {
+		/* For the synchronous case, save the error to pass it up. */
+		error = zds->dmu_ctx.err;
+	} else
+		ASSERT(zds->dmu_ctx.holds == 1);
+	dmu_context_rele(&zds->dmu_ctx);
+	/* In the async case, zvol_dmu_done frees us. */
+	if ((dmu_flags & DMU_CTX_FLAG_ASYNC) == 0)
+		kmem_free(zds, sizeof(zvol_dmu_state_t));
+	return (error);
+}
 
 static void
 zvol_common_strategy(struct bio *bp, int geom_mode)
 {
 	zvol_state_t *zv;
-	uint64_t off;
-	objset_t *os;
-	rl_t *rl;
 	int error = 0;
-	boolean_t doread = (bp->bio_cmd == BIO_READ);
-	dmu_tx_t *tx = NULL;
-	zvol_dmu_state_t *zds = NULL;
-	uint32_t dmu_flags = 0;
+	uint32_t dmu_flags = DMU_CTX_FLAG_ASYNC;
 
 	/* XXX KDM may be able to consolidate this into the non-GEOM case.
*/ if (geom_mode != 0) @@ -1341,76 +1425,36 @@ bp->bio_to = NULL; } +#define ZVOL_STRATEGY_DELIVER(_err) \ + if (geom_mode != 0) \ + g_io_deliver(bp, _err); \ + else { \ + bp->bio_error = _err; \ + bp->bio_done(bp); \ + } + if (zv == NULL) { - error = ENXIO; - - if (geom_mode != 0) - g_io_deliver(bp, error); - else { - bp->bio_error = error; - bp->bio_done(bp); - } + ZVOL_STRATEGY_DELIVER(ENXIO); return; } if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) { - error = EROFS; - - if (geom_mode != 0) - g_io_deliver(bp, error); - else { - bp->bio_error = error; - bp->bio_done(bp); - } + ZVOL_STRATEGY_DELIVER(EROFS); return; } - off = bp->bio_offset; ASSERT(zv->zv_objset != NULL); - if (bp->bio_length > 0 && (off < 0 || off >= zv->zv_volsize)) { - error = EIO; - - if (geom_mode != 0) - g_io_deliver(bp, error); - else { - bp->bio_error = error; - bp->bio_done(bp); - } + if (bp->bio_length > 0 && + (bp->bio_offset < 0 || bp->bio_offset >= zv->zv_volsize)) { + ZVOL_STRATEGY_DELIVER(EIO); return; } +#undef ZVOL_STRATEGY_DELIVER - /* - * There must be no buffer changes when doing a dmu_sync() because - * we can't change the data whilst calculating the checksum. - */ - rl = zfs_range_lock(&zv->zv_znode, off, bp->bio_length, - doread ? RL_READER : RL_WRITER); - - if (doread) - dmu_flags = (DMU_CTX_FLAG_READ|DMU_CTX_FLAG_PREFETCH); - else { - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bp->bio_length); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) - dmu_tx_abort(tx); - } - - zds = kmem_zalloc(sizeof(zvol_dmu_state_t), KM_SLEEP); - error = dmu_context_init(&zds->dmu_ctx, zv->zv_objset, ZVOL_OBJ, off, - bp->bio_length, bp->bio_data, zvol_dmu_cb, tx, FTAG, dmu_flags); - /* All error handling is done in the callback. */ - zds->bp = bp; - zds->rl = rl; - zds->zv = zv; - zds->tx = tx; - if (error == 0) { - /* Pump primed, issue the I/O to the DMU. 
*/ - (void) dmu_issue(&zds->dmu_ctx); - } else - zds->dmu_ctx.err = error; - dmu_context_rele(&zds->dmu_ctx); + if (bp->bio_cmd == BIO_READ) + dmu_flags |= DMU_CTX_FLAG_READ; + zvol_dmu_common(zv, bp, dmu_flags); } static void @@ -1473,51 +1517,54 @@ return (error); } +#endif /* sun */ + +#if defined(__FreeBSD__) && defined(_KERNEL) +int +zvol_freebsd_read(struct cdev *dev, struct uio *uio, int ioflag) +{ + zvol_state_t *zv = (zvol_state_t *)dev->si_drv1; + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset >= zv->zv_volsize)) + return (EIO); + + return (zvol_dmu_common(zv, uio, DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_READ)); +} +int +zvol_freebsd_write(struct cdev *dev, struct uio *uio, int ioflag) +{ + zvol_state_t *zv = (zvol_state_t *)dev->si_drv1; + + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset >= zv->zv_volsize)) + return (EIO); + + return (zvol_dmu_common(zv, uio, DMU_CTX_FLAG_UIO)); +} +#endif /* __FreeBSD__ && _KERNEL */ + +#ifdef sun /*ARGSUSED*/ int zvol_read(dev_t dev, uio_t *uio, cred_t *cr) { minor_t minor = getminor(dev); zvol_state_t *zv; - uint64_t volsize; - rl_t *rl; - int error = 0; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) return (ENXIO); - volsize = zv->zv_volsize; if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset >= volsize)) + (uio->uio_loffset < 0 || uio->uio_loffset >= zv->zv_volsize)) return (EIO); - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_READ, - zvol_minphys, uio); - return (error); - } + if (zv->zv_flags & ZVOL_DUMPIFIED) + return (physio(zvol_strategy, NULL, dev, B_READ, + zvol_minphys, uio)); - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_READER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - - /* don't read past the end */ - if (bytes > volsize - uio->uio_loffset) - bytes = volsize - uio->uio_loffset; - - error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = EIO; - break; - } - } - zfs_range_unlock(rl); - return (error); + return (zvol_dmu_common(zv, uio, DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_READ)); } /*ARGSUSED*/ @@ -1526,58 +1573,20 @@ { minor_t minor = getminor(dev); zvol_state_t *zv; - uint64_t volsize; - rl_t *rl; - int error = 0; - boolean_t sync; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) return (ENXIO); - volsize = zv->zv_volsize; if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset >= volsize)) + (uio->uio_loffset < 0 || uio->uio_loffset >= zv->zv_volsize)) return (EIO); - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_WRITE, - zvol_minphys, uio); - return (error); - } + if (zv->zv_flags & ZVOL_DUMPIFIED) + return (physio(zvol_strategy, NULL, dev, B_WRITE, + zvol_minphys, uio)); - sync = !(zv->zv_flags & ZVOL_WCE) || - (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); - - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_WRITER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - uint64_t off = uio->uio_loffset; - - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); - - if (bytes > volsize - off) /* don't write past the end */ - bytes = volsize - off; - - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - break; - 
} - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); - if (error == 0) - zvol_log_write(zv, tx, off, bytes, sync); - dmu_tx_commit(tx); - - if (error) - break; - } - zfs_range_unlock(rl); - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - return (error); + return (zvol_dmu_common(zv, uio, DMU_CTX_FLAG_UIO)); } int Change 526131 by willa@willa_repo on 2012/02/09 13:39:03 Temporarily fix the zvol writes. Bypass the new per-chunk transaction management by creating the transaction in zvol_dmu_common() and assigning it to the dmu_context_t. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#29 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#29 (text) ==== @@ -1290,6 +1290,7 @@ rl_t *rl; } zvol_dmu_state_t; +#ifdef NOTYET static void zvol_dmu_write_buf(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, uint64_t sz) @@ -1306,6 +1307,7 @@ zvol_log_write(zv, tx, buf_set->offset, buf_set->size, sync); dmu_tx_commit(tx); } +#endif static void zvol_dmu_done(dmu_context_t *dmu_ctx) @@ -1315,8 +1317,11 @@ boolean_t sync_always = zds->zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; int err; - if (!reader && sync_always) - zil_commit(zds->zv->zv_zilog, ZVOL_OBJ); + if (!reader) { + dmu_tx_commit(zds->dmu_ctx.tx); + if (sync_always) + zil_commit(zds->zv->zv_zilog, ZVOL_OBJ); + } if (dmu_ctx->completed_size < dmu_ctx->length) { if (dmu_ctx->offset > zds->zv->zv_volsize) err = EINVAL; @@ -1350,6 +1355,7 @@ void *data_buf; boolean_t reader = (dmu_flags & DMU_CTX_FLAG_READ) != 0; struct bio *bp = NULL; + dmu_tx_t *tx; #ifdef _KERNEL if (dmu_flags & DMU_CTX_FLAG_UIO) @@ -1389,9 +1395,20 @@ if (error) return (error); /* Override the writer case to log the writes. */ - if (!reader) + if (!reader) { + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, io_size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + kmem_free(zds, sizeof(zvol_dmu_state_t)); + return (error); + } + dmu_context_set_dmu_tx(&zds->dmu_ctx, tx); +#ifdef NOTYET dmu_context_set_buf_transfer_cb(&zds->dmu_ctx, zvol_dmu_write_buf); +#endif + } dmu_context_set_context_cb(&zds->dmu_ctx, zvol_dmu_done); zds->rl = zfs_range_lock(&zv->zv_znode, off, io_size, reader ? RL_READER : RL_WRITER); Change 526138 by willa@willa_repo on 2012/02/09 17:07:20 Another shot at trying to do per-buf_set transactions. This doesn't seem to work, however; dnodes can't be held across transaction groups, so I tried to work around that by changing the transaction assignment to check whether it needs to do a re-hold; in this way we might avoid re-holding within a transaction group. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#53 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#10 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h#4 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#30 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#53 (text) ==== @@ -819,7 +819,7 @@ refcount_release(&dmu_ctx_in_flight); #endif - if ((dmu_ctx->flags & DMU_CTX_FLAG_NO_HOLD) == 0) + if ((dmu_ctx->flags & DMU_CTX_FLAG_NO_HOLD) == 0 && dmu_ctx->dn) dnode_rele(dmu_ctx->dn, dmu_ctx->tag); /* At this point, there are no buffer sets left. 
Call back. */
@@ -1022,6 +1022,37 @@ } /** + * \brief Re-hold the dnode if the tx got a newer txg. + * + * \param tx DMU transaction to check. + * \param data DMU context to fix up. + * + * \retval 0 If no re-hold occurred or the re-hold succeeded. + * \retval err Any error that could be returned by dnode_hold(). + */ +static int +dmu_buf_set_tx_rehold_dnode(dmu_tx_t *tx, void *data) +{ + dmu_context_t *dmu_ctx = (dmu_context_t *)data; + objset_t *os = dmu_ctx->dn->dn_objset; + uint64_t object = dmu_ctx->dn->dn_object; + int err; + + /* + * This should only be called when all transactions we've already + * issued have completed, thus making the call safe. + */ + ASSERT(refcount_count(&dmu_ctx->dn->dn_tx_holds) == 1); + + printf("reholding dnode %p\n", dmu_ctx->dn); + dnode_rele(dmu_ctx->dn, dmu_ctx->tag); + dmu_ctx->dn = NULL; + err = dnode_hold(os, object, dmu_ctx->tag, &dmu_ctx->dn); + printf("reheld dnode %p\n", dmu_ctx->dn); + return (err); +} + +/** * \brief Initialize a buffer set of a certain size. * * \param dmu_ctx DMU context to associate the buffer set with.
@@ -1045,6 +1076,32 @@ ASSERT(dmu_ctx != NULL); ASSERT(dmu_ctx->holds > 0); + /* + * Create a transaction for writes, if needed. This must be done + * first in order to hold the correct struct_rwlock, use the + * correct values for dn_datablksz, etc. + */ + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0 && dmu_ctx->tx == NULL) { + tx = dmu_tx_create(dn->dn_objset); + dmu_tx_hold_write(tx, dn->dn_object, dmu_ctx->offset, size); + err = dmu_tx_assign_cb(tx, TXG_WAIT, + dmu_buf_set_tx_rehold_dnode, dmu_ctx); + if (err) { + dmu_tx_abort(tx); + return (err); + } + /* See if the TX crossed TXGs but the dnode didn't. */ + if (tx->tx_txg != dmu_ctx->dn->dn_assigned_txg) { + err = dmu_buf_set_tx_rehold_dnode(tx, dmu_ctx); + if (err) { + dmu_tx_abort(tx); + return (err); + } + } + /* The dnode may have changed; re-assign. */ + dn = dmu_ctx->dn; + } + rw_enter(&dn->dn_struct_rwlock, RW_READER); /* Figure out how many blocks are needed for the requested size. */
@@ -1068,15 +1125,6 @@ nblks = 1; } - /* Create a transaction for writes, if needed. */ - if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0 && dmu_ctx->tx == NULL) { - tx = dmu_tx_create(dn->dn_objset); - dmu_tx_hold_write(tx, dn->dn_object, dmu_ctx->offset, size); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) - goto out; - } - /* Create the new buffer set. */ set_size = sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *); buf_set = kmem_zalloc(set_size, KM_SLEEP);
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#10 (text) ====
@@ -1062,9 +1062,11 @@ * -# A specific txg. Use this if you need to ensure that multiple * transactions all sync in the same txg. Like TXG_NOWAIT, it * returns ERESTART if it can't assign you into the requested txg. + * + * If cb is not NULL, it will be called whenever a wait occurs. */ int -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) +dmu_tx_assign_cb(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_assign_t cb, void *data) { int err;
@@ -1073,12 +1075,19 @@ ASSERT(!dsl_pool_sync_context(tx->tx_pool)); while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { + printf("dmu_tx_try_assign failed for tx %p\n", tx); dmu_tx_unassign(tx); if (err != ERESTART || txg_how != TXG_WAIT) return (err); dmu_tx_wait(tx); + + if (cb) { + err = cb(tx, data); + if (err) + return (err); + } } txg_rele_to_quiesce(&tx->tx_txgh);
@@ -1086,6 +1095,12 @@ return (0); } +int +dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + return (dmu_tx_assign_cb(tx, txg_how, NULL, NULL)); +} + void dmu_tx_wait(dmu_tx_t *tx) {
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h#4 (text) ====
@@ -103,10 +103,14 @@ void *dcb_data; /**< caller private data */ } dmu_tx_callback_t; +typedef int (*dmu_tx_assign_t)(dmu_tx_t *tx, void *data); + /* * These routines are defined in dmu.h, and are called by the user. */ dmu_tx_t *dmu_tx_create(objset_t *dd); +int dmu_tx_assign_cb(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_assign_t cb, + void *data); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_abort(dmu_tx_t *tx);
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#30 (text) ====
@@ -1291,6 +1291,7 @@ } zvol_dmu_state_t; #ifdef NOTYET +#else static void zvol_dmu_write_buf(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, uint64_t sz)
@@ -1396,6 +1397,7 @@ return (error); /* Override the writer case to log the writes. */ if (!reader) { +#ifdef NOTYET tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_write(tx, ZVOL_OBJ, off, io_size); error = dmu_tx_assign(tx, TXG_WAIT);
@@ -1404,7 +1406,7 @@ return (error); } dmu_context_set_dmu_tx(&zds->dmu_ctx, tx); -#ifdef NOTYET +#else dmu_context_set_buf_transfer_cb(&zds->dmu_ctx, zvol_dmu_write_buf); #endif
@@ -1413,17 +1415,17 @@ zds->rl = zfs_range_lock(&zv->zv_znode, off, io_size, reader ? RL_READER : RL_WRITER); - /* Issue the write to the DMU and release the initiator hold. */ + /* Issue the DMU I/O and release the initiator hold. */ dmu_issue(&zds->dmu_ctx); + /* Either this was an async issue, or we're the last holder. */ + ASSERT((dmu_flags & DMU_CTX_FLAG_ASYNC) || zds->dmu_ctx.holds == 1); + dmu_context_rele(&zds->dmu_ctx); + /* In the async case, zvol_dmu_done frees us. */ if ((dmu_flags & DMU_CTX_FLAG_ASYNC) == 0) { /* For the synchronous case, save the error to pass it up. */ error = zds->dmu_ctx.err; - } else - ASSERT(zds->dmu_ctx.holds == 1); - dmu_context_rele(&zds->dmu_ctx); - /* In the async case, zvol_dmu_done frees us. */ - if ((dmu_flags & DMU_CTX_FLAG_ASYNC) == 0) kmem_free(zds, sizeof(zvol_dmu_state_t)); + } return (error); }

Change 526141 by willa@willa_repo on 2012/02/09 23:50:56

Fix writes the (hopefully finally) right way.

After some additional research I determined that, in fact, DMU TX doesn't allow holding a dnode across transaction commits. So for writes, the best we can do is perform one dnode hold per chunk, and consolidate the tx management into dmu_issue() as part of chunking. dmu_tx_hold_write() doesn't allow hold sizes >= DMU_MAX_ACCESS, so be sure to chunk in DMU_MAX_ACCESS/2 pieces, as was done before.

Since we have to do a hold/rele cycle for every tx, don't require writes to provide a held dnode. Instead, everyone will directly provide the (objset, object) pair needed to hold it. A sketch of the resulting per-chunk write loop follows.
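In rough outline, the per-chunk write path described above looks like this (a minimal sketch, not the committed code: error handling, the buffer-set machinery, and the struct_rwlock handling are elided; the names follow the diffs below):

	while (dmu_ctx->size > 0) {
		uint64_t io_size = MIN(dmu_ctx->size, DMU_MAX_ACCESS/2);

		/* Per-chunk transaction; the hold size stays under the cap. */
		tx = dmu_tx_create(dmu_ctx->os);
		dmu_tx_hold_write(tx, dmu_ctx->object, dmu_ctx->offset, io_size);
		(void) dmu_tx_assign(tx, TXG_WAIT);

		/* Per-chunk dnode hold, taken only after the tx is assigned. */
		(void) dnode_hold(dmu_ctx->os, dmu_ctx->object, dmu_ctx->tag,
		    &dmu_ctx->dn);

		/* ... set up the buffer set and copy this chunk's data ... */

		/* Release the dnode before the commit, then commit. */
		dnode_rele(dmu_ctx->dn, dmu_ctx->tag);
		dmu_ctx->dn = NULL;
		dmu_tx_commit(tx);
	}

Because the hold and release bracket a single transaction, no dnode hold ever spans a commit, which is exactly the constraint the experiment in change 526138 ran into.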
Consequently, combine dmu_context_init{,_dnode}() and have the new function perform the dnode_hold for reads. For writes, if a tx is provided, perform it in dmu_issue() just before chunking; otherwise perform it each time a transaction is created in dmu_context_setup_tx(). Correct some confusion between dmu_context_t's buf_transfer_cb and buf_set_transfer_cb. dmu_tx_commit(), if called, needs to be called by a buf_set_transfer_cb, since it should be called after all buffers have been copied. This new function is dmu_buf_set_transfer_write_tx. This in turn calls dmu_buf_set_transfer_write, which releases the dnode each time it is called. Reconstitute the old per-buf transfer routines to handle the difference between a write and a read (write wraps move_cb with DMU calls that fill/dirty and after, fill_done). For ZVOL, provide a substitute for dmu_buf_set_transfer_write_tx, which calls that function, then performs the zvol log write, and then commits. Remove the ill-conceived dmu_tx_assign_cb() I added in change 526138. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#54 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#11 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#28 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h#5 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#12 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#31 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#54 (text) ==== @@ -734,7 +734,7 @@ } static void -dmu_buf_set_nofill(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, +dmu_buf_transfer_nofill(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, uint64_t sz) { dmu_tx_t *tx = DMU_BUF_SET_TX(buf_set); @@ -742,8 +742,8 @@ /* No need to do any more here. */ } -void -dmu_buf_set_write_no_tx(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, +static void +dmu_buf_transfer_write(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, uint64_t sz) { dmu_tx_t *tx = DMU_BUF_SET_TX(buf_set); @@ -757,26 +757,13 @@ } static void -dmu_buf_set_write_tx(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, - uint64_t sz) -{ - dmu_buf_set_write_no_tx(buf_set, db, off, sz); - dmu_tx_commit(buf_set->tx); -} - -static void -dmu_buf_set_read(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, +dmu_buf_transfer_read(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, uint64_t sz) { buf_set->dmu_ctx->move_cb(buf_set, db, off, sz); } -/** - * \brief Perform a buffer set I/O transfer. - * - * \param buf_set Buffer set to read. - */ -static void +void dmu_buf_set_transfer(dmu_buf_set_t *buf_set) { uint64_t offset, size; @@ -801,6 +788,25 @@ } } +void +dmu_buf_set_transfer_write(dmu_buf_set_t *buf_set) +{ + + dmu_buf_set_transfer(buf_set); + ASSERT(buf_set->dmu_ctx->dn != NULL); + /* Release the dnode immediately before committing the tx. */ + dnode_rele(buf_set->dmu_ctx->dn, buf_set->dmu_ctx->tag); + buf_set->dmu_ctx->dn = NULL; +} + +static void +dmu_buf_set_transfer_write_tx(dmu_buf_set_t *buf_set) +{ + + dmu_buf_set_transfer_write(buf_set); + dmu_tx_commit(buf_set->tx); +} + /** * \brief Release a DMU context hold, cleaning up if no holds remain. 
* @@ -819,7 +825,7 @@ refcount_release(&dmu_ctx_in_flight); #endif - if ((dmu_ctx->flags & DMU_CTX_FLAG_NO_HOLD) == 0 && dmu_ctx->dn) + if ((dmu_ctx->flags & DMU_CTX_FLAG_NO_HOLD) == 0 && dmu_ctx->dn != NULL) dnode_rele(dmu_ctx->dn, dmu_ctx->tag); /* At this point, there are no buffer sets left. Call back. */ @@ -1022,33 +1028,38 @@ } /** - * \brief Re-hold the dnode if the tx got a newer txg. + * \brief Set up a new transaction for the DMU context. * - * \param tx DMU transaction to check. - * \param data DMU context to fix up. - * - * \retval 0 If no re-hold occurred or the re-hold succeeded. - * \retval err Any error that could be returned by dnode_hold(). + * \param dmu_ctx DMU context to set up new transaction for. + * \param txp Address to store dmu_tx_t pointer. + * \param dnp Address to store dnode_t pointer for new dnode. */ static int -dmu_buf_set_tx_rehold_dnode(dmu_tx_t *tx, void *data) +dmu_context_setup_tx(dmu_context_t *dmu_ctx, dmu_tx_t **txp, dnode_t **dnp, + uint64_t size) { - dmu_context_t *dmu_ctx = (dmu_context_t *)dmu_ctx; - objset_t *os = dmu_ctx->dn->dn_objset; - uint64_t object = dmu_ctx->dn->dn_object; int err; + *txp = dmu_tx_create(dmu_ctx->os); + dmu_tx_hold_write(*txp, dmu_ctx->object, dmu_ctx->offset, size); + err = dmu_tx_assign(*txp, TXG_WAIT); + if (err) + goto out; + /* - * This should only be called when all transactions we've already - * issued have completed, thus making the call safe. + * Writer without caller TX: dnode hold is done here rather + * than in dmu_context_init(). */ - ASSERT(refcount_count(&dmu_ctx->dn->dn_tx_holds) == 1); + err = dnode_hold(dmu_ctx->os, dmu_ctx->object, dmu_ctx->tag, dnp); + if (err) + goto out; + dmu_ctx->dn = *dnp; - printf("reholding dnode %p\n", dmu_ctx->dn); - dnode_rele(dmu_ctx->dn, dmu_ctx->tag); - dmu_ctx->dn = NULL; - err = dnode_hold(os, object, dmu_ctx->tag, &dmu_ctx->dn); - printf("reheld dnode %p\n", dmu_ctx->dn); +out: + if (err && *txp != NULL) { + dmu_tx_abort(*txp); + *txp = NULL; + } return (err); } @@ -1082,24 +1093,9 @@ * correct values for dn_datablksz, etc. */ if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0 && dmu_ctx->tx == NULL) { - tx = dmu_tx_create(dn->dn_objset); - dmu_tx_hold_write(tx, dn->dn_object, dmu_ctx->offset, size); - err = dmu_tx_assign_cb(tx, TXG_WAIT, - dmu_buf_set_tx_rehold_dnode, dmu_ctx); - if (err) { - dmu_tx_abort(tx); + err = dmu_context_setup_tx(dmu_ctx, &tx, &dn, size); + if (err) return (err); - } - /* See if the TX crossed TXGs but the dnode didn't. */ - if (tx->tx_txg != dmu_ctx->dn->dn_assigned_txg) { - err = dmu_buf_set_tx_rehold_dnode(tx, dmu_ctx); - if (err) { - dmu_tx_abort(tx); - return (err); - } - } - /* The dnode may have changed; re-assign. */ - dn = dmu_ctx->dn; } rw_enter(&dn->dn_struct_rwlock, RW_READER); @@ -1161,7 +1157,8 @@ out: if (err && tx != NULL) dmu_tx_abort(tx); - rw_exit(&dn->dn_struct_rwlock); + if (dn != NULL) + rw_exit(&dn->dn_struct_rwlock); return (err); } @@ -1243,12 +1240,20 @@ ASSERT((dmu_ctx->flags & DMU_CTX_FLAG_ASYNC) == 0 || dmu_ctx->context_cb != NULL); + /* For writers, if a tx was specified but a dnode wasn't, hold here. */ + if (dmu_ctx->tx != NULL && dmu_ctx->dn == NULL) { + err = dnode_hold(dmu_ctx->os, dmu_ctx->object, dmu_ctx->tag, + &dmu_ctx->dn); + if (err) + return (err); + } + /* While there is work left to do, execute the next chunk. */ dprintf("%s(%p) -> buf %p off %lu sz %lu\n", __func__, dmu_ctx, dmu_ctx->data_buf, dmu_ctx->offset, dmu_ctx->size); while (dmu_ctx->size > 0 && err == 0) { /* Determine this chunk's size. 
*/ - io_size = MIN(dmu_ctx->size, DMU_MAX_ACCESS); + io_size = MIN(dmu_ctx->size, DMU_MAX_ACCESS/2); /* Initialize the buffer set for this chunk. */ dprintf("%s(%p@%lu+%lu) chunk %lu\n", __func__, dmu_ctx, @@ -1274,58 +1279,76 @@ * \brief Set up a DMU context. * * \param dmu_ctx The DMU context. - * \param dn The held dnode to associate with the context. + * \param dn A held dnode to associate with the context, or NULL. + * \param os The object set associated with the context. + * \param object The object ID associated with the context. * \param size Size of the I/O to be performed. * \param offset Offset into the dnode to perform the I/O. * \param data_buf Data buffer to perform I/O transfers with. * \param tag Hold tag to use. * \param flags DMU context flags. * - * \note The dnode must not be NULL. - * \note The dnode must be held, unless the DMU_CTX_FLAG_NO_HOLD - * flag is specified. + * \note The dnode must not be NULL, unless this is a writer. + * \note The dnode, if specified, must be held, unless the + * DMU_CTX_FLAG_NO_HOLD flag is specified. */ -void -dmu_context_init_dnode(dmu_context_t *dmu_ctx, struct dnode *dn, - uint64_t offset, uint64_t size, void *data_buf, void *tag, uint32_t flags) +int +dmu_context_init(dmu_context_t *dmu_ctx, struct dnode *dn, objset_t *os, + uint64_t object, uint64_t offset, uint64_t size, void *data_buf, void *tag, + uint32_t flags) { boolean_t reader = (flags & DMU_CTX_FLAG_READ) != 0; + int err; #ifdef ZFS_DEBUG refcount_acquire(&dmu_ctx_in_flight); atomic_add_64(&dmu_ctx_total, 1); + /* Make sure the dnode is passed in appropriately. */ + if (dn == NULL) + ASSERT(os != NULL); + else + ASSERT(!refcount_is_zero(&dn->dn_holds) || + (flags & DMU_CTX_FLAG_NO_HOLD)); #endif #ifndef sun ASSERT((flags & DMU_CTX_FLAG_SUN_PAGES) == 0); #endif - /* Make sure the dnode passed in is valid. */ - ASSERT(dn != NULL); - ASSERT(!refcount_is_zero(&dn->dn_holds) || - (flags & DMU_CTX_FLAG_NO_HOLD)); /* Make sure the flags are compatible with the I/O type. */ ASSERT(reader || ((flags & DMU_CTX_READER_FLAGS) == 0)); ASSERT(!reader || ((flags & DMU_CTX_WRITER_FLAGS) == 0)); /* The NOFILL flag and a NULL data_buf go hand in hand. */ ASSERT((flags & DMU_CTX_FLAG_NOFILL) ^ (data_buf != NULL)); + /* + * If the caller is a reader and didn't pass in a dnode, hold it. + * Writers (re-)hold a dnode in dmu_context_setup_tx(), or if a tx + * is specified, in dmu_issue(). + */ + if (dn == NULL && (flags & DMU_CTX_FLAG_READ)) { + err = dnode_hold(os, object, tag, &dn); + if (err) + return (err); + } + /* All set, actually initialize the context! */ bzero(dmu_ctx, sizeof(dmu_context_t)); dmu_ctx->dn = dn; + dmu_ctx->os = os; + dmu_ctx->object = object; dmu_ctx->length = size; dmu_context_seek(dmu_ctx, offset, size, data_buf); dmu_ctx->tag = tag; dmu_ctx->flags = flags; + /* Initialize default I/O callbacks. */ - dmu_ctx->buf_set_transfer_cb = dmu_buf_set_transfer; + dmu_ctx->buf_set_transfer_cb = reader ? dmu_buf_set_transfer : + dmu_buf_set_transfer_write_tx; if ((dmu_ctx->flags & DMU_CTX_FLAG_NOFILL) == 0) { - dmu_ctx->buf_transfer_cb = reader ? dmu_buf_set_read : - dmu_buf_set_write_tx; + dmu_ctx->buf_transfer_cb = reader ? dmu_buf_transfer_read : + dmu_buf_transfer_write; } else - dmu_ctx->buf_transfer_cb = dmu_buf_set_nofill; - dmu_ctx->buf_transfer_cb = dmu_buf_set_read; - dmu_ctx->buf_transfer_cb = reader ? 
dmu_buf_set_read : - dmu_buf_set_write_tx; + dmu_ctx->buf_transfer_cb = dmu_buf_transfer_nofill; if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { #ifdef UIO_XUIO uio_t *uio = (uio_t *)dmu_ctx->data_buf; @@ -1348,34 +1371,6 @@ /* Initialize including a refcount for the initiator. */ refcount_init(&dmu_ctx->holds, 1); -} - -/** - * \brief Set up a DMU context. - * - * \param os Object set to associate with the DMU context. - * \param object Object ID to associate with the DMU context. - * - * \note See dmu_context_init_dnode about the other parameters. - * \note This function wraps dmu_context_init_dnode, and its - * purpose is to hide the dnode from the caller; it - * also makes it possible to avoid lookups. - * - * \retval errno Could not hold the dnode. - * \retval 0 Success. - */ -int -dmu_context_init(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, - uint64_t offset, uint64_t size, void *data_buf, void *tag, uint32_t flags) -{ - dnode_t *dn = NULL; - int err; - - err = dnode_hold(os, object, tag, &dn); - if (err) - return (err); - - dmu_context_init_dnode(dmu_ctx, dn, offset, size, data_buf, tag, flags); return (0); } @@ -1432,8 +1427,8 @@ int err; dmu_context_t dmu_ctx; - err = dmu_context_init(&dmu_ctx, os, object, offset, size, data_buf, - FTAG, flags|DMU_CTX_FLAG_READ); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, offset, + size, data_buf, FTAG, flags|DMU_CTX_FLAG_READ); if (err) return (err); @@ -1449,9 +1444,11 @@ { void *data_bufp = (void *)(uintptr_t)data_buf; dmu_context_t dmu_ctx; + int err; - dmu_context_init(&dmu_ctx, os, object, offset, size, data_bufp, - FTAG, /*flags*/0); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, offset, + size, data_bufp, FTAG, /*flags*/0); + VERIFY(err == 0); dmu_context_set_dmu_tx(&dmu_ctx, tx); (void) dmu_issue(&dmu_ctx); @@ -1469,8 +1466,8 @@ if (size == 0) return (0); - err = dmu_context_init(&dmu_ctx, os, object, offset, size, - /*data_buf*/NULL, FTAG, flags); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, offset, + size, /*data_buf*/NULL, FTAG, flags); if (err) return (err); @@ -1612,8 +1609,8 @@ uint32_t dmu_flags = DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO; int err; - err = dmu_context_init(&dmu_ctx, os, object, uio->uio_loffset, size, - uio, FTAG, dmu_flags); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, + uio->uio_loffset, size, uio, FTAG, dmu_flags); if (err) return (err); @@ -1637,11 +1634,13 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - dmu_context_init_dnode(&dmu_ctx, dn, uio->uio_loffset, size, - uio, FTAG, flags); - dmu_context_set_dmu_tx(&dmu_ctx, tx); - err = dmu_issue(&dmu_ctx); - dmu_context_rele(&dmu_ctx); + err = dmu_context_init(&dmu_ctx, dn, dn->dn_objset, dn->dn_object, + uio->uio_loffset, size, uio, FTAG, flags); + if (err == 0) { + dmu_context_set_dmu_tx(&dmu_ctx, tx); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); + } DB_DNODE_EXIT(db); return (err); @@ -1658,8 +1657,8 @@ if (size == 0) return (0); - err = dmu_context_init(&dmu_ctx, os, object, uio->uio_loffset, size, - uio, FTAG, dmu_flags); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, + uio->uio_loffset, size, uio, FTAG, dmu_flags); if (err) return (err); @@ -1681,8 +1680,8 @@ if (size == 0) return (0); - err = dmu_context_init(&dmu_ctx, os, object, offset, size, pp, - FTAG, dmu_flags); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, offset, + size, pp, FTAG, dmu_flags); if (err) return (err); ==== 
//depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c#11 (text) ==== @@ -1062,11 +1062,9 @@ * -# A specific txg. Use this if you need to ensure that multiple * transactions all sync in the same txg. Like TXG_NOWAIT, it * returns ERESTART if it can't assign you into the requested txg. - * - * If cb is not NULL, it will be called whenever a wait occurs. */ int -dmu_tx_assign_cb(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_assign_t cb, void *data) +dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) { int err; @@ -1075,19 +1073,12 @@ ASSERT(!dsl_pool_sync_context(tx->tx_pool)); while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { - printf("dmu_tx_try_assign failed for tx %p\n", tx); dmu_tx_unassign(tx); if (err != ERESTART || txg_how != TXG_WAIT) return (err); dmu_tx_wait(tx); - - if (cb) { - err = cb(tx, data); - if (err) - return (err); - } } txg_rele_to_quiesce(&tx->tx_txgh); @@ -1095,12 +1086,6 @@ return (0); } -int -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) -{ - return (dmu_tx_assign_cb(tx, txg_how, NULL, NULL)); -} - void dmu_tx_wait(dmu_tx_t *tx) { ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#28 (text) ==== @@ -243,6 +243,8 @@ /** The dnode held in association with this context. */ struct dnode *dn; + objset_t *os; /**< Object set associated with the dnode. */ + uint64_t object; /**< Object ID associated with the dnode. */ /** Number of buffer sets left to complete. */ int holds; @@ -327,13 +329,14 @@ } dmu_buf_set_t; void dmu_buf_set_rele(dmu_buf_set_t *buf_set, boolean_t err); -int dmu_context_init(dmu_context_t *dmu_ctx, objset_t *os, uint64_t object, - uint64_t offset, uint64_t size, void *data_buf, void *tag, uint32_t flags); +int dmu_context_init(dmu_context_t *dmu_ctx, struct dnode *dn, objset_t *os, + uint64_t object, uint64_t offset, uint64_t size, void *data_buf, void *tag, + uint32_t flags); void dmu_context_seek(dmu_context_t *dmu_ctx, uint64_t offset, uint64_t size, void *data_buf); void dmu_context_rele(dmu_context_t *dmu_ctx); -void dmu_buf_set_write_no_tx(dmu_buf_set_t *buf_set, dmu_buf_t *buf, - uint64_t off, uint64_t sz); +void dmu_buf_set_transfer(dmu_buf_set_t *buf_set); +void dmu_buf_set_transfer_write(dmu_buf_set_t *buf_set); /* Optional context setters; use after calling dmu_context_init*(). */ static inline void @@ -357,7 +360,7 @@ dmu_context_set_dmu_tx(dmu_context_t *ctx, dmu_tx_t *tx) { ASSERT(tx != NULL && ((ctx->flags & DMU_CTX_FLAG_READ) == 0)); - dmu_context_set_buf_transfer_cb(ctx, dmu_buf_set_write_no_tx); + dmu_context_set_buf_set_transfer_cb(ctx, dmu_buf_set_transfer); ctx->tx = tx; } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h#5 (text) ==== @@ -103,14 +103,10 @@ void *dcb_data; /**< caller private data */ } dmu_tx_callback_t; -typedef int (*dmu_tx_assign_t)(dmu_tx_t *tx, void *data); - /* * These routines are defined in dmu.h, and are called by the user. 
*/ dmu_tx_t *dmu_tx_create(objset_t *dd); -int dmu_tx_assign_cb(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_assign_t cb, - void *data); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_abort(dmu_tx_t *tx); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#12 (text) ==== @@ -658,8 +658,9 @@ } #endif /* sun */ - error = dmu_context_init(&dmu_ctx, os, zp->z_id, uio->uio_loffset, n, - uio, FTAG, DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_PREFETCH); + error = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, zp->z_id, + uio->uio_loffset, n, uio, FTAG, + DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_PREFETCH); if (error) goto out; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#31 (text) ==== @@ -1290,25 +1290,21 @@ rl_t *rl; } zvol_dmu_state_t; -#ifdef NOTYET -#else static void -zvol_dmu_write_buf(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, - uint64_t sz) +zvol_dmu_buf_set_transfer_write(dmu_buf_set_t *buf_set) { zvol_dmu_state_t *zds = (zvol_dmu_state_t *)buf_set->dmu_ctx; zvol_state_t *zv = zds->zv; boolean_t sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); dmu_tx_t *tx = DMU_BUF_SET_TX(buf_set); - dmu_buf_set_write_no_tx(buf_set, db, off, sz); + dmu_buf_set_transfer_write(buf_set); /* Log this write. */ if ((zv->zv_flags & ZVOL_WCE) == 0 || sync) zvol_log_write(zv, tx, buf_set->offset, buf_set->size, sync); dmu_tx_commit(tx); } -#endif static void zvol_dmu_done(dmu_context_t *dmu_ctx) @@ -1318,11 +1314,8 @@ boolean_t sync_always = zds->zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; int err; - if (!reader) { - dmu_tx_commit(zds->dmu_ctx.tx); - if (sync_always) - zil_commit(zds->zv->zv_zilog, ZVOL_OBJ); - } + if (!reader && sync_always) + zil_commit(zds->zv->zv_zilog, ZVOL_OBJ); if (dmu_ctx->completed_size < dmu_ctx->length) { if (dmu_ctx->offset > zds->zv->zv_volsize) err = EINVAL; @@ -1367,9 +1360,6 @@ off = uio->uio_loffset; data_buf = uio; iov = uio->uio_iov; - dprintf("%s: uio iov=%p iovcnt=%d base %p len %lu\n", - __func__, iov, uio->uio_iovcnt, iov->iov_base, - iov->iov_len); } else #endif { @@ -1391,25 +1381,14 @@ zds->bp = bp; /* Set up the DMU context & range lock. */ - error = dmu_context_init(&zds->dmu_ctx, zv->zv_objset, ZVOL_OBJ, - off, io_size, data_buf, FTAG, dmu_flags); + error = dmu_context_init(&zds->dmu_ctx, /*dnode*/NULL, zv->zv_objset, + ZVOL_OBJ, off, io_size, data_buf, FTAG, dmu_flags); if (error) return (error); /* Override the writer case to log the writes. */ if (!reader) { -#ifdef NOTYET - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, io_size); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - kmem_free(zds, sizeof(zvol_dmu_state_t)); - return (error); - } - dmu_context_set_dmu_tx(&zds->dmu_ctx, tx); -#else - dmu_context_set_buf_transfer_cb(&zds->dmu_ctx, - zvol_dmu_write_buf); -#endif + dmu_context_set_buf_set_transfer_cb(&zds->dmu_ctx, + zvol_dmu_buf_set_transfer_write); } dmu_context_set_context_cb(&zds->dmu_ctx, zvol_dmu_done); zds->rl = zfs_range_lock(&zv->zv_znode, off, io_size, Change 526236 by willa@willa_repo on 2012/02/10 17:55:31 Another iteration of review cleanup & bug fixes. sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - Remove an assert that was never valid; Ken ran into this with an xdd session (with ZFS_DEBUG turned on). It's sufficiently rare that the assertion was invalid that it took until now to fire. 
Upon review, Justin & I concluded that it makes no sense to assert anything in particular about the dbuf's dirty records here.

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c:
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h:
- Rename some dmu_buf_set_t/dmu_context_t elements to have more descriptive names, e.g. dn_start instead of offset, dbp_length instead of blocks_allocated, etc.
- Explain better why we do a dnode_hold() inside dmu_issue() for writers that provided a tx but not a dnode.
- Get rid of dmu_buf_transfer_read, replacing it with simply dmu_ctx->move_cb, since calling move_cb was all it did; change the order of initialization in dmu_context_init() to accommodate.

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c:
- Refactor zvol_dmu_common further. Require callers to pass in a zvol_dmu_state_t. In this way, they can place it on the stack or heap as they desire. Callers will need to populate it first.
- Split zvol_dmu_common() into zvol_dmu_context_init() and zvol_dmu_issue(). Callers will provide most of the context needed to set up the DMU context, then call issue.
- Change zvol_freebsd_read(), zvol_common_strategy(), etc. so that they simply pass in the appropriate size, offset, and data pointers, rather than having zvol_dmu_common() decode them.
- Use the opportunity to refactor zvol_common_strategy() while I'm at it. Create a wrapper for zvol_dmu_state_t for strategy calls that contains things known only to it, such as the struct bio, and a delivery callback. Pass in a wrapper for the zvol_dmu_state_t done callback that handles these.
- Change zvol_strategy() and zvol_geom_strategy() so they pass in the appropriate delivery callback.
- The common ZVOL DMU interface no longer needs to distinguish between async and sync callers, or between UIO and struct bio callers, since the done callback is now different for those cases.
- Sync callers now also avoid the overhead of malloc/free since they can now simply place their zvol_dmu_state_t on the stack.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#65 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#55 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#29 edit
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#32 edit

Differences ...

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#65 (text) ====
@@ -794,12 +794,6 @@ /* Any reads or writes must have a hold on this dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); - /* - * Only level 0 blocks can be dirtied before being read - * (i.e. entering the DB_CACHED state). - */ - ASSERT(db->db_level == 0 || db->db_dirtycnt == 0); - if (zio == NULL || zio->io_error == 0) { /* Read succeeded.
*/ dbuf_read_complete(db, buf); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#55 (text) ==== @@ -694,7 +694,7 @@ uint64_t sz) { char *data = (char *)buf_set->dmu_ctx->data_buf + db->db_offset - - buf_set->dmu_ctx->start + off; + buf_set->dmu_ctx->dn_start + off; dprintf("%s(set=%p, db=%p, off=%lu, sz=%lu) db_data=%p data=%p\n", __func__, buf_set, db, off, sz, db->db_data + off, data); bcopy((char *)db->db_data + off, data, sz); @@ -704,7 +704,7 @@ uint64_t sz) { char *data = (char *)buf_set->dmu_ctx->data_buf + db->db_offset - - buf_set->dmu_ctx->start + off; + buf_set->dmu_ctx->dn_start + off; dprintf("%s(set=%p, db=%p, off=%lu, sz=%lu) data=%p db_data=%p\n", __func__, buf_set, db, off, sz, data, db->db_data + off); bcopy(data, (char *)db->db_data + off, sz); @@ -756,13 +756,6 @@ dmu_buf_fill_done(db, tx); } -static void -dmu_buf_transfer_read(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, - uint64_t sz) -{ - buf_set->dmu_ctx->move_cb(buf_set, db, off, sz); -} - void dmu_buf_set_transfer(dmu_buf_set_t *buf_set) { @@ -773,7 +766,7 @@ /* Initialize the current state. */ size = buf_set->size; - offset = buf_set->offset; + offset = buf_set->dn_start; /* Perform the I/O copy, one buffer at a time. */ for (i = 0; i < buf_set->count; i++) { @@ -866,7 +859,7 @@ #endif kmem_free(buf_set, sizeof(dmu_buf_set_t) + - buf_set->blocks_allocated * sizeof(dmu_buf_t *)); + buf_set->dbp_length * sizeof(dmu_buf_t *)); dmu_context_rele(dmu_ctx); } @@ -986,7 +979,7 @@ buf_set->size > zfetch_array_rd_sz) dbuf_flags |= DB_RF_NOPREFETCH; - blkid = dbuf_whichblock(dn, dmu_ctx->offset); + blkid = dbuf_whichblock(dn, dmu_ctx->dn_offset); /* * Note that while this loop is running, any zio's set up for async * reads are not executing, therefore access to this buf_set is @@ -1014,13 +1007,13 @@ #endif /* Calculate the amount of data this buffer contributes. */ - ASSERT(dmu_ctx->offset >= db->db.db_offset); - bufoff = dmu_ctx->offset - db->db.db_offset; + ASSERT(dmu_ctx->dn_offset >= db->db.db_offset); + bufoff = dmu_ctx->dn_offset - db->db.db_offset; bufsiz = (int)MIN(db->db.db_size - bufoff, buf_set->resid); buf_set->resid -= bufsiz; /* Update the caller's data to let them know what's next. */ - dmu_ctx->offset += bufsiz; - dmu_ctx->size -= bufsiz; + dmu_ctx->dn_offset += bufsiz; + dmu_ctx->resid -= bufsiz; /* Put this dbuf in the buffer set's list. */ buf_set->dbp[i] = &db->db; } @@ -1040,8 +1033,12 @@ { int err; + /* Readers and writers with a context transaction do not apply. */ + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) || dmu_ctx->tx != NULL) + return (0); + *txp = dmu_tx_create(dmu_ctx->os); - dmu_tx_hold_write(*txp, dmu_ctx->object, dmu_ctx->offset, size); + dmu_tx_hold_write(*txp, dmu_ctx->object, dmu_ctx->dn_offset, size); err = dmu_tx_assign(*txp, TXG_WAIT); if (err) goto out; @@ -1092,28 +1089,25 @@ * first in order to hold the correct struct_rwlock, use the * correct values for dn_datablksz, etc. */ - if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0 && dmu_ctx->tx == NULL) { - err = dmu_context_setup_tx(dmu_ctx, &tx, &dn, size); - if (err) - return (err); - } + err = dmu_context_setup_tx(dmu_ctx, &tx, &dn, size); + if (err) + return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); /* Figure out how many blocks are needed for the requested size. 
*/ if (dn->dn_datablkshift) { - ASSERT3U(dn->dn_datablksz, ==, 1 << dn->dn_datablkshift); - nblks = P2ROUNDUP(dmu_ctx->offset + size, dn->dn_datablksz); - nblks -= P2ALIGN(dmu_ctx->offset, dn->dn_datablksz); + nblks = P2ROUNDUP(dmu_ctx->dn_offset + size, dn->dn_datablksz); + nblks -= P2ALIGN(dmu_ctx->dn_offset, dn->dn_datablksz); nblks >>= dn->dn_datablkshift; } else { - if ((dmu_ctx->offset + size) > dn->dn_datablksz) { + if ((dmu_ctx->dn_offset + size) > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, - (longlong_t)dmu_ctx->offset, + (longlong_t)dmu_ctx->dn_offset, (longlong_t)size); err = EIO; goto out; @@ -1124,7 +1118,6 @@ /* Create the new buffer set. */ set_size = sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *); buf_set = kmem_zalloc(set_size, KM_SLEEP); - refcount_acquire(&dmu_ctx->holds); /* Initialize a new buffer set. */ #ifdef ZFS_DEBUG @@ -1133,9 +1126,9 @@ #endif buf_set->size = size; buf_set->resid = size; - buf_set->offset = dmu_ctx->offset; + buf_set->dn_start = dmu_ctx->dn_offset; buf_set->count = nblks; - buf_set->blocks_allocated = nblks; + buf_set->dbp_length = nblks; buf_set->tx = tx; /* Include a refcount for the initiator. */ @@ -1145,13 +1138,13 @@ /* For writes, dbufs never need to call us back. */ refcount_init(&buf_set->holds, 1); buf_set->dmu_ctx = dmu_ctx; + refcount_acquire(&dmu_ctx->holds); /* Either we're a reader or we have a transaction somewhere. */ ASSERT((dmu_ctx->flags & DMU_CTX_FLAG_READ) || DMU_BUF_SET_TX(buf_set)); buf_set->zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); *buf_set_p = buf_set; - /* Set up the buffers. */ err = dmu_buf_set_setup_buffers(buf_set); out: @@ -1240,7 +1233,11 @@ ASSERT((dmu_ctx->flags & DMU_CTX_FLAG_ASYNC) == 0 || dmu_ctx->context_cb != NULL); - /* For writers, if a tx was specified but a dnode wasn't, hold here. */ + /* + * For writers, if a tx was specified but a dnode wasn't, hold here. + * This could be done in dmu_context_set_dmu_tx(), but that would + * require dmu.h to include a dnode_hold() prototype. + */ if (dmu_ctx->tx != NULL && dmu_ctx->dn == NULL) { err = dnode_hold(dmu_ctx->os, dmu_ctx->object, dmu_ctx->tag, &dmu_ctx->dn); @@ -1250,14 +1247,12 @@ /* While there is work left to do, execute the next chunk. */ dprintf("%s(%p) -> buf %p off %lu sz %lu\n", __func__, dmu_ctx, - dmu_ctx->data_buf, dmu_ctx->offset, dmu_ctx->size); - while (dmu_ctx->size > 0 && err == 0) { - /* Determine this chunk's size. */ - io_size = MIN(dmu_ctx->size, DMU_MAX_ACCESS/2); + dmu_ctx->data_buf, dmu_ctx->dn_offset, dmu_ctx->resid); + while (dmu_ctx->resid > 0 && err == 0) { + io_size = MIN(dmu_ctx->resid, DMU_MAX_ACCESS/2); - /* Initialize the buffer set for this chunk. */ dprintf("%s(%p@%lu+%lu) chunk %lu\n", __func__, dmu_ctx, - dmu_ctx->offset, dmu_ctx->size, io_size); + dmu_ctx->dn_offset, dmu_ctx->resid, io_size); err = dmu_buf_set_init(dmu_ctx, &buf_set, io_size); /* Process the I/O requests, if the initialization passed. */ @@ -1336,19 +1331,12 @@ dmu_ctx->dn = dn; dmu_ctx->os = os; dmu_ctx->object = object; - dmu_ctx->length = size; + dmu_ctx->size = size; dmu_context_seek(dmu_ctx, offset, size, data_buf); dmu_ctx->tag = tag; dmu_ctx->flags = flags; /* Initialize default I/O callbacks. */ - dmu_ctx->buf_set_transfer_cb = reader ? 
dmu_buf_set_transfer : - dmu_buf_set_transfer_write_tx; - if ((dmu_ctx->flags & DMU_CTX_FLAG_NOFILL) == 0) { - dmu_ctx->buf_transfer_cb = reader ? dmu_buf_transfer_read : - dmu_buf_transfer_write; - } else - dmu_ctx->buf_transfer_cb = dmu_buf_transfer_nofill; if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { #ifdef UIO_XUIO uio_t *uio = (uio_t *)dmu_ctx->data_buf; @@ -1368,6 +1356,13 @@ dmu_ctx->move_cb = reader ? dmu_buf_read_char : dmu_buf_write_char; } + dmu_ctx->buf_set_transfer_cb = reader ? dmu_buf_set_transfer : + dmu_buf_set_transfer_write_tx; + if ((dmu_ctx->flags & DMU_CTX_FLAG_NOFILL) == 0) { + dmu_ctx->buf_transfer_cb = reader ? dmu_ctx->move_cb : + dmu_buf_transfer_write; + } else + dmu_ctx->buf_transfer_cb = dmu_buf_transfer_nofill; /* Initialize including a refcount for the initiator. */ refcount_init(&dmu_ctx->holds, 1); @@ -1414,9 +1409,9 @@ bzero((char *)data_buf + newsz, size - newsz); size = newsz; } - dmu_ctx->offset = offset; - dmu_ctx->start = offset; - dmu_ctx->size = size; + dmu_ctx->dn_offset = offset; + dmu_ctx->dn_start = offset; + dmu_ctx->resid = size; dmu_ctx->data_buf = data_buf; } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#29 (text) ==== @@ -234,11 +234,11 @@ typedef struct dmu_context { /** The primary data associated with this context. */ - uint64_t length; /**< Requested total I/O length. */ - uint64_t size; /**< Remaining bytes to process. */ - uint64_t start; /**< Starting block offset. */ - uint64_t offset; /**< Current block offset. */ - dmu_tx_t *tx; /**< For writes only */ + uint64_t size; /**< Requested total I/O size. */ + uint64_t resid; /**< Remaining bytes to process. */ + uint64_t dn_start; /**< Starting block offset into the dnode. */ + uint64_t dn_offset; /**< Current block offset. */ + dmu_tx_t *tx; /**< Caller's transaction, if specified. */ void *data_buf; /**< UIO or char pointer */ /** The dnode held in association with this context. */ @@ -267,10 +267,10 @@ */ dmu_buf_transfer_callback_t move_cb; - /** Completed size. */ + /** Total number of bytes transferred. */ uint64_t completed_size; - /** Flags for this block. */ + /** Flags for this DMU context. */ uint32_t flags; #define DMU_CTX_FLAG_READ (1 << 1) #define DMU_CTX_FLAG_UIO (1 << 2) @@ -296,17 +296,17 @@ /** The DMU context that this buffer set is associated with. */ dmu_context_t *dmu_ctx; - /** Number of buffers associated with this context. */ + /** Number of dmu_bufs associated with this context. */ int count; - /** Number of buffers space has been allocated for. */ - int blocks_allocated; + /** Length of dbp; only used to free the correct size. */ + int dbp_length; - /** Number of buffers left to complete. */ + /** Number of dmu_bufs left to complete. */ int holds; - /* The offset (into the data buffer) of this array of buffers. */ - uint64_t offset; + /** The starting offset, relative to the associated dnode. */ + uint64_t dn_start; /** The size of the I/O. */ uint64_t size; /** The amount of data remaining to process for this buffer set. */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#32 (text) ==== @@ -1286,7 +1286,6 @@ */ dmu_context_t dmu_ctx; zvol_state_t *zv; - struct bio *bp; rl_t *rl; } zvol_dmu_state_t; @@ -1302,7 +1301,7 @@ /* Log this write. 
*/ if ((zv->zv_flags & ZVOL_WCE) == 0 || sync) - zvol_log_write(zv, tx, buf_set->offset, buf_set->size, sync); + zvol_log_write(zv, tx, buf_set->dn_start, buf_set->size, sync); dmu_tx_commit(tx); } @@ -1310,161 +1309,150 @@ zvol_dmu_done(dmu_context_t *dmu_ctx) { zvol_dmu_state_t *zds = (zvol_dmu_state_t *)dmu_ctx; - boolean_t reader = (dmu_ctx->flags & DMU_CTX_FLAG_READ); boolean_t sync_always = zds->zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; int err; - if (!reader && sync_always) + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0 && sync_always) zil_commit(zds->zv->zv_zilog, ZVOL_OBJ); - if (dmu_ctx->completed_size < dmu_ctx->length) { - if (dmu_ctx->offset > zds->zv->zv_volsize) + if (dmu_ctx->completed_size < dmu_ctx->size) { + if (dmu_ctx->dn_offset > zds->zv->zv_volsize) err = EINVAL; } else err = (dmu_ctx->err == 0) ? 0 : EIO; - /* Notify synchronous callers of the final errno. */ dmu_ctx->err = err; zfs_range_unlock(zds->rl); - - /* Finally, deliver to the caller. UIOs are already done. */ - if ((dmu_ctx->flags & DMU_CTX_FLAG_UIO) == 0) { - ASSERT(zds->bp != NULL); - zds->bp->bio_error = err; - zds->bp->bio_completed = dmu_ctx->completed_size; - if (zds->bp->bio_to != NULL) - g_io_deliver(zds->bp, 0); - else - zds->bp->bio_done(zds->bp); - } - if (dmu_ctx->flags & DMU_CTX_FLAG_ASYNC) - kmem_free(zds, sizeof(zvol_dmu_state_t)); } -int -zvol_dmu_common(zvol_state_t *zv, void *data, uint32_t dmu_flags) +static int +zvol_dmu_context_init(zvol_dmu_state_t *zds, void *data, uint64_t off, + uint64_t io_size, uint32_t dmu_flags, dmu_context_callback_t done_cb) { - zvol_dmu_state_t *zds; + zvol_state_t *zv = zds->zv; + boolean_t reader = (dmu_flags & DMU_CTX_FLAG_READ) != 0; int error; - uint64_t io_size, off; - void *data_buf; - boolean_t reader = (dmu_flags & DMU_CTX_FLAG_READ) != 0; - struct bio *bp = NULL; - dmu_tx_t *tx; -#ifdef _KERNEL - if (dmu_flags & DMU_CTX_FLAG_UIO) - { - uio_t *uio = (uio_t *)data; - io_size = uio->uio_resid; - struct iovec *iov; - off = uio->uio_loffset; - data_buf = uio; - iov = uio->uio_iov; - } else -#endif - { - bp = (struct bio *)data; - io_size = bp->bio_length; - off = bp->bio_offset; - data_buf = bp->bio_data; - } - - /* Don't allow I/Os past the end of the volume. */ + /* Truncate I/Os to the end of the volume, if needed. */ if (io_size > zv->zv_volsize - off) io_size = zv->zv_volsize - off; if (reader) dmu_flags |= DMU_CTX_FLAG_PREFETCH; - zds = kmem_zalloc(sizeof(zvol_dmu_state_t), KM_SLEEP); - zds->zv = zv; - zds->bp = bp; - - /* Set up the DMU context & range lock. */ error = dmu_context_init(&zds->dmu_ctx, /*dnode*/NULL, zv->zv_objset, - ZVOL_OBJ, off, io_size, data_buf, FTAG, dmu_flags); + ZVOL_OBJ, off, io_size, data, FTAG, dmu_flags); if (error) return (error); /* Override the writer case to log the writes. */ - if (!reader) { + if (!reader) dmu_context_set_buf_set_transfer_cb(&zds->dmu_ctx, zvol_dmu_buf_set_transfer_write); - } - dmu_context_set_context_cb(&zds->dmu_ctx, zvol_dmu_done); - zds->rl = zfs_range_lock(&zv->zv_znode, off, io_size, + dmu_context_set_context_cb(&zds->dmu_ctx, done_cb); + zds->rl = zfs_range_lock(&zds->zv->zv_znode, off, io_size, reader ? RL_READER : RL_WRITER); - /* Issue the DMU I/O and release the initiator hold. */ - dmu_issue(&zds->dmu_ctx); - /* Either this was an async issue, or we're the last holder. 
*/ - ASSERT((dmu_flags & DMU_CTX_FLAG_ASYNC) || zds->dmu_ctx.holds == 1); + return (error); +} + +static void +zvol_dmu_issue(zvol_dmu_state_t *zds) +{ + int error; + + error = dmu_issue(&zds->dmu_ctx); + if (error) + zds->dmu_ctx.err++; dmu_context_rele(&zds->dmu_ctx); - /* In the async case, zvol_dmu_done frees us. */ - if ((dmu_flags & DMU_CTX_FLAG_ASYNC) == 0) { - /* For the synchronous case, save the error to pass it up. */ - error = zds->dmu_ctx.err; - kmem_free(zds, sizeof(zvol_dmu_state_t)); - } - return (error); +} + +typedef void (*zvol_strategy_deliver_cb)(struct bio *bp, int err); + +static void +zvol_strategy_bio_deliver(struct bio *bp, int err) +{ + bp->bio_error = err; + bp->bio_done(bp); +} + +/** + * Use another layer on top of zvol_dmu_state_t to provide additional + * context specific to zvol_common_strategy(), namely, the bio and the done + * callback, which calls zvol_dmu_done, as is done for zvol_dmu_state_t. + */ +typedef struct zvol_strategy_state { + zvol_dmu_state_t zds; + struct bio *bp; + zvol_strategy_deliver_cb deliver_cb; +} zvol_strategy_state_t; + +static void +zvol_strategy_dmu_done(dmu_context_t *dmu_ctx) +{ + zvol_strategy_state_t *zss = (zvol_strategy_state_t *)dmu_ctx; + + zvol_dmu_done(dmu_ctx); + zss->bp->bio_completed = dmu_ctx->completed_size; + zss->deliver_cb(zss->bp, dmu_ctx->err); + kmem_free(zss, sizeof(zvol_strategy_state_t)); } static void -zvol_common_strategy(struct bio *bp, int geom_mode) +zvol_common_strategy(struct bio *bp, zvol_state_t *zv, + zvol_strategy_deliver_cb deliver_cb) { - zvol_state_t *zv; + zvol_strategy_state_t *zss; int error = 0; uint32_t dmu_flags = DMU_CTX_FLAG_ASYNC; - /* XXX KDM may be able to consolidate this into the non-GEOM case. */ - if (geom_mode != 0) - zv = bp->bio_to->private; - else { - zv = bp->bio_dev->si_drv1; - bp->bio_to = NULL; - } - -#define ZVOL_STRATEGY_DELIVER(_err) \ - if (geom_mode != 0) \ - g_io_deliver(bp, _err); \ - else { \ - bp->bio_error = _err; \ - bp->bio_done(bp); \ - } - if (zv == NULL) { - ZVOL_STRATEGY_DELIVER(ENXIO); + deliver_cb(bp, ENXIO); return; } if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) { - ZVOL_STRATEGY_DELIVER(EROFS); + deliver_cb(bp, EROFS); return; } ASSERT(zv->zv_objset != NULL); - if (bp->bio_length > 0 && (bp->bio_offset < 0 || bp->bio_offset >= zv->zv_volsize)) { - ZVOL_STRATEGY_DELIVER(EIO); + deliver_cb(bp, EIO); return; } -#undef ZVOL_STRATEGY_DELIVER if (bp->bio_cmd == BIO_READ) dmu_flags |= DMU_CTX_FLAG_READ; - zvol_dmu_common(zv, bp, dmu_flags); + + zss = kmem_zalloc(sizeof(zvol_strategy_state_t), KM_SLEEP); + zss->bp = bp; + zss->deliver_cb = deliver_cb; + zss->zds.zv = zv; + + error = zvol_dmu_context_init(&zss->zds, bp->bio_data, bp->bio_offset, + bp->bio_length, dmu_flags, zvol_strategy_dmu_done); + if (error) { + kmem_free(zss, sizeof(zvol_strategy_state_t)); + deliver_cb(bp, error); + return; + } + + /* Errors are reported via the callback. */ + zvol_dmu_issue(&zss->zds); } static void zvol_strategy(struct bio *bp) { - zvol_common_strategy(bp, /*geom_mode*/ 0); + zvol_state_t *zv = bp->bio_dev->si_drv1; + zvol_common_strategy(bp, zv, zvol_strategy_bio_deliver); } static void zvol_geom_strategy(struct bio *bp) { - zvol_common_strategy(bp, /*geom_mode*/ 1); + zvol_state_t *zv = bp->bio_to->private; + zvol_common_strategy(bp, zv, g_io_deliver); } #ifdef sun @@ -1517,28 +1505,45 @@ } #endif /* sun */ +static int +zvol_dmu_uio_init(zvol_dmu_state_t *zds, uio_t *uio, uint32_t dmu_flags) +{ + + /* Don't allow I/Os that are not within the volume. 
*/ + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset >= zds->zv->zv_volsize)) + return (EIO); + + return (zvol_dmu_context_init(zds, uio, uio->uio_loffset, + uio->uio_resid, dmu_flags, zvol_dmu_done)); +} + #if defined(__FreeBSD__) && defined(_KERNEL) int zvol_freebsd_read(struct cdev *dev, struct uio *uio, int ioflag) { - zvol_state_t *zv = (zvol_state_t *)dev->si_drv1; + zvol_dmu_state_t zds; + int err; - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset >= zv->zv_volsize)) - return (EIO); - - return (zvol_dmu_common(zv, uio, DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_READ)); + zds.zv = (zvol_state_t *)dev->si_drv1; + err = zvol_dmu_uio_init(&zds, uio, DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_READ); + if (err) + return (err); + zvol_dmu_issue(&zds); + return (zds.dmu_ctx.err); } int zvol_freebsd_write(struct cdev *dev, struct uio *uio, int ioflag) { - zvol_state_t *zv = (zvol_state_t *)dev->si_drv1; + zvol_dmu_state_t zds; + int err; - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset >= zv->zv_volsize)) - return (EIO); - - return (zvol_dmu_common(zv, uio, DMU_CTX_FLAG_UIO)); + zds.zv = (zvol_state_t *)dev->si_drv1; + err = zvol_dmu_uio_init(&zds, uio, DMU_CTX_FLAG_UIO); + if (err) + return (err); + zvol_dmu_issue(&zds); + return (zds.dmu_ctx.err); } #endif /* __FreeBSD__ && _KERNEL */ @@ -1548,21 +1553,22 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) { minor_t minor = getminor(dev); - zvol_state_t *zv; + zvol_dmu_state_t zds; + int err; - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) + zds.zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + if (zds.zv == NULL) return (ENXIO); - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset >= zv->zv_volsize)) - return (EIO); - - if (zv->zv_flags & ZVOL_DUMPIFIED) + if (zds.zv->zv_flags & ZVOL_DUMPIFIED) return (physio(zvol_strategy, NULL, dev, B_READ, zvol_minphys, uio)); - return (zvol_dmu_common(zv, uio, DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_READ)); + err = zvol_dmu_uio_init(&zds, uio, DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_READ); + if (err) + return (err); + + return (zvol_dmu_issue(&zds)); } /*ARGSUSED*/ @@ -1570,21 +1576,21 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) { minor_t minor = getminor(dev); - zvol_state_t *zv; + zvol_dmu_state_t zds; - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) + zds.zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + if (zds.zv == NULL) return (ENXIO); - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset >= zv->zv_volsize)) - return (EIO); - if (zv->zv_flags & ZVOL_DUMPIFIED) return (physio(zvol_strategy, NULL, dev, B_WRITE, zvol_minphys, uio)); - return (zvol_dmu_common(zv, uio, DMU_CTX_FLAG_UIO)); + err = zvol_dmu_uio_init(&zds, uio, DMU_CTX_FLAG_UIO); + if (err) + return (err); + + return (zvol_dmu_issue(&zds)); } int Change 526237 by willa@willa_repo on 2012/02/10 18:07:09 Refactor the zvol uio calls a bit further. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#33 edit Differences ... 
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#33 (text) ==== @@ -1506,16 +1506,31 @@ #endif /* sun */ static int -zvol_dmu_uio_init(zvol_dmu_state_t *zds, uio_t *uio, uint32_t dmu_flags) +zvol_dmu_uio_common(zvol_dmu_state_t *zds, uio_t *uio, uint32_t dmu_flags) { + int err; + boolean_t reader = (dmu_flags & DMU_CTX_FLAG_READ); + + if (zds->zv == NULL) + return (ENXIO); + +#ifdef sun + if (zds.zv->zv_flags & ZVOL_DUMPIFIED) + return (physio(zvol_strategy, NULL, dev, + reader ? B_READ : B_WRITE, zvol_minphys, uio)); +#endif /* Don't allow I/Os that are not within the volume. */ if (uio->uio_resid > 0 && (uio->uio_loffset < 0 || uio->uio_loffset >= zds->zv->zv_volsize)) return (EIO); - return (zvol_dmu_context_init(zds, uio, uio->uio_loffset, - uio->uio_resid, dmu_flags, zvol_dmu_done)); + err = zvol_dmu_context_init(zds, uio, uio->uio_loffset, + uio->uio_resid, dmu_flags|DMU_CTX_FLAG_UIO, zvol_dmu_done); + if (err) + return (err); + zvol_dmu_issue(zds); + return (zds->dmu_ctx.err); } #if defined(__FreeBSD__) && defined(_KERNEL) @@ -1523,27 +1538,17 @@ zvol_freebsd_read(struct cdev *dev, struct uio *uio, int ioflag) { zvol_dmu_state_t zds; - int err; zds.zv = (zvol_state_t *)dev->si_drv1; - err = zvol_dmu_uio_init(&zds, uio, DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_READ); - if (err) - return (err); - zvol_dmu_issue(&zds); - return (zds.dmu_ctx.err); + return (zvol_dmu_uio_common(&zds, uio, DMU_CTX_FLAG_READ)); } int zvol_freebsd_write(struct cdev *dev, struct uio *uio, int ioflag) { zvol_dmu_state_t zds; - int err; zds.zv = (zvol_state_t *)dev->si_drv1; - err = zvol_dmu_uio_init(&zds, uio, DMU_CTX_FLAG_UIO); - if (err) - return (err); - zvol_dmu_issue(&zds); - return (zds.dmu_ctx.err); + return (zvol_dmu_uio_common(&zds, uio, /*flags*/0)); } #endif /* __FreeBSD__ && _KERNEL */ @@ -1554,21 +1559,9 @@ { minor_t minor = getminor(dev); zvol_dmu_state_t zds; - int err; zds.zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zds.zv == NULL) - return (ENXIO); - - if (zds.zv->zv_flags & ZVOL_DUMPIFIED) - return (physio(zvol_strategy, NULL, dev, B_READ, - zvol_minphys, uio)); - - err = zvol_dmu_uio_init(&zds, uio, DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_READ); - if (err) - return (err); - - return (zvol_dmu_issue(&zds)); + return (zvol_dmu_uio_common(&zds, uio, DMU_CTX_FLAG_READ)); } /*ARGSUSED*/ @@ -1579,18 +1572,7 @@ zvol_dmu_state_t zds; zds.zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zds.zv == NULL) - return (ENXIO); - - if (zv->zv_flags & ZVOL_DUMPIFIED) - return (physio(zvol_strategy, NULL, dev, B_WRITE, - zvol_minphys, uio)); - - err = zvol_dmu_uio_init(&zds, uio, DMU_CTX_FLAG_UIO); - if (err) - return (err); - - return (zvol_dmu_issue(&zds)); + return (zvol_dmu_uio_common(&zds, uio, /*flags*/0)); } int Change 526439 by willa@willa_repo on 2012/02/13 13:19:56 Reduce duplication of code for zvol_*_{open,close}(). Change the comment about zvol recursion to a printf to make it more obvious to users. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#34 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#34 (text) ==== @@ -889,8 +889,8 @@ return (error); } -/* - * Assumptions: zv != NULL and spa_namespace_lock is held. +/** + * \invariant: spa_namespace_lock must be held. 
*/ static int zvol_common_open(zvol_state_t *zv, int flag, int count) @@ -899,6 +899,11 @@ err = 0; + if (zv == NULL) { + mutex_exit(&spa_namespace_lock); + return (ENXIO); + } + if (zv->zv_total_opens == 0) err = zvol_first_open(zv); if (err) { @@ -941,22 +946,12 @@ zvol_state_t *zv; if (MUTEX_HELD(&spa_namespace_lock)) { - /* - * If the spa_namespace_lock is being held, it means that ZFS - * is trying to open ZVOL as its VDEV. This is not supported. - */ + printf("ZFS: Using ZVOL as a vdev is not supported\n"); return (EOPNOTSUPP); } mutex_enter(&spa_namespace_lock); - - zv = pp->private; - if (zv == NULL) { - mutex_exit(&spa_namespace_lock); - return (ENXIO); - } - - return (zvol_common_open(zv, flag, count)); + return (zvol_common_open(pp->private, flag, count)); } static int @@ -965,28 +960,23 @@ zvol_state_t *zv; if (MUTEX_HELD(&spa_namespace_lock)) { - /* - * If the spa_namespace_lock is being held, it means that ZFS - * is trying to open ZVOL as its VDEV. This is not supported. - */ + printf("ZFS: Using ZVOL as a vdev is not supported\n"); return (EOPNOTSUPP); } mutex_enter(&spa_namespace_lock); + return (zvol_common_open(dev->si_drv1, flags, /*count*/ 1)); +} - zv = (zvol_state_t *)dev->si_drv1; +static int +zvol_common_close(zvol_state_t *zv, int count) +{ + if (zv == NULL) { mutex_exit(&spa_namespace_lock); return (ENXIO); } - return (zvol_common_open(zv, flags, /*count*/ 1)); -} - -static int -zvol_common_close(zvol_state_t *zv, int count) -{ - if (zv->zv_flags & ZVOL_EXCL) { ASSERT(zv->zv_total_opens == 1); zv->zv_flags &= ~ZVOL_EXCL; @@ -1016,34 +1006,17 @@ static int zvol_geom_close(struct g_provider *pp, int flag, int count) { - zvol_state_t *zv; mutex_enter(&spa_namespace_lock); - - zv = pp->private; - if (zv == NULL) { - mutex_exit(&spa_namespace_lock); - /* XXX KDM should we just return 0 instead? */ - return (ENXIO); - } - return (zvol_common_close(zv, count)); + return (zvol_common_close(pp->private, count)); } static int zvol_close(struct cdev *dev, int flags, int fmt, struct thread *td) { - zvol_state_t *zv; mutex_enter(&spa_namespace_lock); - - zv = (zvol_state_t *)dev->si_drv1; - if (zv == NULL) { - mutex_exit(&spa_namespace_lock); - /* XXX KDM should we just return 0 instead? */ - return (ENXIO); - } - - return (zvol_common_close(zv, /*count*/ 1)); + return (zvol_common_close(dev->si_drv1, /*count*/ 1)); } static void Change 526699 by willa@willa_repo on 2012/02/13 15:27:01 Change the include to . This makes userland consumers build cleanly. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#30 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#30 (text) ==== @@ -41,7 +41,7 @@ * dmu_spa.h. */ -#include +#include #include #include #include Change 526700 by willa@willa_repo on 2012/02/13 15:28:39 Change argument naming for taskqueue_enqueue_locked(). Using 'task' is too generic and confusing given that it is also the name of the struct in the STAILQ, as used here. Use the name 'queued_task' so it is obvious which is being referenced. Affected files ... ... //depot/branches/redline/projects/cow/sys/kern/subr_taskqueue.c#5 edit Differences ... 
==== //depot/branches/redline/projects/cow/sys/kern/subr_taskqueue.c#5 (text) ==== @@ -188,7 +188,7 @@ } static int -taskqueue_enqueue_locked(struct taskqueue *queue, struct task *task) +taskqueue_enqueue_locked(struct taskqueue *queue, struct task *queued_task) { struct task *ins; struct task *prev; @@ -196,9 +196,9 @@ /* * Count multiple enqueues. */ - if (task->ta_pending) { - if (task->ta_pending < USHRT_MAX) - task->ta_pending++; + if (queued_task->ta_pending) { + if (queued_task->ta_pending < USHRT_MAX) + queued_task->ta_pending++; return (0); } @@ -206,22 +206,24 @@ * Optimise the case when all tasks have the same priority. */ prev = STAILQ_LAST(&queue->tq_queue, task, ta_link); - if (!prev || prev->ta_priority >= task->ta_priority) { - STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link); + if (!prev || prev->ta_priority >= queued_task->ta_priority) { + STAILQ_INSERT_TAIL(&queue->tq_queue, queued_task, ta_link); } else { prev = NULL; for (ins = STAILQ_FIRST(&queue->tq_queue); ins; prev = ins, ins = STAILQ_NEXT(ins, ta_link)) - if (ins->ta_priority < task->ta_priority) + if (ins->ta_priority < queued_task->ta_priority) break; if (prev) - STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link); + STAILQ_INSERT_AFTER(&queue->tq_queue, prev, queued_task, + ta_link); else - STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link); + STAILQ_INSERT_HEAD(&queue->tq_queue, queued_task, + ta_link); } - task->ta_pending = 1; + queued_task->ta_pending = 1; if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0) queue->tq_enqueue(queue->tq_context); else Change 526701 by willa@willa_repo on 2012/02/13 15:37:52 ifdef ZFS_DEBUG refcounting for write ranges in flight. Change the type to int for refcounts, so it can use the refcount_{release,acquire} API used in dmu.c. This also makes the userland build better, since it doesn't have a definition for atomic_subtract_64. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#66 edit Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#66 (text) ==== @@ -75,12 +75,11 @@ * \brief dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; -int64_t dirty_ranges_in_flight; SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS DBUF"); -#define SYSCTL_COUNTER_I(name, desc) \ - int64_t name; \ +#define SYSCTL_REFCOUNT(name, desc) \ + int name; \ SYSCTL_QUAD(_vfs_zfs_dbuf, OID_AUTO, name, CTLFLAG_RD, \ &name, 0, desc) #define SYSCTL_COUNTER_U(name, desc) \ @@ -88,9 +87,10 @@ SYSCTL_QUAD(_vfs_zfs_dbuf, OID_AUTO, name, CTLFLAG_RD, \ &name, 0, desc) -SYSCTL_COUNTER_I(dirty_ranges_in_flight, "number of dirty ranges in flight"); - +#ifdef ZFS_DEBUG +SYSCTL_REFCOUNT(dirty_ranges_in_flight, "number of dirty ranges in flight"); SYSCTL_COUNTER_U(dirty_ranges_total, "number of total dirty ranges"); +#endif SYSCTL_COUNTER_U(dirty_writes_lost, "dirty writes lost"); static uint64_t dbuf_hash_count; @@ -1821,7 +1821,9 @@ old_range->end = MAX(range->end, old_range->end); old_range->size = old_range->end - old_range->start; list_remove(&dl->write_ranges, old_range); - atomic_subtract_64(&dirty_ranges_in_flight, 1); +#ifdef ZFS_DEBUG + refcount_release(&dirty_ranges_in_flight); +#endif kmem_free(range, sizeof(dbuf_dirty_range_t)); range = old_range; } @@ -1833,8 +1835,10 @@ } else { /* If old_range is NULL, this does a list_insert_tail(). 
*/ list_insert_before(&dl->write_ranges, old_range, range); - atomic_add_64(&dirty_ranges_in_flight, 1); +#ifdef ZFS_DEBUG + refcount_acquire(&dirty_ranges_in_flight); atomic_add_64(&dirty_ranges_total, 1); +#endif } dbuf_dirty_record_check_ranges(dr); @@ -2123,8 +2127,10 @@ dl = &dr->dt.dl; while ((range = list_remove_head(&dl->write_ranges)) != NULL) { kmem_free(range, sizeof(dbuf_dirty_range_t)); - atomic_subtract_64(&dirty_ranges_in_flight, 1); +#ifdef ZFS_DEBUG + refcount_release(&dirty_ranges_in_flight); ASSERT(dirty_ranges_in_flight >= 0); +#endif } } Change 528920 by willa@willa_repo on 2012/02/22 17:21:19 Make dbuf_evict_user() not pass the dbuf to its callers. This is intended as a stepping stone to punting dbuf_evict_user() to a queue, similar to how we handle zio threads processing DMU callbacks, as a means of avoiding LORs. Checking this into the branch, as I'm not sure whether this is going to work. The problem is that although nobody actually needs the dbuf, some users do want db->db.db_data copied into their callback structure, and we may not be able to guarantee that this pointer is still accessible if the callback is queue-processed. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#67 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#11 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c#6 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c#4 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c#4 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#31 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h#3 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c#4 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c#5 edit Differences ... 
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#67 (text) ==== @@ -257,7 +257,7 @@ if (db->db_user_data_ptr_ptr) *db->db_user_data_ptr_ptr = db->db.db_data; - db->db_evict_func(&db->db, db->db_user_ptr); + db->db_evict_func(db->db_user_ptr); db->db_user_ptr = NULL; db->db_user_data_ptr_ptr = NULL; db->db_evict_func = NULL; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#11 (text) ==== @@ -956,13 +956,11 @@ } static void -dnode_buf_pageout(dmu_buf_t *db, void *arg) +dnode_buf_pageout(void *arg) { dnode_children_t *children_dnodes = arg; int i; - int epb = db->db_size >> DNODE_SHIFT; - - ASSERT(epb == children_dnodes->dnc_count); + int epb = children_dnodes->dnc_count; for (i = 0; i < epb; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c#6 (text) ==== @@ -258,11 +258,9 @@ return (B_TRUE); } -/* ARGSUSED */ static void -dsl_dataset_evict(dmu_buf_t *db, void *dsv) +dsl_dataset_evict_impl(dsl_dataset_t *ds, boolean_t evict_deadlist) { - dsl_dataset_t *ds = dsv; ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); @@ -277,7 +275,7 @@ } bplist_destroy(&ds->ds_pending_deadlist); - if (db != NULL) { + if (evict_deadlist) { dsl_deadlist_close(&ds->ds_deadlist); } else { ASSERT(ds->ds_deadlist.dl_dbuf == NULL); @@ -301,6 +299,13 @@ kmem_free(ds, sizeof (dsl_dataset_t)); } +/* ARGSUSED */ +static void +dsl_dataset_evict(void *dsv) +{ + dsl_dataset_evict_impl(dsv, B_TRUE); +} + static int dsl_dataset_get_snapname(dsl_dataset_t *ds) { @@ -747,7 +752,7 @@ if (ds->ds_dbuf) dsl_dataset_drop_ref(ds, tag); else - dsl_dataset_evict(NULL, ds); + dsl_dataset_evict_impl(ds, B_FALSE); } boolean_t @@ -1492,7 +1497,7 @@ /* ARGSUSED */ static void -dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) +dsl_dataset_refs_gone(void *argv) { struct refsarg *arg = argv; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c#4 (text) ==== @@ -50,7 +50,7 @@ /* ARGSUSED */ static void -dsl_dir_evict(dmu_buf_t *db, void *arg) +dsl_dir_evict(void *arg) { dsl_dir_t *dd = arg; dsl_pool_t *dp = dd->dd_pool; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c#4 (text) ==== @@ -1280,9 +1280,9 @@ /*ARGSUSED*/ void -sa_evict(dmu_buf_t *db, void *sap) +sa_evict(void *sap) { - panic("evicting sa dbuf %p\n", (void *)db); + panic("evicting sa dbuf\n"); } static void ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#31 (text) ==== @@ -369,7 +369,7 @@ void dmu_thread_context_process(void); void dmu_thread_context_destroy(void *); -typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); +typedef void dmu_buf_evict_func_t(void *); /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. 
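
The typedef change above removes the dmu_buf_t argument, so an eviction
callback can no longer consult the dbuf; whatever it needs must be captured
in its own structure when the user is registered.  That is why
dnode_buf_pageout() now takes the child count from dnode_children_t instead
of deriving it from db->db_size.  A hedged sketch of the resulting pattern,
with a hypothetical consumer type (names invented for illustration):

typedef struct example_user {
	size_t	eu_count;	/* captured at registration time */
} example_user_t;

static void
example_evict(void *arg)
{
	example_user_t *eu = arg;

	/*
	 * Only the registered argument is available here; there is no
	 * dbuf, so eu_count replaces the old db->db_size computation.
	 */
	kmem_free(eu, sizeof (example_user_t));
}
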
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h#3 (text) ====

@@ -182,7 +182,7 @@
 int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
 void zap_unlockdir(zap_t *zap);
-void zap_evict(dmu_buf_t *db, void *vmzap);
+void zap_evict(void *vmzap);
 zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
 void zap_name_free(zap_name_t *zn);
 int zap_hashbits(zap_t *zap);

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c#4 (text) ====

@@ -52,7 +52,7 @@
 
 int fzap_default_block_shift = 14; /* 16k blocksize */
 
-static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
+static void zap_leaf_pageout(void *vl);
 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
 
@@ -438,9 +438,8 @@
 	dmu_buf_rele(l->l_dbuf, NULL);
 }
 
-_NOTE(ARGSUSED(0))
 static void
-zap_leaf_pageout(dmu_buf_t *db, void *vl)
+zap_leaf_pageout(void *vl)
 {
 	zap_leaf_t *l = vl;
 
@@ -468,7 +467,7 @@
 	rw_exit(&l->l_rwlock);
 	if (winner != NULL) {
 		/* someone else set it first */
-		zap_leaf_pageout(NULL, l);
+		zap_leaf_pageout(l);
 		l = winner;
 	}

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c#5 (text) ====

@@ -710,9 +710,8 @@
 	return (dmu_object_free(os, zapobj, tx));
 }
 
-_NOTE(ARGSUSED(0))
 void
-zap_evict(dmu_buf_t *db, void *vzap)
+zap_evict(void *vzap)
 {
 	zap_t *zap = vzap;

Change 529196 by willa@willa_repo on 2012/02/24 16:32:41

	Merge from //depot/SpectraBSD/stable/9/...

	Change list:
	526790	Fix the zvol-blkback interface so Windows can use zvols again.
	526947	Fix bugs in handling of the DIOCGSECTORSIZE ioctl.
	527050	Fix a lock order reversal (LOR) in ZFS DBUF.
	527051	Fix creating ZVOLs with names that are too long.
	528420	Allocate the correct amount of space for the taskqueue's callbacks.
	528872	Fix taskqueue thread shutdown for threads that set a shutdown callback.
	529193	Rototill the dbuf user evict mechanism to avoid lock order reversals.

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#68 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c#8 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#12 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#9 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c#7 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c#5 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c#5 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#12 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#29 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#32 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h#7 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h#6 integrate
... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h#5 integrate
...
//depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h#4 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h#4 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h#4 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h#3 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c#5 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c#6 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#10 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c#5 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#35 integrate ... //depot/branches/redline/projects/cow/sys/kern/subr_taskqueue.c#6 integrate Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#68 (text) ==== @@ -90,6 +90,7 @@ #ifdef ZFS_DEBUG SYSCTL_REFCOUNT(dirty_ranges_in_flight, "number of dirty ranges in flight"); SYSCTL_COUNTER_U(dirty_ranges_total, "number of total dirty ranges"); +SYSCTL_COUNTER_U(user_evicts, "number of user evicts performed"); #endif SYSCTL_COUNTER_U(dirty_writes_lost, "dirty writes lost"); @@ -248,19 +249,18 @@ static arc_evict_func_t dbuf_do_evict; static void -dbuf_evict_user(dmu_buf_impl_t *db) +dbuf_queue_user_evict(dmu_buf_impl_t *db, list_t *evict_list_p) { ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level != 0 || db->db_evict_func == NULL) + if (db->db_level != 0 || db->db_user == NULL) return; - if (db->db_user_data_ptr_ptr) - *db->db_user_data_ptr_ptr = db->db.db_data; - db->db_evict_func(db->db_user_ptr); - db->db_user_ptr = NULL; - db->db_user_data_ptr_ptr = NULL; - db->db_evict_func = NULL; +#ifdef ZFS_DEBUG + atomic_add_64(&user_evicts, 1); +#endif + list_insert_head(evict_list_p, db->db_user); + db->db_user = NULL; } boolean_t @@ -280,13 +280,13 @@ } void -dbuf_evict(dmu_buf_impl_t *db) +dbuf_evict(dmu_buf_impl_t *db, list_t *evict_list_p) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_buf == NULL); ASSERT(db->db_data_pending == NULL); - dbuf_clear(db); + dbuf_clear(db, evict_list_p); dbuf_destroy(db); } @@ -473,13 +473,29 @@ dbuf_update_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level == 0 && db->db_user_data_ptr_ptr) { + if (db->db_level == 0 && + db->db_user != NULL && db->db_user->user_data_ptr_ptr != NULL) { ASSERT(!refcount_is_zero(&db->db_holds)); - *db->db_user_data_ptr_ptr = db->db.db_data; + *db->db_user->user_data_ptr_ptr = db->db.db_data; } } /** + * Clear the dbuf's ARC buffer. + */ +static void +dbuf_clear_data(dmu_buf_impl_t *db, list_t *evict_list_p) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); + db->db_buf = NULL; + dbuf_queue_user_evict(db, evict_list_p); + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "clear data"); +} + +/** * Set the dbuf's buffer to the ARC buffer, including any associated state, * such as db_data. 
*/ @@ -489,18 +505,11 @@ ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); db->db_buf = buf; - if (buf != NULL) { - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); - dbuf_update_data(db); - } else { - dbuf_evict_user(db); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "set data"); - } + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); + dbuf_update_data(db); } /** @@ -512,6 +521,9 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; + list_t evict_list; + + dmu_buf_create_user_evict_list(&evict_list); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { @@ -525,9 +537,10 @@ } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); - dbuf_set_data(db, NULL); + dbuf_clear_data(db, &evict_list); mutex_exit(&db->db_mtx); } + dmu_buf_destroy_user_evict_list(&evict_list); return (abuf); } @@ -920,8 +933,6 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); - /* We need the struct_rwlock to prevent db_blkptr from changing. */ - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || (db->db_state & DB_PARTIAL)); /* @@ -944,7 +955,6 @@ /* Check to see if a caller only wants cached buffers. */ if (*flags & DB_RF_CACHED_ONLY) { - /* XXX this code path doesn't drop the lock */ arc_buf_t *buf = arc_buf_find_bp(spa, db->db_blkptr, db); if (buf != NULL) { db->db_state = DB_READ; /* for read_complete */ @@ -952,7 +962,7 @@ *flags |= DB_RF_CACHED; } DB_DNODE_EXIT(db); - /*mutex_exit(&db->db_mtx);*/ + /* Cache lookups never drop the dbuf mutex. */ return; } @@ -961,6 +971,13 @@ DBUF_STATE_CHANGE(db, =, DB_READ, "read issued"); mutex_exit(&db->db_mtx); + /* + * db_blkptr is protected by both the dbuf mutex and the associated + * struct_rwlock. The caller must acquire struct_rwlock before + * reads that may sleep without the dbuf mutex held. + */ + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_L2CACHE; @@ -1004,13 +1021,10 @@ ASSERT(DB_DNODE_HELD(db)); - /* Prevent the block pointer from being changed from under us. */ - if (!held) - rw_enter(&dn->dn_struct_rwlock, RW_READER); + /* Make sure read_impl doesn't change its contract with us. */ + ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_read_impl(db, NULL, &rflags); ASSERT(MUTEX_HELD(&db->db_mtx)); - if (!held) - rw_exit(&dn->dn_struct_rwlock); return (db->db_state == DB_CACHED); } @@ -1119,7 +1133,7 @@ * dbuf list for the dnode. 
*/ static void -dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) +dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg, list_t *evict_list_p) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); @@ -1172,7 +1186,7 @@ dr->dt.dl.dr_data = buf; } } else { - dbuf_set_data(db, NULL); + dbuf_clear_data(db, evict_list_p); } } @@ -1237,6 +1251,9 @@ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t first_l1 = start >> epbs; uint64_t last_l1 = end >> epbs; + list_t evict_list; + + dmu_buf_create_user_evict_list(&evict_list); if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { end = dn->dn_maxblkid; @@ -1302,7 +1319,7 @@ /* All consumers are finished, so evict the buffer */ if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); - dbuf_clear(db); + dbuf_clear(db, &evict_list); continue; } @@ -1344,7 +1361,7 @@ */ dbuf_transition_to_read(db); } - dbuf_fix_old_data(db, txg); + dbuf_fix_old_data(db, txg, &evict_list); } } /* @@ -1378,8 +1395,11 @@ } mutex_exit(&db->db_mtx); + /* Process one dbuf at a time to reduce memory pressure. */ + dmu_buf_process_user_evicts(&evict_list); } mutex_exit(&dn->dn_dbufs_mtx); + dmu_buf_destroy_user_evict_list(&evict_list); } static int @@ -1535,7 +1555,7 @@ * transaction group. */ static dbuf_dirty_record_t * -dbuf_dirty_record_create(dmu_buf_impl_t *db, dmu_tx_t *tx) +dbuf_dirty_record_create(dmu_buf_impl_t *db, dmu_tx_t *tx, list_t *evict_list_p) { dbuf_dirty_record_t *dr; dnode_t *dn; @@ -1555,7 +1575,7 @@ if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_fix_old_data(db, tx->tx_txg); + dbuf_fix_old_data(db, tx->tx_txg, evict_list_p); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* @@ -1569,7 +1589,8 @@ */ if (db->db_buf != NULL) { arc_release(db->db_buf, db); - dbuf_fix_old_data(db, tx->tx_txg); + dbuf_fix_old_data(db, tx->tx_txg, + evict_list_p); data_old = db->db_buf; } else { /* @@ -1717,20 +1738,23 @@ /* * Update the dirty record to add this dbuf to its parent's - * dirty record's list of dirty children. + * dirty record's list of dirty children. The indirect + * mutex could be conditionally acquired, but doing so is + * unlikely to save any effort in most cases. Acquiring it + * unconditionally keeps this path clean of apparent LORs. 
*/ + mutex_enter(&di->dt.di.dr_mtx); mutex_enter(&db->db_mtx); /* possible race with dbuf_undirty() */ if (list_head(&db->db_dirty_records) == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { - mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); - mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); + mutex_exit(&di->dt.di.dr_mtx); } else { /* The dbuf's parent is the dnode */ ASSERT(db->db_level+1 == dn->dn_nlevels); @@ -1857,9 +1881,12 @@ boolean_t do_free_accounting = B_FALSE; boolean_t already_dirty = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; + list_t evict_list; ASSERT(how == DB_FILL || how == DB_NOFILL || how == DB_UNCACHED); + dmu_buf_create_user_evict_list(&evict_list); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -1905,7 +1932,7 @@ if (how == DB_NOFILL) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_state & (DB_UNCACHED|DB_NOFILL|DB_CACHED)); - dbuf_set_data(db, NULL); + dbuf_clear_data(db, &evict_list); DBUF_STATE_CHANGE(db, =, DB_NOFILL, "allocating NOFILL buffer"); } else if (how == DB_FILL) { if (db->db_state == DB_UNCACHED) { @@ -2068,7 +2095,7 @@ * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ - dr = dbuf_dirty_record_create(db, tx); + dr = dbuf_dirty_record_create(db, tx, &evict_list); } /* Add the dirty range and do some related bookkeeping. */ @@ -2082,6 +2109,8 @@ mutex_exit(&db->db_mtx); + dmu_buf_destroy_user_evict_list(&evict_list); + if (!already_dirty) { if (do_free_accounting && db->db_blkid != DMU_SPILL_BLKID) { blkptr_t *bp = db->db_blkptr; @@ -2150,10 +2179,13 @@ dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr; + list_t evict_list; ASSERT(txg != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); + dmu_buf_create_user_evict_list(&evict_list); + mutex_enter(&db->db_mtx); /* * If this buffer is not dirty in this transaction @@ -2165,6 +2197,7 @@ break; } if (dr == NULL || dr->dr_txg < txg) { + dmu_buf_destroy_user_evict_list(&evict_list); mutex_exit(&db->db_mtx); return (0); } @@ -2259,13 +2292,17 @@ arc_buf_t *buf = db->db_buf; ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_set_data(db, NULL); + dbuf_clear_data(db, &evict_list); VERIFY(arc_buf_remove_ref(buf, db) == 1); - dbuf_evict(db); + dbuf_evict(db, &evict_list); + dmu_buf_destroy_user_evict_list(&evict_list); return (1); } mutex_exit(&db->db_mtx); + + dmu_buf_destroy_user_evict_list(&evict_list); + return (0); } @@ -2501,7 +2538,7 @@ * ARC: dbuf_do_evict()->dbuf_destroy() */ void -dbuf_clear(dmu_buf_impl_t *db) +dbuf_clear(dmu_buf_impl_t *db, list_t *evict_list_p) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; @@ -2512,7 +2549,7 @@ ASSERT(refcount_is_zero(&db->db_holds)); ASSERT(list_is_empty(&db->db_dirty_records)); - dbuf_evict_user(db); + dbuf_queue_user_evict(db, evict_list_p); if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); @@ -2661,9 +2698,7 @@ db->db_parent = parent; db->db_blkptr = blkptr; - db->db_user_ptr = NULL; - db->db_user_data_ptr_ptr = NULL; - db->db_evict_func = NULL; + db->db_user = NULL; db->db_immediate_evict = 0; db->db_freed_in_flight = 0; @@ -2726,6 +2761,9 @@ { arc_buf_t *buf = private; dmu_buf_impl_t *db = buf->b_private; + list_t evict_list; + + dmu_buf_create_user_evict_list(&evict_list); if (!MUTEX_HELD(&db->db_mtx)) mutex_enter(&db->db_mtx); @@ -2737,7 +2775,8 @@ ASSERT(db->db_state == DB_CACHED); 
DBUF_VERIFY(db); db->db_buf = NULL; - dbuf_evict(db); + dbuf_evict(db, &evict_list); + dmu_buf_destroy_user_evict_list(&evict_list); } else { mutex_exit(&db->db_mtx); dbuf_destroy(db); @@ -2852,11 +2891,14 @@ void *tag, dmu_buf_impl_t **dbp, dmu_buf_set_t *buf_set) { dmu_buf_impl_t *db, *parent = NULL; + list_t evict_list; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); + dmu_buf_create_user_evict_list(&evict_list); + *dbp = NULL; top: /* dbuf_find() returns with db_mtx held */ @@ -2885,7 +2927,7 @@ if (db->db_buf && refcount_is_zero(&db->db_holds)) { arc_buf_add_ref(db->db_buf, db); if (db->db_buf->b_data == NULL) { - dbuf_clear(db); + dbuf_clear(db, &evict_list); if (parent) { dbuf_rele(parent, NULL); parent = NULL; @@ -2931,6 +2973,8 @@ } mutex_exit(&db->db_mtx); + dmu_buf_destroy_user_evict_list(&evict_list); + /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (parent) dbuf_rele(parent, NULL); @@ -3031,10 +3075,13 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; + list_t evict_list; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); + dmu_buf_create_user_evict_list(&evict_list); + /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus @@ -3054,7 +3101,7 @@ if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_immediate_evict) - dbuf_evict_user(db); + dbuf_queue_user_evict(db, &evict_list); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { @@ -3089,25 +3136,26 @@ #endif ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - dbuf_evict(db); + dbuf_evict(db, &evict_list); } else if (arc_released(db->db_buf)) { arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ - dbuf_set_data(db, NULL); + dbuf_clear_data(db, &evict_list); VERIFY(arc_buf_remove_ref(buf, db) == 1); - dbuf_evict(db); + dbuf_evict(db, &evict_list); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); if (!DBUF_IS_CACHEABLE(db)) - dbuf_clear(db); + dbuf_clear(db, &evict_list); else mutex_exit(&db->db_mtx); } } else { mutex_exit(&db->db_mtx); } + dmu_buf_destroy_user_evict_list(&evict_list); } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -3118,87 +3166,63 @@ } /** - * \param user_ptr [in] For use by the user and can be obtained - * via dmu_buf_get_user() + * \brief Update the user eviction data for the DMU buffer. * - * \param user_data_ptr_ptr [in, out] Should be NULL, or a pointer to a - * pointer which will be set to db->db_data when - * the caller is allowed to access it. Note that - * db->db_data can change when dmu_buf_read, - * dmu_buf_tryupgrade, dmu_buf_will_dirty, or - * dmu_buf_will_fill are called. - * *user_data_ptr_ptr will be set to the new - * value when it changes. - * - * \param evict_func [in] If not NULL, evict_func will be called - * when this buffer is being excised from the - * cache, so that the data structure pointed to - * by user_data_ptr_ptr can be cleaned up. + * \param db_fake The DMU buffer to set the data for. + * \param old_user The old user's eviction data pointer. + * \param new_user The new user's eviction data pointer. * * \returns NULL on success, or the existing user ptr if it's already - * been set. + * been set. * * dmu_evict_user() will call the evict_func for all buffers in a * objset with a given pageout func. 
*/ -void * -dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_user_t * +dmu_buf_update_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user) { - return (dmu_buf_update_user(db_fake, NULL, user_ptr, - user_data_ptr_ptr, evict_func)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(db->db_level == 0); + + mutex_enter(&db->db_mtx); + + if (db->db_user == old_user) { + db->db_user = new_user; + dbuf_update_data(db); + } else + old_user = db->db_user; + + mutex_exit(&db->db_mtx); + return (old_user); } -/** - * The same as set_user, but request immediate eviction when hold count goes - * to zero. - */ -void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_user_t * +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - db->db_immediate_evict = TRUE; - return (dmu_buf_update_user(db_fake, NULL, user_ptr, - user_data_ptr_ptr, evict_func)); + return (dmu_buf_update_user(db_fake, NULL, user)); } -void * -dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, - void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) +dmu_buf_user_t * +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_level == 0); - ASSERT((user_ptr == NULL) == (evict_func == NULL)); - - mutex_enter(&db->db_mtx); - - if (db->db_user_ptr == old_user_ptr) { - db->db_user_ptr = user_ptr; - db->db_user_data_ptr_ptr = user_data_ptr_ptr; - db->db_evict_func = evict_func; - - dbuf_update_data(db); - } else { - old_user_ptr = db->db_user_ptr; - } - - mutex_exit(&db->db_mtx); - return (old_user_ptr); + db->db_immediate_evict = TRUE; + return (dmu_buf_update_user(db_fake, NULL, user)); } /** - * \return the user_ptr set with dmu_buf_set_user(), or NULL if not set. + * \return the db_user set with dmu_buf_update_user(), or NULL if not set. 
*/ -void * +dmu_buf_user_t * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(!refcount_is_zero(&db->db_holds)); - return (db->db_user_ptr); + return (db->db_user); } /** ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c#8 (text) ==== @@ -1009,7 +1009,10 @@ dmu_objset_name(os, name); strlcat(name, "@", sizeof(name)); strlcat(name, snapname, sizeof(name)); - zvol_create_minors(name); + err = zvol_create_minors(name); + if (err) + printf("ZFS WARNING: Unable to create minors" + " for snapshot %s\n", name); } #endif #endif ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#12 (text) ==== @@ -455,8 +455,11 @@ dn->dn_dirtyctx_firstset = NULL; } if (dn->dn_bonus != NULL) { + list_t evict_list; + dmu_buf_create_user_evict_list(&evict_list); mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_evict(dn->dn_bonus, &evict_list); + dmu_buf_destroy_user_evict_list(&evict_list); dn->dn_bonus = NULL; } dn->dn_zio = NULL; @@ -956,13 +959,12 @@ } static void -dnode_buf_pageout(void *arg) +dnode_buf_pageout(dmu_buf_user_t *dbu) { - dnode_children_t *children_dnodes = arg; + dnode_children_t *children_dnodes = (dnode_children_t *)dbu; int i; - int epb = children_dnodes->dnc_count; - for (i = 0; i < epb; i++) { + for (i = 0; i < children_dnodes->dnc_count; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; @@ -992,7 +994,7 @@ dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + - (epb - 1) * sizeof (dnode_handle_t)); + (children_dnodes->dnc_count - 1) * sizeof (dnode_handle_t)); } /** @@ -1074,7 +1076,7 @@ idx = object & (epb-1); ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); - children_dnodes = dmu_buf_get_user(&db->db); + children_dnodes = (dnode_children_t *)dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { int i; dnode_children_t *winner; @@ -1086,8 +1088,11 @@ zrl_init(&dnh[i].dnh_zrlock); dnh[i].dnh_dnode = NULL; } - if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, - dnode_buf_pageout)) { + dmu_buf_init_user(&children_dnodes->db_evict, + dnode_buf_pageout, NULL); + winner = (dnode_children_t *) + dmu_buf_set_user(&db->db, &children_dnodes->db_evict); + if (winner) { kmem_free(children_dnodes, sizeof (dnode_children_t) + (epb - 1) * sizeof (dnode_handle_t)); children_dnodes = winner; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c#9 (text) ==== @@ -383,6 +383,9 @@ { int progress; int pass = 0; + list_t evict_list; + + dmu_buf_create_user_evict_list(&evict_list); do { dmu_buf_impl_t *db, marker; @@ -408,10 +411,13 @@ mutex_exit(&db->db_mtx); } else if (refcount_is_zero(&db->db_holds)) { progress = TRUE; - dbuf_clear(db); /* exits db_mtx for us */ + dbuf_clear(db, &evict_list); } else { mutex_exit(&db->db_mtx); } + /* Make sure dbuf_clear exits db_mtx for us. 
*/ + ASSERT(MUTEX_NOT_HELD(&db->db_mtx)); + dmu_buf_process_user_evicts(&evict_list); } list_remove(&dn->dn_dbufs, &marker); @@ -432,10 +438,11 @@ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_evict(dn->dn_bonus, &evict_list); dn->dn_bonus = NULL; } rw_exit(&dn->dn_struct_rwlock); + dmu_buf_destroy_user_evict_list(&evict_list); } static void ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c#7 (text) ==== @@ -301,9 +301,9 @@ /* ARGSUSED */ static void -dsl_dataset_evict(void *dsv) +dsl_dataset_evict(dmu_buf_user_t *dbu) { - dsl_dataset_evict_impl(dsv, B_TRUE); + dsl_dataset_evict_impl((dsl_dataset_t *)dbu, B_TRUE); } static int @@ -394,7 +394,7 @@ if (doi.doi_type != DMU_OT_DSL_DATASET) return (EINVAL); - ds = dmu_buf_get_user(dbuf); + ds = (dsl_dataset_t *)dmu_buf_get_user(dbuf); if (ds == NULL) { dsl_dataset_t *winner; @@ -476,10 +476,12 @@ ds->ds_reserved = ds->ds_quota = 0; } - if (err == 0) { - winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, - dsl_dataset_evict); - } + dmu_buf_init_user(&ds->db_evict, dsl_dataset_evict, + (void **)&ds->ds_phys); + if (err == 0) + winner = (dsl_dataset_t *) + dmu_buf_set_user_ie(dbuf, &ds->db_evict); + if (err || winner) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); @@ -1490,6 +1492,7 @@ } struct refsarg { + dmu_buf_user_t db_evict; kmutex_t lock; boolean_t gone; kcondvar_t cv; @@ -1497,9 +1500,9 @@ /* ARGSUSED */ static void -dsl_dataset_refs_gone(void *argv) +dsl_dataset_refs_gone(dmu_buf_user_t *dbu) { - struct refsarg *arg = argv; + struct refsarg *arg = (struct refsarg *)dbu; mutex_enter(&arg->lock); arg->gone = TRUE; @@ -1511,13 +1514,18 @@ dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) { struct refsarg arg; + dmu_buf_user_t *old_user; bzero(&arg, sizeof(arg)); mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); arg.gone = FALSE; - (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, - dsl_dataset_refs_gone); + dmu_buf_init_user(&arg.db_evict, dsl_dataset_refs_gone, + (void **)&ds->ds_phys); + old_user = dmu_buf_update_user(ds->ds_dbuf, &ds->db_evict, + &arg.db_evict); + ASSERT(old_user == &ds->db_evict); + dmu_buf_rele(ds->ds_dbuf, tag); mutex_enter(&arg.lock); while (!arg.gone) ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c#5 (text) ==== @@ -50,9 +50,9 @@ /* ARGSUSED */ static void -dsl_dir_evict(void *arg) +dsl_dir_evict(dmu_buf_user_t *dbu) { - dsl_dir_t *dd = arg; + dsl_dir_t *dd = (dsl_dir_t *)dbu; dsl_pool_t *dp = dd->dd_pool; int t; @@ -90,7 +90,7 @@ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); if (err) return (err); - dd = dmu_buf_get_user(dbuf); + dd = (dsl_dir_t *)dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG { dmu_object_info_t doi; @@ -159,8 +159,9 @@ dmu_buf_rele(origin_bonus, FTAG); } - winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, - dsl_dir_evict); + dmu_buf_init_user(&dd->db_evict, dsl_dir_evict, + (void **)&dd->dd_phys); + winner = (dsl_dir_t *)dmu_buf_set_user_ie(dbuf, &dd->db_evict); if (winner) { if (dd->dd_parent) dsl_dir_close(dd->dd_parent, dd); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c#5 (text) ==== @@ -1280,7 +1280,7 @@ /*ARGSUSED*/ void -sa_evict(void *sap) +sa_evict(dmu_buf_user_t *dbu) { panic("evicting sa dbuf\n"); 
} @@ -1321,9 +1321,10 @@ void sa_handle_destroy(sa_handle_t *hdl) { + dmu_buf_t *db = hdl->sa_bonus; + mutex_enter(&hdl->sa_lock); - (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl, - NULL, NULL, NULL); + (void) dmu_buf_update_user(db, &hdl->db_evict, NULL); if (hdl->sa_bonus_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); @@ -1349,7 +1350,7 @@ { int error = 0; dmu_object_info_t doi; - sa_handle_t *handle; + sa_handle_t *handle = NULL, *winner = NULL; #ifdef ZFS_DEBUG dmu_object_info_from_db(db, &doi); @@ -1359,23 +1360,27 @@ /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ - handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL; + if (hdl_type == SA_HDL_SHARED) + handle = (sa_handle_t *)dmu_buf_get_user(db); + if (handle == NULL) { - sa_handle_t *newhandle; handle = kmem_cache_alloc(sa_cache, KM_SLEEP); + bzero(&handle->db_evict, sizeof(dmu_buf_user_t)); handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; handle->sa_spill = NULL; error = sa_build_index(handle, SA_BONUS); - newhandle = (hdl_type == SA_HDL_SHARED) ? - dmu_buf_set_user_ie(db, handle, - NULL, sa_evict) : NULL; + if (hdl_type == SA_HDL_SHARED) { + dmu_buf_init_user(&handle->db_evict, sa_evict, NULL); + winner = (sa_handle_t *) + dmu_buf_set_user_ie(db, &handle->db_evict); + } - if (newhandle != NULL) { + if (winner != NULL) { kmem_cache_free(sa_cache, handle); - handle = newhandle; + handle = winner; } } *handlepp = handle; @@ -1888,8 +1893,10 @@ void sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl) { - (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus, - oldhdl, newhdl, NULL, sa_evict); + dmu_buf_t *db = newhdl->sa_bonus; + + dmu_buf_init_user(&newhdl->db_evict, sa_evict, NULL); + (void) dmu_buf_update_user(db, &oldhdl->db_evict, &newhdl->db_evict); oldhdl->sa_bonus = NULL; } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#12 (text) ==== @@ -2479,7 +2479,7 @@ { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; - int error; + int error = 0; int locked = B_FALSE; int firstopen = B_FALSE; @@ -2581,15 +2581,23 @@ mutex_exit(&spa_namespace_lock); #ifdef __FreeBSD__ #ifdef _KERNEL - if (firstopen) - zvol_create_minors(pool); + if (firstopen) { + /* + * Don't pass up errors from here. The SPA was + * still created and we can't reasonably unwind it + * at this point. + */ + if (zvol_create_minors(pool)) + printf("ZFS WARNING: ZVOL device nodes for " + "pool %s could not be created\n", pool); + } #endif #endif } *spapp = spa; - return (0); + return (error); } int @@ -3591,13 +3599,23 @@ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); mutex_exit(&spa_namespace_lock); - spa_history_log_version(spa, LOG_POOL_IMPORT); #ifdef __FreeBSD__ #ifdef _KERNEL - zvol_create_minors(pool); + if (zvol_create_minors(pool)) { + /* + * Don't pass up errors from here. The SPA was + * still created and we can't reasonably unwind it + * at this point. + */ + printf("ZFS WARNING: Unable to create ZVOL block devices " + "for pool %s\n", pool); + } #endif #endif + + spa_history_log_version(spa, LOG_POOL_IMPORT); + return (0); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#29 (text) ==== @@ -410,10 +410,8 @@ /** Data which is unique to data (leaf) blocks: */ - /** stuff we store for the user (see dmu_buf_set_user) */ - void *db_user_ptr; - void **db_user_data_ptr_ptr; - dmu_buf_evict_func_t *db_evict_func; + /** User callback information. 
See dmu_buf_set_user(). */ + dmu_buf_user_t *db_user; uint8_t db_immediate_evict; uint8_t db_freed_in_flight; @@ -493,8 +491,8 @@ int size, int how, arc_buf_t *db_buf); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); -void dbuf_clear(dmu_buf_impl_t *db); -void dbuf_evict(dmu_buf_impl_t *db); +void dbuf_clear(dmu_buf_impl_t *db, list_t *evict_list); +void dbuf_evict(dmu_buf_impl_t *db, list_t *evict_list); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#32 (text) ==== @@ -369,7 +369,61 @@ void dmu_thread_context_process(void); void dmu_thread_context_destroy(void *); -typedef void dmu_buf_evict_func_t(void *); +struct dmu_buf_user; + +typedef void dmu_buf_evict_func_t(struct dmu_buf_user *); + +/* + * Consumers are expected to allocate and free space for this structure. + * Consequently, if any additional context is needed, another struct that + * includes this one at the start should be passed in. + */ +typedef struct dmu_buf_user { + /** + * This instance's link in the eviction queue. Set when the buffer + * has evicted and the callback needs to be called. + */ + list_node_t evict_queue_link; + /** This instance's eviction function pointer. */ + dmu_buf_evict_func_t *evict_func; + /** Location that db_data, when updated, should be copied to. */ + void **user_data_ptr_ptr; +} dmu_buf_user_t; + +/** Initialization routine for dmu_buf_user_t instances. */ +static inline void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, + void **user_data_ptr_ptr) +{ + list_link_init(&dbu->evict_queue_link); + dbu->evict_func = evict_func; + dbu->user_data_ptr_ptr = user_data_ptr_ptr; +} + +/** DMU buffer user eviction routines. */ +static inline void +dmu_buf_create_user_evict_list(list_t *evict_list_p) +{ + list_create(evict_list_p, sizeof(dmu_buf_user_t), + offsetof(dmu_buf_user_t, evict_queue_link)); +} +static inline void +dmu_buf_process_user_evicts(list_t *evict_list_p) +{ + dmu_buf_user_t *dbu, *next; + + for (dbu = list_head(evict_list_p); dbu != NULL; dbu = next) { + next = list_next(evict_list_p, dbu); + list_remove(evict_list_p, dbu); + dbu->evict_func(dbu); + } +} +static inline void +dmu_buf_destroy_user_evict_list(list_t *evict_list_p) +{ + dmu_buf_process_user_evicts(evict_list_p); + list_destroy(evict_list_p); +} /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. 
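
Taken together, these helpers give a consumer a simple bracket: create the
list, take db_mtx and do work that may queue user evictions, then run the
callbacks once the lock has been dropped.  A usage sketch under those
assumptions (illustrative only; as noted in dnode_sync.c, dbuf_clear() exits
db_mtx for the caller):

static void
example_evict_usage(dmu_buf_impl_t *db)
{
	list_t evict_list;

	dmu_buf_create_user_evict_list(&evict_list);

	/* Assumes the caller has already dropped its last hold on db. */
	mutex_enter(&db->db_mtx);
	dbuf_clear(db, &evict_list);	/* queues db_user, exits db_mtx */

	/* Callbacks run only after db_mtx is released, avoiding the LOR. */
	dmu_buf_destroy_user_evict_list(&evict_list);
}
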
@@ -445,16 +499,13 @@ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); -void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *pageout_func); -void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, - void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); -void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, - void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *pageout_func); +dmu_buf_user_t *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); +dmu_buf_user_t *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); +dmu_buf_user_t *dmu_buf_update_user(dmu_buf_t *db, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user); void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); -void *dmu_buf_get_user(dmu_buf_t *db); +dmu_buf_user_t *dmu_buf_get_user(dmu_buf_t *db); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_dirty_range(dmu_buf_t *db, dmu_tx_t *tx, int offset, ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h#7 (text) ==== @@ -283,6 +283,8 @@ } dnode_handle_t; typedef struct dnode_children { + /** Dbuf user eviction data for this instance. */ + dmu_buf_user_t db_evict; size_t dnc_count; /**< number of children */ dnode_handle_t dnc_children[1]; /**< sized dynamically */ } dnode_children_t; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h#6 (text) ==== @@ -107,6 +107,9 @@ } dsl_dataset_phys_t; typedef struct dsl_dataset { + /** Dbuf user eviction data for this instance. */ + dmu_buf_user_t db_evict; + /** * \name Immutable * \{ */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h#5 (text) ==== @@ -76,6 +76,9 @@ } dsl_dir_phys_t; struct dsl_dir { + /** Dbuf user eviction data for this instance. */ + dmu_buf_user_t db_evict; + /** * \name These are immutable; no lock needed * \{ */ ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h#4 (text) ==== @@ -206,6 +206,8 @@ * This needs to be kept as small as possible. */ struct sa_handle { + /** Dbuf user eviction data for this instance. */ + dmu_buf_user_t db_evict; kmutex_t sa_lock; dmu_buf_t *sa_bonus; dmu_buf_t *sa_spill; ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h#4 (text) ==== @@ -138,6 +138,8 @@ typedef struct zap_table_phys zap_table_phys_t; typedef struct zap { + /** Dbuf user eviction data for this instance. */ + dmu_buf_user_t db_evict; objset_t *zap_objset; uint64_t zap_object; struct dmu_buf *zap_dbuf; @@ -182,7 +184,7 @@ int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); -void zap_evict(void *vmzap); +void zap_evict(dmu_buf_user_t *dbu); zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h#4 (text) ==== @@ -156,6 +156,8 @@ } zap_leaf_chunk_t; typedef struct zap_leaf { + /** Dbuf user eviction data for this instance. 
*/ + dmu_buf_user_t db_evict; krwlock_t l_rwlock; uint64_t l_blkid; /**< 1<zap_rwlock)); zap->zap_ismicro = FALSE; - (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, - &zap->zap_f.zap_phys, zap_evict); + dmu_buf_init_user(&zap->db_evict, zap_evict, + (void **)&zap->zap_f.zap_phys); + (void) dmu_buf_update_user(zap->zap_dbuf, &zap->db_evict, + &zap->db_evict); mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1; @@ -389,10 +390,19 @@ return (newblk); } +static void +zap_leaf_pageout(dmu_buf_user_t *dbu) +{ + zap_leaf_t *l = (zap_leaf_t *)dbu; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { - void *winner; + zap_leaf_t *winner; zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -406,7 +416,8 @@ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); - winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); + dmu_buf_init_user(&l->db_evict, zap_leaf_pageout, (void **)&l->l_phys); + winner = (zap_leaf_t *)dmu_buf_set_user(l->l_dbuf, &l->db_evict); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); @@ -438,15 +449,6 @@ dmu_buf_rele(l->l_dbuf, NULL); } -static void -zap_leaf_pageout(void *vl) -{ - zap_leaf_t *l = vl; - - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { @@ -462,12 +464,13 @@ l->l_dbuf = db; l->l_phys = NULL; - winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout); + dmu_buf_init_user(&l->db_evict, zap_leaf_pageout, (void **)&l->l_phys); + winner = (zap_leaf_t *)dmu_buf_set_user(db, &l->db_evict); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ - zap_leaf_pageout(l); + zap_leaf_pageout(&l->db_evict); l = winner; } @@ -516,7 +519,7 @@ ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); - l = dmu_buf_get_user(db); + l = (zap_leaf_t *)dmu_buf_get_user(db); if (l == NULL) l = zap_open_leaf(blkid, db); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c#6 (text) ==== @@ -390,7 +390,9 @@ * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ - winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); + dmu_buf_init_user(&zap->db_evict, zap_evict, + (void **)&zap->zap_m.zap_phys); + winner = (zap_t *)dmu_buf_set_user(db, &zap->db_evict); if (winner != NULL) { rw_exit(&zap->zap_rwlock); @@ -476,7 +478,7 @@ } #endif - zap = dmu_buf_get_user(db); + zap = (zap_t *)dmu_buf_get_user(db); if (zap == NULL) zap = mzap_open(os, obj, db); @@ -711,9 +713,9 @@ } void -zap_evict(void *vzap) +zap_evict(dmu_buf_user_t *dbu) { - zap_t *zap = vzap; + zap_t *zap = (zap_t *)dbu; rw_destroy(&zap->zap_rwlock); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#10 (text) ==== @@ -3068,17 +3068,16 @@ /* * It would be nice to do this atomically. 
*/ - if (error == 0) { + if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, nvprops, NULL); - if (error != 0) - (void) dmu_objset_destroy(zc->zc_name, B_FALSE); - } nvlist_free(nvprops); #ifdef __FreeBSD__ if (error == 0 && type == DMU_OST_ZVOL) - zvol_create_minors(zc->zc_name); + error = zvol_create_minors(zc->zc_name); #endif + if (error) + (void) dmu_objset_destroy(zc->zc_name, B_FALSE); return (error); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c#5 (text) ==== @@ -1166,7 +1166,7 @@ return (EINVAL); } - hdl = dmu_buf_get_user(db); + hdl = (sa_handle_t *)dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#35 (text) ==== @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -93,6 +94,9 @@ static d_open_t zvol_open; static d_close_t zvol_close; static d_strategy_t zvol_strategy; +static d_read_t zvol_freebsd_read; +static d_write_t zvol_freebsd_write; +static d_ioctl_t zvol_freebsd_ioctl; struct cdevsw zfs_zvol_cdevsw = { .d_version = D_VERSION, @@ -103,6 +107,7 @@ .d_strategy = zvol_strategy, .d_read = zvol_freebsd_read, .d_write = zvol_freebsd_write, + .d_ioctl = zvol_freebsd_ioctl, }; DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); @@ -169,7 +174,6 @@ static int zvol_dump_fini(zvol_state_t *zv); static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); -static zvol_state_t *zvol_geom_create(const char *name); static void zvol_geom_run(zvol_state_t *zv); static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); @@ -489,8 +493,96 @@ mutex_exit(&spa_namespace_lock); return (zv ? 
0 : -1);
 }
 
+static int
+zvol_create_minor_sun(zvol_state_t **zvp, const char *name)
+{
+	zfs_soft_state_t *zs;
+	zvol_state_t *zv;
+	char chrbuf[30], blkbuf[30];
+	int error = 0;
+	boolean_t minor_created = B_FALSE;
+	minor_t minor;
+
+	if ((minor = zfsdev_minor_alloc()) == 0)
+		return (ENXIO);
+
+	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
+		return (EAGAIN);
+	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
+	    (char *)name);
+
+	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
+
+	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
+	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+		error = EAGAIN;
+		goto out;
+	}
+
+	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
+
+	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
+	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+		error = EAGAIN;
+		goto out;
+	}
+	minor_created = B_TRUE;
+
+	zs = ddi_get_soft_state(zfsdev_state, minor);
+	zs->zss_type = ZSST_ZVOL;
+	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+	if (zv == NULL)
+		error = ENOMEM;
+	else
+		*zvp = zv;
+
+out:
+	if (error) {
+		if (minor_created)
+			ddi_remove_minor_node(zfs_dip, chrbuf);
+		ddi_soft_state_free(zfsdev_state, minor);
+	}
+	return (error);
+}
 #endif	/* sun */
 
+#ifdef __FreeBSD__
+static int
+zvol_create_minor_freebsd(zvol_state_t **zvp, const char *name)
+{
+	struct g_provider *pp;
+	struct g_geom *gp;
+	zvol_state_t *zv;
+	struct cdev *zv_dev;
+	int error = 0;
+
+	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &zv_dev,
+	    &zfs_zvol_cdevsw, NULL, UID_ROOT, GID_OPERATOR, 0600, "%s/%s",
+	    ZVOL_DRIVER, name);
+	if (error) {
+		printf("ZFS: ZVOL '%s': Could not create device node\n", name);
+		return (error);
+	}
+
+	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
+	gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+	gp->start = zvol_geom_start;
+	gp->access = zvol_geom_access;
+	pp = g_new_providerf(gp, "g%s/%s", ZVOL_DRIVER, name);
+	pp->sectorsize = DEV_BSIZE;
+	zv->zv_provider = pp;
+	zv->zv_state = 0;
+	zv->zv_dev = zv_dev;
+	bioq_init(&zv->zv_queue);
+	mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
+
+	/* Provide both GEOM and the block device with its zvol state. */
+	pp->private = zv;
+	zv->zv_dev->si_drv1 = zv;
+	*zvp = zv;
+
+	return (0);
+}
+#endif
+
 /**
  * \brief Create a minor node (plus a whole lot more) for the specified volume.
 */
@@ -499,7 +591,7 @@
 {
 	zfs_soft_state_t *zs;
 	zvol_state_t *zv;
-	objset_t *os;
+	objset_t *os = NULL;
 	dmu_object_info_t doi;
 	int error;
 
@@ -515,56 +607,19 @@
 
 	/* lie and say we're read-only */
 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
 	if (error) {
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
-#ifdef sun
-	if ((minor = zfsdev_minor_alloc()) == 0) {
-		dmu_objset_disown(os, FTAG);
-		mutex_exit(&spa_namespace_lock);
-		return (ENXIO);
-	}
-
-	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
-		dmu_objset_disown(os, FTAG);
-		mutex_exit(&spa_namespace_lock);
-		return (EAGAIN);
-	}
-	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
-	    (char *)name);
-
-	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
-
-	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
-	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
-		ddi_soft_state_free(zfsdev_state, minor);
-		dmu_objset_disown(os, FTAG);
-		mutex_exit(&spa_namespace_lock);
-		return (EAGAIN);
-	}
-
-	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
-
-	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
-	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
-		ddi_remove_minor_node(zfs_dip, chrbuf);
-		ddi_soft_state_free(zfsdev_state, minor);
-		dmu_objset_disown(os, FTAG);
-		mutex_exit(&spa_namespace_lock);
-		return (EAGAIN);
-	}
-
-	zs = ddi_get_soft_state(zfsdev_state, minor);
-	zs->zss_type = ZSST_ZVOL;
-	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
-#else	/* !sun */
-
+#if defined(sun)
+	error = zvol_create_minor_sun(&zv, name);
+#elif defined(__FreeBSD__)
 	DROP_GIANT();
 	g_topology_lock();
-	zv = zvol_geom_create(name);
-#endif	/* !sun */
+	error = zvol_create_minor_freebsd(&zv, name);
+#endif
+
+	if (error)
+		goto out;
 
 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 	zv->zv_min_bs = DEV_BSHIFT;
@@ -587,21 +642,30 @@
 	else
 		zil_replay(os, zv, zvol_replay_vector);
 	}
-	dmu_objset_disown(os, FTAG);
-	zv->zv_objset = NULL;
+
+out:
+	if (os != NULL)
+		dmu_objset_disown(os, FTAG);
 
-	zvol_minors++;
+	if (error == 0) {
+		zvol_minors++;
+		zv->zv_objset = NULL;
+	}
 
 	mutex_exit(&spa_namespace_lock);
 
-	zvol_geom_run(zv);
+	if (error == 0)
+		zvol_geom_run(zv);
 
+#ifdef __FreeBSD__
 	g_topology_unlock();
 	PICKUP_GIANT();
+#endif
 
-	ZFS_LOG(1, "ZVOL %s created.", name);
+	if (error == 0)
+		ZFS_LOG(1, "ZVOL %s created.", name);
 
-	return (0);
+	return (error);
 }
 
 /**
@@ -1523,6 +1587,41 @@
 	zds.zv = (zvol_state_t *)dev->si_drv1;
 	return (zvol_dmu_uio_common(&zds, uio, /*flags*/0));
 }
+
+int
+zvol_freebsd_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
+    struct thread *td)
+{
+	zvol_state_t *zv = dev->si_drv1;
+	int error = 0;
+
+	if (zv == NULL)
+		return (ENXIO);
+
+	switch (cmd) {
+	case DIOCGSECTORSIZE:
+		*(u_int *)data = DEV_BSIZE;
+		break;
+	case DIOCGMEDIASIZE:
+		*(off_t *)data = zv->zv_volsize;
+		if (*(off_t *)data == 0)
+			error = ENOENT;
+		break;
+	/*
+	 * TODO: These probably need to be implemented, too.  There may be
+	 * more, see sys/geom/geom_dev.c:g_dev_ioctl().
+ */ + case DIOCGFLUSH: + case DIOCGDELETE: + case DIOCGSTRIPESIZE: + case DIOCGSTRIPEOFFSET: + /* FALLTHROUGH */ + default: + error = ENOIOCTL; + break; + } + return (error); +} #endif /* __FreeBSD__ && _KERNEL */ #ifdef sun @@ -2070,34 +2169,6 @@ } #endif /* sun */ -static zvol_state_t * -zvol_geom_create(const char *name) -{ - struct g_provider *pp; - struct g_geom *gp; - zvol_state_t *zv; - - gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); - gp->start = zvol_geom_start; - gp->access = zvol_geom_access; - pp = g_new_providerf(gp, "g%s/%s", ZVOL_DRIVER, name); - pp->sectorsize = DEV_BSIZE; - - zv = kmem_zalloc(sizeof(*zv), KM_SLEEP); - zv->zv_provider = pp; - zv->zv_state = 0; - bioq_init(&zv->zv_queue); - mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF); - - pp->private = zv; - - zv->zv_dev = make_dev(&zfs_zvol_cdevsw, /*unit*/ 0, UID_ROOT, - GID_OPERATOR, 0600, "%s/%s", ZVOL_DRIVER, name); - zv->zv_dev->si_drv1 = zv; - - return (zv); -} - static void zvol_geom_run(zvol_state_t *zv) { @@ -2282,8 +2353,8 @@ } if ((error = zvol_create_minor(sname)) != 0) { - printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", - sname, error); + printf("ZFS WARNING: Unable to create ZVOL snapshot " + "%s (error=%d).\n", sname, error); break; } } @@ -2309,12 +2380,12 @@ return (error); } if (dmu_objset_type(os) == DMU_OST_ZVOL) { - if ((error = zvol_create_minor(name)) == 0) + error = zvol_create_minor(name); + if (error == 0) error = zvol_create_snapshots(os, name); - else { - printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", - name, error); - } + if (error) + printf("ZFS WARNING: Unable to create ZVOL %s " + "(error=%d).\n", name, error); dmu_objset_rele(os, FTAG); return (error); } @@ -2343,57 +2414,65 @@ while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL, &cookie) == 0) { dmu_objset_rele(os, FTAG); - (void)zvol_create_minors(osname); - if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { - printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n", - name, error); + error = zvol_create_minors(osname); + if (error) { + kmem_free(osname, MAXPATHLEN); + return (error); + } + error = dmu_objset_hold(name, FTAG, &os); + if (error) { + printf("ZFS WARNING: Unable to put hold on %s" + " (error=%d)\n", name, error); + kmem_free(osname, MAXPATHLEN); return (error); } } dmu_objset_rele(os, FTAG); kmem_free(osname, MAXPATHLEN); - return (0); + return (error); } static void zvol_rename_minor(struct g_geom *gp, const char *newname) { - struct g_provider *pp; + struct g_provider *new_pp, *old_pp; zvol_state_t *zv; - struct cdev *old_dev; + struct cdev *new_dev, *old_dev; + int error; ASSERT(MUTEX_HELD(&spa_namespace_lock)); g_topology_assert(); - pp = LIST_FIRST(&gp->provider); - ASSERT(pp != NULL); - zv = pp->private; + old_pp = LIST_FIRST(&gp->provider); + ASSERT(old_pp != NULL); + zv = old_pp->private; ASSERT(zv != NULL); - zv->zv_provider = NULL; - g_wither_provider(pp, ENXIO); + error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &new_dev, + &zfs_zvol_cdevsw, NULL, UID_ROOT, GID_OPERATOR, 0600, "%s/%s", + ZVOL_DRIVER, newname); + if (error) { + printf("ZFS: Could not rename ZVOL %s to %s\n", + zv->zv_name, newname); + return; + } - pp = g_new_providerf(gp, "g%s/%s", ZVOL_DRIVER, newname); - pp->sectorsize = DEV_BSIZE; - pp->mediasize = zv->zv_volsize; - pp->private = zv; - zv->zv_provider = pp; + new_pp = g_new_providerf(gp, "g%s/%s", ZVOL_DRIVER, newname); + new_pp->sectorsize = DEV_BSIZE; + new_pp->mediasize = zv->zv_volsize; + new_pp->private = zv; 
strlcpy(zv->zv_name, newname, sizeof(zv->zv_name)); - g_error_provider(pp, 0); + g_error_provider(new_pp, 0); /* * We're piggybacking on the GEOM code to rename standard block * devices as well. */ + destroy_dev_sched(zv->zv_dev); + zv->zv_dev = new_dev; + zv->zv_dev->si_drv1 = zv; old_dev = zv->zv_dev; - - zv->zv_dev = make_dev(&zfs_zvol_cdevsw, /*unit*/ 0, UID_ROOT, - GID_OPERATOR, 0600, "%s/%s", ZVOL_DRIVER,newname); - zv->zv_dev->si_drv1 = zv; - - destroy_dev_sched(old_dev); - } void ==== //depot/branches/redline/projects/cow/sys/kern/subr_taskqueue.c#6 (text) ==== @@ -64,8 +64,8 @@ int tq_flags; int tq_callouts; - taskqueue_callback_fn tq_callbacks[TASKQUEUE_CALLBACK_TYPE_MAX-1]; - void *tq_cb_contexts[TASKQUEUE_CALLBACK_TYPE_MAX-1]; + taskqueue_callback_fn tq_callbacks[TASKQUEUE_CALLBACK_TYPE_MAX]; + void *tq_cb_contexts[TASKQUEUE_CALLBACK_TYPE_MAX]; }; #define TQ_FLAGS_ACTIVE (1 << 0) @@ -533,12 +533,20 @@ } taskqueue_run_locked(tq); + /* + * This thread is on its way out, so just drop the lock temporarily + * in order to call the shutdown callback. This allows the callback + * to look at the taskqueue, even just before it dies. + */ + TQ_UNLOCK(tq); + TASKQUEUE_RUN_CALLBACK(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); + TQ_LOCK(tq); + /* rendezvous with thread that asked us to terminate */ tq->tq_tcount--; wakeup_one(tq->tq_threads); TQ_UNLOCK(tq); - TASKQUEUE_RUN_CALLBACK(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); kthread_exit(); } Change 529209 by willa@willa_repo on 2012/02/24 18:03:37 Integrate change 529208 to //depot/branches/redline/projects/cow/... Fix buildworld because of higher WARNS in zfsd for type conversion. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#33 integrate Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#33 (text) ==== @@ -412,8 +412,9 @@ { dmu_buf_user_t *dbu, *next; - for (dbu = list_head(evict_list_p); dbu != NULL; dbu = next) { - next = list_next(evict_list_p, dbu); + for (dbu = (dmu_buf_user_t *)list_head(evict_list_p); dbu != NULL; + dbu = next) { + next = (dmu_buf_user_t *)list_next(evict_list_p, dbu); list_remove(evict_list_p, dbu); dbu->evict_func(dbu); } } Change 530841 by willa@willa_repo on 2012/03/09 09:05:46 Merge changes 530707 and 530816 from //depot/SpectraBSD/stable/9/... Fix one more dbuf split brain condition. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#69 integrate Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#69 (text) ==== @@ -2442,11 +2442,13 @@ mutex_enter(&db->db_mtx); DBUF_VERIFY(db); if (db->db_state & DB_FILL) { + dbuf_dirty_record_t *dr; + + dr = list_head(&db->db_dirty_records); + ASSERT(dr->dr_txg == tx->tx_txg); + ASSERT(dr != db->db_data_pending); + if (db->db_level == 0 && db->db_freed_in_flight) { - dbuf_dirty_record_t *dr; - - dr = list_head(&db->db_dirty_records); - ASSERT(dr->dr_txg == tx->tx_txg); ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty?
*/ @@ -2776,11 +2778,11 @@ DBUF_VERIFY(db); db->db_buf = NULL; dbuf_evict(db, &evict_list); - dmu_buf_destroy_user_evict_list(&evict_list); } else { mutex_exit(&db->db_mtx); dbuf_destroy(db); } + dmu_buf_destroy_user_evict_list(&evict_list); return (0); } @@ -2944,19 +2946,23 @@ * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + if (db->db_data_pending && db->db_level == 0 && dn->dn_object != DMU_META_DNODE_OBJECT && - db->db_state == DB_CACHED && db->db_data_pending) { + (db->db_state & (DB_READ|DB_PARTIAL|DB_CACHED))) { dbuf_dirty_record_t *dr = db->db_data_pending; + /* dbuf_sync_bonus does not set db_data_pending. */ + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + if (dr->dt.dl.dr_data == db->db_buf) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + arc_buf_t *buf = arc_buf_alloc(dn->dn_objset->os_spa, + db->db.db_size, db, type); - dbuf_set_data(db, - arc_buf_alloc(dn->dn_objset->os_spa, - db->db.db_size, db, type)); - bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, - db->db.db_size); + dbuf_set_data(db, buf); + if (db->db_state == DB_CACHED) + bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, + db->db.db_size); } } Change 533163 by willa@willa_repo on 2012/03/22 08:59:55 Merge from //depot/SpectraBSD/stable/9/... Include the latest changes by Justin to fix two split brain issues that involve splitting the syncer's buffer at the proper time under certain race conditions. Changes: 532251 The syncer's buffer must be unconditionally disassociated. 533053 Correct a cause of ZFS data corruption. Affected files ... ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#70 integrate Differences ... ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#70 (text) ==== @@ -637,6 +637,67 @@ return (&itr->hole); } +/* + * Perform any dbuf arc buffer splits required to guarantee + * the syncer operates on a stable buffer. + * + * \param db The dbuf to potentially split. + * \param syncer_dr The dirty record being processed by the syncer. + * \param deferred_split True if this check is being performed after a + * resolving read. + * + * If the syncer's buffer is currently "in use" in the + * open transaction group (i.e., there are active holds + * and db_data still references it), then make a copy + * before we start the write, so that any modifications + * from the open txg will not leak into this write. + * + * \note This copy does not need to be made for objects + * only modified in the syncing context (e.g. + * DNONE_DNODE blocks). + */ +static void +dbuf_syncer_split(dmu_buf_impl_t *db, dbuf_dirty_record_t *syncer_dr, + boolean_t deferred_split) +{ + if (syncer_dr && (db->db_state & DB_NOFILL) == 0 && + refcount_count(&db->db_holds) > 1 && + syncer_dr->dt.dl.dr_override_state != DR_OVERRIDDEN && + syncer_dr->dt.dl.dr_data == db->db_buf) { + dnode_t *dn = DB_DNODE(db); + spa_t *spa = dn->dn_objset->os_spa; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + arc_buf_t *buf = arc_buf_alloc(spa, db->db.db_size, db, type); + + bcopy(db->db.db_data, buf->b_data, db->db.db_size); + if (deferred_split) { + /* + * In the case of a deferred split, the + * syncer has already generated a zio that + * references the syncer's arc buffer. + * Replace the open txg buffer instead. + * No activity in the open txg can be + * occurring yet. 
A reader is waiting + * for the resolve to complete, and a + * writer hasn't gotten around to creating + * a dirty record. Otherwise this dbuf + * would have already have been split. + */ + dbuf_set_data(db, buf); + } else { + /* + * The syncer has yet to create a write + * zio and since the dbuf may be in the + * CACHED state, activity in the open + * txg may be occurring. Switch out + * the syncer's dbuf, since it can tolerate + * the change. + */ + syncer_dr->dt.dl.dr_data = buf; + } + } +} + /** * \brief Merge write ranges for a dirty record. * @@ -715,6 +776,12 @@ old_buf = dl->dr_data; dr = list_prev(&db->db_dirty_records, dr); } + + /* + * Process any deferred syncer splits now that the buffer contents + * are fully valid. + */ + dbuf_syncer_split(db, db->db_data_pending, /*deferred_split*/B_TRUE); } static void @@ -2948,7 +3015,7 @@ */ if (db->db_data_pending && db->db_level == 0 && dn->dn_object != DMU_META_DNODE_OBJECT && - (db->db_state & (DB_READ|DB_PARTIAL|DB_CACHED))) { + db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = db->db_data_pending; /* dbuf_sync_bonus does not set db_data_pending. */ @@ -2960,9 +3027,8 @@ db->db.db_size, db, type); dbuf_set_data(db, buf); - if (db->db_state == DB_CACHED) - bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, - db->db.db_size); + bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, + db->db.db_size); } } @@ -3468,28 +3534,14 @@ /* Remember if we need to defer write execution to dbuf_read_done(). */ resolve_pending = !list_is_empty(&dr->dt.dl.write_ranges); - if ((db->db_state & DB_NOFILL) == 0 && - resolve_pending == FALSE && - dn->dn_object != DMU_META_DNODE_OBJECT && - refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { - /* - * If this buffer is currently "in use" (i.e., there - * are active holds and db_data still references it), - * then make a copy before we start the write so that - * any modifications from the open txg will not leak - * into this write. - * - * NOTE: this copy does not need to be made for - * objects only modified in the syncing context (e.g. - * DNONE_DNODE blocks). - */ - int blksz = arc_buf_size(*datap); - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_buf_alloc(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); - } + /* + * Syncer splits must be deferred until the buffer contents + * are fully valid. + */ + if (resolve_pending == FALSE && + dn->dn_object != DMU_META_DNODE_OBJECT) + dbuf_syncer_split(db, dr, /*deferred_split*/B_FALSE); + /* Notify the world that this dirty record is about to write. */ db->db_data_pending = dr; Change 540537 by willa@willa_repo on 2012/05/06 21:17:42 Merge all recent ZFS work from SpectraBSD/stable/9 to the COW branch. One file was edited: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: - Change include of to . This integrate includes all of cddl/sbin/zfsd plus changes: 533161: A few updates for ztest debugging. 
533278: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: 533630: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c: 535647: MST23149: Drive pull and reseat remains degraded 535822: Adding a ZFS test suite test that simulates write errors on a d 536013: * Fix a few unchecked error conditions that led to segfaults 536017: Fix a regex, now that I have a working zfsd (not checked in) to 536918: Properly terminate truncated events with a newline (end of even 537050: * Created a new abstract Reader class that provides a common 537052: * Fix file leak in CaseFile::Serialize(): open() without close( 537062: Style changes; no functional changes. The only changes that af 537136: Fix three memory leaks: 537379: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c: 537609: Eliminate trailing spaces. No changes to the text. 537617: Fixed BUG23251 zfsd loses track of its interval timer. The key 538809: Checkpoint a few minor improvements to ztest. 538877: Fix a bug in ztest_dmu_prealloc(). 538881: Added a test that adds and removes devices belonging to pools c 538882: * BUG23637: single drive pull results in deadlock. Fixed this 539618: Change the zvol_thrash test to put a geom label on each zvol. 539900: BUG23637 revert the fix introduced by change 538882 and intro 540010: Change the poolversion test to use disks as vdevs instead of fi 540272: Add the groupshow command to the STF suite. 540388: Fix most of the bugs found by the ztest program. 540407: Add the new ZIO_SET_ERROR macro so change 540388 builds. Affected files ... ... //depot/branches/redline/projects/cow/cddl/contrib/opensolaris/cmd/ztest/ztest.c#4 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/Makefile#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/callout.cc#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/callout.h#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/case_file.cc#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/case_file.h#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/dev_ctl_event.cc#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/dev_ctl_event.h#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/vdev.cc#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/vdev.h#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/vdev_iterator.cc#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/vdev_iterator.h#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zfsd.cc#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zfsd.h#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zfsd_exception.cc#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zfsd_exception.h#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zpool_list.cc#2 integrate ... //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zpool_list.h#2 integrate ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/README#2 integrate ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/bin/scripts/Makefile#2 integrate ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/bin/scripts/groupshow.ksh#1 branch ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/commands.txt#2 integrate ... 
//depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/poolversion/cleanup.ksh#2 integrate ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/poolversion/setup.ksh#2 integrate ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/soft_errors/Makefile#1 branch ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/soft_errors/cleanup.ksh#1 branch ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/soft_errors/setup.ksh#1 branch ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/soft_errors/soft_errors.cfg#1 branch ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/soft_errors/soft_errors_001_pos.ksh#1 branch ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/zvol_thrash/Makefile#1 branch ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/zvol_thrash/cleanup.ksh#1 branch ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/zvol_thrash/setup.ksh#1 branch ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/zvol_thrash/zvol_thrash.cfg#1 branch ... //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/zvol_thrash/zvol_thrash_001_pos.ksh#1 branch ... //depot/branches/redline/projects/cow/cddl/usr.bin/zstreamdump/Makefile#3 integrate ... //depot/branches/redline/projects/cow/cddl/usr.bin/ztest/Makefile#3 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#11 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#71 edit ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#56 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#13 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#13 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h#6 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#30 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#34 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h#8 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h#4 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h#3 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c#8 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#11 integrate ... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#36 integrate Differences ... 
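A note on the ztest.c rework that follows: the test table previously looked up each test function's printable name at run time with dladdr(3), and now records it at compile time instead. The new ZI(name, iters, interval) macro expands to { name, #name, iters, interval }, so the preprocessor's stringification operator fills in zi_name when the table is built. The sketch below is a minimal, self-contained illustration of that pattern only; all identifiers in it (demo_info_t, DI(), demo_read, demo_write, demo_table) are hypothetical and do not appear in ztest:

#include <stdio.h>

typedef void demo_func_t(void);

typedef struct demo_info {
	demo_func_t	*di_func;	/* test function */
	const char	*di_name;	/* #func, captured by the macro */
} demo_info_t;

static void demo_read(void)  {}
static void demo_write(void) {}

/* Record the function pointer and its source-level name in one entry. */
#define	DI(func)	{ func, #func }

static demo_info_t demo_table[] = {
	DI(demo_read),
	DI(demo_write),
};

int
main(void)
{
	size_t i;

	/* Print each recorded name; no dynamic-linker lookup required. */
	for (i = 0; i < sizeof (demo_table) / sizeof (demo_table[0]); i++)
		printf("%s\n", demo_table[i].di_name);
	return (0);
}

Because the name travels with the table entry, reporting code can print it directly, which is why the diff can drop the Dl_info/dladdr(3) machinery and the dynamic-linker header it required.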
==== //depot/branches/redline/projects/cow/cddl/contrib/opensolaris/cmd/ztest/ztest.c#4 (text) ==== @@ -104,7 +104,6 @@ #include #include #include -#include #include #include #include @@ -223,6 +222,7 @@ typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ + const char *zi_name; /* string name of test function */ uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every seconds */ uint64_t zi_call_count; /* per-pass count */ @@ -268,37 +268,39 @@ uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ +#define ZI(name, iters, interval) { name, #name, iters, interval } + ztest_info_t ztest_info[] = { - { ztest_dmu_read_write, 1, &zopt_always }, - { ztest_dmu_write_parallel, 10, &zopt_always }, - { ztest_dmu_object_alloc_free, 1, &zopt_always }, - { ztest_dmu_commit_callbacks, 1, &zopt_always }, - { ztest_zap, 30, &zopt_always }, - { ztest_zap_parallel, 100, &zopt_always }, - { ztest_split_pool, 1, &zopt_always }, - { ztest_zil_commit, 1, &zopt_incessant }, - { ztest_zil_remount, 1, &zopt_sometimes }, - { ztest_dmu_read_write_zcopy, 1, &zopt_often }, - { ztest_dmu_objset_create_destroy, 1, &zopt_often }, - { ztest_dsl_prop_get_set, 1, &zopt_often }, - { ztest_spa_prop_get_set, 1, &zopt_sometimes }, + ZI(ztest_dmu_read_write, 1, &zopt_always ), + ZI(ztest_dmu_write_parallel, 10, &zopt_always ), + ZI(ztest_dmu_object_alloc_free, 1, &zopt_always ), + ZI(ztest_dmu_commit_callbacks, 1, &zopt_always ), + ZI(ztest_zap, 30, &zopt_always ), + ZI(ztest_zap_parallel, 100, &zopt_always ), + ZI(ztest_split_pool, 1, &zopt_always ), + ZI(ztest_zil_commit, 1, &zopt_incessant ), + ZI(ztest_zil_remount, 1, &zopt_sometimes ), + ZI(ztest_dmu_read_write_zcopy, 1, &zopt_often ), + ZI(ztest_dmu_objset_create_destroy, 1, &zopt_often ), + ZI(ztest_dsl_prop_get_set, 1, &zopt_often ), + ZI(ztest_spa_prop_get_set, 1, &zopt_sometimes ), #if 0 - { ztest_dmu_prealloc, 1, &zopt_sometimes }, + ZI(ztest_dmu_prealloc, 1, &zopt_often ), #endif - { ztest_fzap, 1, &zopt_sometimes }, - { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, - { ztest_spa_create_destroy, 1, &zopt_sometimes }, - { ztest_fault_inject, 1, &zopt_sometimes }, - { ztest_ddt_repair, 1, &zopt_sometimes }, - { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, - { ztest_reguid, 1, &zopt_sometimes }, - { ztest_spa_rename, 1, &zopt_rarely }, - { ztest_scrub, 1, &zopt_rarely }, - { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, - { ztest_vdev_attach_detach, 1, &zopt_rarely }, - { ztest_vdev_LUN_growth, 1, &zopt_rarely }, - { ztest_vdev_add_remove, 1, &zopt_vdevtime }, - { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime }, + ZI(ztest_fzap, 1, &zopt_sometimes ), + ZI(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes ), + ZI(ztest_spa_create_destroy, 1, &zopt_sometimes ), + ZI(ztest_fault_inject, 1, &zopt_sometimes ), + ZI(ztest_ddt_repair, 1, &zopt_sometimes ), + ZI(ztest_dmu_snapshot_hold, 1, &zopt_sometimes ), + ZI(ztest_reguid, 1, &zopt_often ), + ZI(ztest_spa_rename, 1, &zopt_rarely ), + ZI(ztest_scrub, 1, &zopt_rarely ), + ZI(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely ), + ZI(ztest_vdev_attach_detach, 1, &zopt_rarely ), + ZI(ztest_vdev_LUN_growth, 1, &zopt_rarely ), + ZI(ztest_vdev_add_remove, 1, &zopt_vdevtime ), + ZI(ztest_vdev_aux_add_remove, 1, &zopt_vdevtime ), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -404,6 +406,7 @@ } (void) fprintf(stderr, "%s\n", buf); fatal_msg = buf; /* to ease 
debugging */ + fflush(NULL); if (ztest_dump_core) abort(); exit(3); @@ -622,9 +625,12 @@ static void ztest_kill(ztest_shared_t *zs) { + pid_t curpid = getpid(); + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa)); - (void) kill(getpid(), SIGKILL); + printf("*** Crashing the current test process (pid %d)\n", curpid); + (void) kill(curpid, SIGKILL); } static uint64_t @@ -1427,7 +1433,6 @@ * but not always, because we also want to verify correct * behavior when the data was not recently read into cache. */ - ASSERT(offset % doi.doi_data_block_size == 0); if (ztest_random(4) != 0) { int prefetch = ztest_random(2) ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; @@ -1505,6 +1510,9 @@ return (ENOSPC); } + if (zopt_verbose >= 7) + printf("%s: freeing obj %d offset 0x%lx length 0x%lx tx %p\n", + __func__, lr->lr_foid, lr->lr_offset, lr->lr_length, tx); VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, lr->lr_length, tx) == 0); @@ -3642,14 +3650,14 @@ * We've verified all the old bufwads, and made new ones. * Now write them out. */ - dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (zopt_verbose >= 7) { - (void) printf("writing offset %llx size %llx" - " txg %llx\n", + (void) printf("writing obj %d offset %llx size %llx" + " txg %llx\n", packobj, (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } + dmu_write(os, packobj, packoff, packsize, packbuf, tx); for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; if (i != 5) { @@ -3669,6 +3677,13 @@ VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } + if (zopt_verbose >= 7) { + (void) printf("assigning obj %d offset %llx " + "size %llx txg %llx\n", bigobj, + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } if (i != 5) { dmu_assign_arcbuf(bonus_db, off, bigbuf_arcbufs[j], tx); @@ -4506,11 +4521,17 @@ vdev_file_t *vf = vd0->vdev_tsd; if (vf != NULL && ztest_random(3) == 0) { + printf("Closing fd %d for path '%s'\n", + vf->vf_vnode->v_fd, vd0->vdev_path); (void) close(vf->vf_vnode->v_fd); vf->vf_vnode->v_fd = -1; } else if (ztest_random(2) == 0) { + printf("Marking vdev '%s' not readable\n", + vd0->vdev_path); vd0->vdev_cant_read = B_TRUE; } else { + printf("Marking vdev '%s' not writable\n", + vd0->vdev_path); vd0->vdev_cant_write = B_TRUE; } guid0 = vd0->vdev_guid; @@ -4557,11 +4578,13 @@ if (islog) (void) rw_wrlock(&ztest_shared->zs_name_lock); + printf("Offlining vdev '%s'\n", vd0->vdev_path); VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); if (islog) (void) rw_unlock(&ztest_shared->zs_name_lock); } else { + printf("Onlining vdev '%s'\n", vd0->vdev_path); (void) vdev_online(spa, guid0, 0, NULL); } } @@ -4737,7 +4760,7 @@ return; if (zopt_verbose >= 3) { - (void) printf("Changed guid old %llu -> %llu\n", + (void) printf("Changed spa %p guid old %llu -> %llu\n", spa, (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); } @@ -4808,30 +4831,14 @@ int status; char zdb[MAXPATHLEN + MAXNAMELEN + 20]; char zbuf[1024]; - char *bin; - char *ztest; - char *isa; - int isalen; FILE *fp; - strlcpy(zdb, "/usr/bin/ztest", sizeof(zdb)); - - /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */ - bin = strstr(zdb, "/usr/bin/"); - ztest = strstr(bin, "/ztest"); - isa = bin + 8; - isalen = ztest - isa; - isa = strdup(isa); - /* LINTED */ - (void) sprintf(bin, - "/usr/sbin%.*s/zdb -bcc%s%s -U %s %s", - isalen, - isa, + (void) sprintf(zdb, + "zdb -bcc%s%s -U %s %s", zopt_verbose >= 3 ? 
"s" : "", zopt_verbose >= 4 ? "v" : "", spa_config_path, pool); - free(isa); if (zopt_verbose >= 5) (void) printf("Executing %s\n", strstr(zdb, "zdb ")); @@ -4977,7 +4984,7 @@ ztest_deadman_thread(void *arg) { ztest_shared_t *zs = arg; - int grace = 300; + int grace = 600; hrtime_t delta; delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace; @@ -5005,10 +5012,8 @@ atomic_add_64(&zi->zi_call_time, functime); if (zopt_verbose >= 4) { - Dl_info dli; - (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%6.2f sec in %s\n", - (double)functime / NANOSEC, dli.dli_sname); + (double)functime / NANOSEC, zi->zi_name); } } @@ -5639,14 +5644,12 @@ (void) printf("%7s %9s %s\n", "-----", "----", "--------"); for (int f = 0; f < ZTEST_FUNCS; f++) { - Dl_info dli; zi = &zs->zs_info[f]; print_time(zi->zi_call_time, timebuf); - (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%7llu %9s %s\n", (u_longlong_t)zi->zi_call_count, timebuf, - dli.dli_sname); + zi->zi_name); } (void) printf("\n"); } ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/Makefile#2 (text) ==== ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/callout.cc#2 (text) ==== @@ -167,3 +167,37 @@ setitimer(ITIMER_REAL, &timerval, NULL); } } + +timeval +Callout::TimeRemaining() const +{ + /* + * Outline: Add the m_interval for each callout in s_activeCallouts + * ahead of this, except for the first callout. Add to that the result + * of getitimer (That's because the first callout stores its original + * interval setting while the timer is ticking). + */ + itimerval timervalToAlarm; + timeval timeToExpiry; + std::list::iterator it; + + if (! IsPending() ) { + timeToExpiry.tv_sec = INT_MAX; + timeToExpiry.tv_usec = 999999; /*maximum normalized value*/ + return (timeToExpiry); + } + + timerclear(&timeToExpiry); + getitimer(ITIMER_REAL, &timervalToAlarm); + timeval& timeToAlarm = timervalToAlarm.it_value; + timeradd(&timeToExpiry, &timeToAlarm, &timeToExpiry); + + it =s_activeCallouts.begin(); + it++; /*skip the first callout in the list*/ + for (; it != s_activeCallouts.end(); it++) { + timeradd(&timeToExpiry, &(*it)->m_interval, &timeToExpiry); + if ((*it) == this) + break; + } + return (timeToExpiry); +} ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/callout.h#2 (text) ==== @@ -117,6 +117,17 @@ */ bool Reset(const timeval &interval, CalloutFunc_t *func, void *arg); + /** + * \brief Calculate the remaining time until this Callout's timer + * expires. + * + * The return value will be slightly greater than the actual time to + * expiry. + * + * If the callout is not pending, returns INT_MAX. + */ + timeval TimeRemaining() const; + private: /** * All active callouts sorted by expiration time. The callout ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/case_file.cc#2 (text) ==== @@ -40,6 +40,7 @@ */ #include #include +#include #include #include #include @@ -53,6 +54,7 @@ /*============================ Namespace Control =============================*/ using std::auto_ptr; using std::hex; +using std::ifstream; using std::stringstream; using std::setfill; using std::setw; @@ -116,8 +118,12 @@ int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles, DeSerializeSelector, /*compar*/NULL)); - if (numCaseFiles == 0 || numCaseFiles == -1) + if (numCaseFiles == -1) + return; + if (numCaseFiles == 0) { + free(caseFiles); return; + } for (int i = 0; i < numCaseFiles; i++) { @@ -138,9 +144,17 @@ void CaseFile::PurgeAll() { - /* CaseFiles remove themselves from this list on destruction. 
*/ - while (s_activeCases.size() != 0) - delete s_activeCases.front(); + /* + * Serialize casefiles before deleting them so that they can be reread + * and revalidated during BuildCaseFiles. + * CaseFiles remove themselves from this list on destruction. + */ + while (s_activeCases.size() != 0) { + CaseFile *casefile = s_activeCases.front(); + casefile->Serialize(); + delete casefile; + } + } //- CaseFile Public Methods ---------------------------------------------------- @@ -382,9 +396,7 @@ || event.Value("class") == "ereport.fs.zfs.checksum") { m_tentativeEvents.push_front(event.DeepCopy()); - if (!m_tentativeTimer.IsPending()) - m_tentativeTimer.Reset(s_removeGracePeriod, - OnGracePeriodEnded, this); + RegisterCallout(event); consumed = true; } @@ -393,6 +405,33 @@ return (consumed || closed); } + +void +CaseFile::RegisterCallout(const DevCtlEvent &event) +{ + timeval now, countdown, elapsed, timestamp, zero, remaining; + gettimeofday(&now, 0); + timestamp = event.GetTimestamp(); + timersub(&now, &timestamp, &elapsed); + timersub(&s_removeGracePeriod, &elapsed, &countdown); + /* + * If countdown is <= zero, Reset the timer to the + * smallest positive time value instead + */ + timerclear(&zero); + if (timercmp(&countdown, &zero, <=)) { + timerclear(&countdown); + countdown.tv_usec = 1; + } + + remaining = m_tentativeTimer.TimeRemaining(); + + if (!m_tentativeTimer.IsPending() + || timercmp(&countdown, &remaining, <)) + m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this); +} + + bool CaseFile::CloseIfSolved() { @@ -472,7 +511,6 @@ string evString; CaseFile *existingCaseFile(NULL); CaseFile *caseFile(NULL); - int fd(-1); try { uintmax_t poolGUID; @@ -505,8 +543,8 @@ .Find(vdevGUID)) == NULL) { /* * Either the pool no longer exists - * of this vdev is no longer a member of - * the pool. + * or this vdev is no longer a member of + * the pool. */ unlink(fullName.c_str()); return; @@ -519,27 +557,59 @@ */ caseFile = new CaseFile(Vdev(zpl.front(), vdevConf)); } - - fd = open(fullName.c_str(), O_RDONLY); - if (fd == -1) { + + ifstream caseStream(fullName.c_str()); + if (! caseStream) { throw ZfsdException("CaseFile::DeSerialize: Unable to " "read %s.\n", fileName); return; } + stringstream fakeDevdSocket(stringstream::in + | stringstream::out); + IstreamReader caseReader(&fakeDevdSocket); /* Re-load EventData */ - EventBuffer eventBuffer(fd); - while (eventBuffer.ExtractEvent(evString)) { - DevCtlEvent *event(DevCtlEvent::CreateEvent(evString)); - caseFile->m_events.push_back(event); + EventBuffer eventBuffer(caseReader); + caseStream >> std::noskipws >> std::ws; + while (!caseStream.eof()) { + /* + * Outline: + * read the beginning of a line and check it for + * "tentative". If found, discard "tentative". + * Shove into fakeDevdSocket.
+ * call ExtractEvent + * continue + */ + DevCtlEventList* destEvents; + string tentFlag("tentative "); + string line; + std::stringbuf lineBuf; + caseStream.get(lineBuf); + caseStream.ignore(); /*discard the newline character*/ + line = lineBuf.str(); + if (line.compare(0, tentFlag.size(), tentFlag) == 0) { + line.erase(0, tentFlag.size()); + destEvents = &caseFile->m_tentativeEvents; + } else { + destEvents = &caseFile->m_events; + } + fakeDevdSocket << line; + fakeDevdSocket << '\n'; + while (eventBuffer.ExtractEvent(evString)) { + DevCtlEvent *event(DevCtlEvent::CreateEvent( + evString)); + if (event != NULL) { + destEvents->push_back(event); + caseFile->RegisterCallout(*event); + } + } } - close(fd); + } catch (const ParseException &exp) { exp.Log(evString); if (caseFile != existingCaseFile) delete caseFile; - close(fd); /* * Since we can't parse the file, unlink it so we don't @@ -603,6 +673,24 @@ m_tentativeEvents.clear(); } + +void +CaseFile::SerializeEvList(const DevCtlEventList events, int fd, + const char* prefix) const +{ + if (events.empty()) + return; + for (DevCtlEventList::const_iterator curEvent = events.begin(); + curEvent != events.end(); curEvent++) { + const string &eventString((*curEvent)->GetEventString()); + + if (prefix) + write(fd, prefix, strlen(prefix)); + write(fd, eventString.c_str(), eventString.length()); + } +} + + void CaseFile::Serialize() { @@ -614,7 +702,7 @@ << "_vdev_" << VdevGUIDString() << ".case"; - if (m_events.empty()) { + if (m_events.empty() && m_tentativeEvents.empty()) { unlink(saveFile.str().c_str()); return; } @@ -625,12 +713,9 @@ saveFile.str().c_str()); return; } - for (DevCtlEventList::const_iterator curEvent = m_events.begin(); - curEvent != m_events.end(); curEvent++) { - const string &eventString((*curEvent)->GetEventString()); - - write(fd, eventString.c_str(), eventString.length()); - } + SerializeEvList(m_events, fd); + SerializeEvList(m_tentativeEvents, fd, "tentative "); + close(fd); } void ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/case_file.h#2 (text) ==== @@ -171,7 +171,12 @@ bool ReEvaluate(const ZfsEvent &event); /** - * \breif Close a case if it is no longer relevant. + * \brief Register an itimer callout for the given event, if necessary + */ + void RegisterCallout(const DevCtlEvent &event); + + /** + * \brief Close a case if it is no longer relevant. * * This method deals with cases tracking soft errors. Soft errors * will be discarded should a remove event occur within a short period @@ -210,12 +215,12 @@ static int DeSerializeSelector(const struct dirent *dirEntry); /** - * \brief Given the name of a file containing a serialized CaseFile - * object, create/update an in-core CaseFile object + * \brief Given the name of a file containing serialized events from a + * CaseFile object, create/update an in-core CaseFile object * representing the serialized data. * - * \param fileName The name of a file containing a serialized - * CaseFile object. + * \param fileName The name of a file containing serialized events + * from a CaseFile object. */ static void DeSerializeFile(const char *fileName); @@ -248,6 +253,15 @@ */ void Serialize(); + /** + * \brief Serializes the supplied event list and writes it to fd + * + * \param prefix If not NULL, this prefix will be prepended to + * every event in the file. + */ + void SerializeEvList(const DevCtlEventList events, int fd, + const char* prefix=NULL) const; + /** * \brief Unconditionally close a CaseFile. 
*/ ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/dev_ctl_event.cc#2 (text) ==== @@ -101,7 +101,7 @@ if (Type() == DISCARDED_EVENT_TYPE) priority = LOG_INFO; - syslog(priority, ToString(parsedBuffer).c_str()); + syslog(priority, "%s", ToString(parsedBuffer).c_str()); } /*-------------------------------- DevCtlEvent -------------------------------*/ @@ -236,7 +236,7 @@ void DevCtlEvent::Log(int priority) const { - syslog(priority, ToString().c_str()); + syslog(priority, "%s", ToString().c_str()); } //- DevCtlEvent Virtual Public Methods ----------------------------------------- @@ -250,6 +250,23 @@ { } +timeval +DevCtlEvent::GetTimestamp() const +{ + timeval tv_timestamp; + struct tm tm_timestamp; + + if ( ! Contains("timestamp") ) { + throw ZfsdException("Event contains no timestamp: %s", + m_eventString.c_str()); + } + strptime(Value(string("timestamp")).c_str(), "%s", &tm_timestamp); + tv_timestamp.tv_sec = mktime(&tm_timestamp); + tv_timestamp.tv_usec = 0; + return (tv_timestamp); +} + + //- DevCtlEvent Protected Methods ---------------------------------------------- DevCtlEvent::DevCtlEvent(Type type, NVPairMap &map, const string &eventString) : m_type(type), @@ -455,9 +472,19 @@ || devLabel == NULL) return (NULL); - Vdev vdev(devLabel); - degraded = vdev.State() != VDEV_STATE_HEALTHY; - return (devLabel); + try { + Vdev vdev(devLabel); + degraded = vdev.State() != VDEV_STATE_HEALTHY; + return (devLabel); + } catch (ZfsdException &exp) { + string devName = fdevname(devFd); + string devPath = _PATH_DEV + devName; + string context("DevfsEvent::ReadLabel: " + + devPath + ": "); + + exp.GetString().insert(0, context); + exp.Log(); + } } return (NULL); } ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/dev_ctl_event.h#2 (text) ==== @@ -158,7 +158,7 @@ * * All name => value data for events can be accessed via the Contains() * and Value() methods. name => value pairs for data not explicitly - * recieved as a a name => value pair are synthesized during parsing. For + * recieved as a name => value pair are synthesized during parsing. For * example, ATTACH and DETACH events have "device-name" and "parent" * name => value pairs added. */ @@ -276,6 +276,11 @@ */ virtual void Process() const; + /** + * Get the time that the event was created + */ + timeval GetTimestamp() const; + protected: /** Table entries used to map a type to a user friendly string. */ struct EventTypeRecord @@ -353,7 +358,8 @@ /** * Ingest event data from the supplied string. * - * \param eventString The string of devd event data to parse. + * \param[in] eventString The string of devd event data to parse. 
+ * \param[out] nvpairs Returns the parsed data */ static void ParseEventString(Type type, const string &eventString, NVPairMap &nvpairs); ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/vdev.cc#2 (text) ==== ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/vdev.h#2 (text) ==== ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/vdev_iterator.cc#2 (text) ==== ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/vdev_iterator.h#2 (text) ==== ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zfsd.cc#2 (text) ==== @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -86,6 +87,32 @@ int g_debug = 0; libzfs_handle_t *g_zfsHandle; +/*-------------------------------- FDReader -------------------------------*/ +//- FDReader Public Methods ---------------------------------------------------- +size_t +FDReader::in_avail() const +{ + int bytes; + if (ioctl(m_fd, FIONREAD, &bytes)) { + syslog(LOG_ERR, "ioctl FIONREAD: %s", strerror(errno)); + return (0); + } + return (bytes); +} + + +/*-------------------------------- IstreamReader ---------------------------*/ +//- IstreamReader Public Methods ---------------------------------------------- +ssize_t +IstreamReader::read(char* buf, size_t count) +{ + m_stream->read(buf, count); + if (m_stream->fail()) + return (-1); + return (m_stream->gcount()); +} + + /*-------------------------------- EventBuffer -------------------------------*/ //- EventBuffer Static Data ---------------------------------------------------- /** @@ -98,18 +125,29 @@ */ const char EventBuffer::s_eventEndTokens[] = "\n"; +/** + * Key=Value pairs are terminated by whitespace. + */ +const char EventBuffer::s_keyPairSepTokens[] = " \t\n"; + //- EventBuffer Public Methods ------------------------------------------------- -EventBuffer::EventBuffer(int fd) - : m_fd(fd), +EventBuffer::EventBuffer(Reader& reader) + : m_reader(reader), m_validLen(0), m_parsedLen(0), - m_nextEventOffset(0) + m_nextEventOffset(0), + m_synchronized(true) { } bool EventBuffer::ExtractEvent(string &eventString) { + stringstream tsField; + timeval now; + + gettimeofday(&now, NULL); + tsField << " timestamp=" << now.tv_sec; while (UnParsed() > 0 || Fill()) { @@ -122,40 +160,67 @@ continue; } - char *nextEvent(m_buf + m_nextEventOffset); - size_t startLen(strcspn(nextEvent, s_eventStartTokens)); - bool aligned(startLen == 0); - if (aligned == false) { - warnx("Re-synchronizing with devd event stream"); - m_nextEventOffset += startLen; + char *nextEvent(m_buf + m_nextEventOffset); + bool truncated(true); + size_t eventLen(strcspn(nextEvent, s_eventEndTokens)); + + if (!m_synchronized) { + /* Discard data until an end token is read. */ + if (nextEvent[eventLen] != '\0') + m_synchronized = true; + m_nextEventOffset += eventLen; m_parsedLen = m_nextEventOffset; continue; - } + } else if (nextEvent[eventLen] == '\0') { - /* - * Start tokens may be end tokens too, so skip the start - * token when trying to find the end of the event. - */ - size_t eventLen(strcspn(nextEvent + 1, s_eventEndTokens) + 1); - if (nextEvent[eventLen] == '\0') { - /* Ran out of buffer before hitting a full event. */ m_parsedLen += eventLen; - continue; - } - - if (nextEvent[eventLen] != '\n') { - warnx("Improperly terminated event encountered"); + if (m_parsedLen < MAX_EVENT_SIZE) { + /* + * Ran out of buffer before hitting + * a full event. Fill() and try again. 
+ */ + continue; + } + syslog(LOG_WARNING, + "Event exceeds event size limit of %d bytes.", + MAX_EVENT_SIZE); } else { /* * Include the normal terminator in the extracted * event data. */ eventLen += 1; + truncated = false; } m_nextEventOffset += eventLen; m_parsedLen = m_nextEventOffset; eventString.assign(nextEvent, eventLen); + + if (truncated) { + size_t fieldEnd; + + /* Break cleanly at the end of a key<=>value pair. */ + fieldEnd = eventString.find_last_of(s_keyPairSepTokens); + if (fieldEnd != string::npos) + eventString.erase(fieldEnd); + eventString += '\n'; + + m_synchronized = false; + syslog(LOG_WARNING, + "Truncated %d characters from event.", + eventLen - fieldEnd); + } + + /* + * Add a timestamp as the final field of the event if it is + * not already present. + */ + if ( eventString.find("timestamp=") == string::npos) { + eventString.insert( + eventString.find_last_not_of('\n') + 1, + tsField.str()); + } + return (true); } return (false); @@ -165,7 +230,8 @@ bool EventBuffer::Fill() { - ssize_t result; + size_t avail; + ssize_t consumed(0); /* Compact the buffer. */ if (m_nextEventOffset != 0) { @@ -177,19 +243,26 @@ } /* Fill any empty space. */ - result = read(m_fd, m_buf + m_validLen, MAX_READ_SIZE - m_validLen); - if (result == -1) { - if (errno == EINTR || errno == EAGAIN) { - return (false); - } else { - err(1, "Read from devd socket failed"); + avail = m_reader.in_avail(); + if (avail) { + size_t want; + + want = std::min(avail, MAX_READ_SIZE - m_validLen); + consumed = m_reader.read(m_buf + m_validLen, want); + if (consumed == -1) { + if (errno == EINTR) { + return (false); + } else { + err(1, "EventBuffer::Fill(): Read failed"); + } } } - m_validLen += result; + + m_validLen += consumed; /* Guarantee our buffer is always NUL terminated. */ m_buf[m_validLen] = '\0'; - return (result > 0); + return (consumed > 0); } /*--------------------------------- ZfsDaemon --------------------------------*/ @@ -198,6 +271,7 @@ bool ZfsDaemon::s_terminateEventLoop; char ZfsDaemon::s_pidFilePath[] = "/var/run/zfsd.pid"; pidfh *ZfsDaemon::s_pidFH; +FDReader* ZfsDaemon::s_reader; int ZfsDaemon::s_devdSockFD = -1; int ZfsDaemon::s_signalPipeFD[2]; bool ZfsDaemon::s_systemRescanRequested(false); @@ -284,6 +358,7 @@ void ZfsDaemon::Fini() { + PurgeCaseFiles(); ClosePIDFile(); } @@ -365,10 +440,9 @@ return (false); } - /* Don't block on reads. */ - if (fcntl(s_devdSockFD, F_SETFL, O_NONBLOCK) == -1) - err(1, "Unable to enable nonblocking behavior on devd socket"); + /* Connect the stream to the file descriptor */ + s_reader = new FDReader(s_devdSockFD); syslog(LOG_INFO, "Connection to devd successful"); return (true); } @@ -376,7 +450,10 @@ void ZfsDaemon::DisconnectFromDevd() { + delete s_reader; + s_reader = NULL; close(s_devdSockFD); + s_devdSockFD = -1; } void @@ -426,8 +503,8 @@ { char discardBuf[256]; - while (read(s_devdSockFD, discardBuf, sizeof(discardBuf)) > 0) - ; + while (s_reader->in_avail()) + s_reader->read(discardBuf, sizeof(discardBuf)); } bool @@ -506,6 +583,7 @@ event = DevCtlEvent::CreateEvent(evString); if (event != NULL) event->Process(); + delete event; } } } @@ -539,7 +617,7 @@ void ZfsDaemon::EventLoop() { - EventBuffer eventBuffer(s_devdSockFD); + EventBuffer eventBuffer(*s_reader); while (s_terminateEventLoop == false) { struct pollfd fds[2]; @@ -591,13 +669,13 @@ RescanSystem(); } - if ((fds->revents & POLLERR) != 0) { + if ((fds[0].revents & POLLERR) != 0) { /* Try reconnecting. */ syslog(LOG_INFO, "Error on socket.
Disconnecting."); break; } - if ((fds->revents & POLLHUP) != 0) { + if ((fds[0].revents & POLLHUP) != 0) { /* Try reconnecting. */ syslog(LOG_INFO, "Hup on socket. Disconnecting."); break; ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zfsd.h#2 (text) ==== @@ -42,6 +42,7 @@ #define _ZFSD_H_ #include +#include #include #include #include @@ -57,6 +58,7 @@ using std::auto_ptr; using std::map; using std::pair; +using std::istream; using std::string; /*================================ Global Data ===============================*/ @@ -74,21 +76,131 @@ #define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x)) /*============================= Class Definitions ============================*/ + +/*-------------------------------- Reader -------------------------------*/ +/** + * \brief A class that presents a common interface to both file descriptors and + * istreams . + * + * Standard C++ provides no way to create an iostream from a file descriptor or + * a FILE. The GNU, Apache, HPUX, and Solaris C++ libraries all provide + * non-standard ways to construct such a stream using similar semantics, but + * LLVM does not. Therefore this class is needed to ensure that zfsd can + * compile under LLVM. This class supports only the functionality needed by + * ZFSD; it does not implement the iostream API. + */ +class Reader +{ +public: + /** + * \brief Return the number of bytes immediately available for reading + */ + virtual size_t in_avail() const = 0; + + /** + * \brief Reads up to count bytes + * + * Whether this call blocks depends on the underlying input source. + * On error, -1 is returned, and errno will be set by the underlying + * source. + * + * \param buf Destination for the data + * \param count Maximum amount of data to read + * \returns Amount of data that was actually read + */ + virtual ssize_t read(char* buf, size_t count) = 0; +}; + + +/*-------------------------------- FDReader -------------------------------*/ +/** + * \brief Specialization of Reader that uses a file descriptor + */ +class FDReader : public Reader +{ +public: + /** + * \brief Constructor + * + * \param fd An open file descriptor. It will not be garbage + * collected by the destructor. + */ + FDReader(int fd); + + virtual size_t in_avail() const; + + virtual ssize_t read(char* buf, size_t count); + +protected: + /** Copy of the underlying file descriptor */ + int m_fd; +}; + +//- FDReader Inline Public Methods ----------------------------------------- +inline FDReader::FDReader(int fd) + : m_fd(fd) +{ +} + +inline ssize_t +FDReader::read(char* buf, size_t count) +{ + return (::read(m_fd, buf, count)); +} + + +/*-------------------------------- IstreamReader------------------------------*/ +/** + * \brief Specialization of Reader that uses a std::istream + */ +class IstreamReader : public Reader +{ +public: + /** + * Constructor + * + * \param stream Pointer to an open istream. It will not be + * garbage collected by the destructor. 
+ */ + IstreamReader(istream* stream); + + virtual size_t in_avail() const; + + virtual ssize_t read(char* buf, size_t count); + +protected: + /** Copy of the underlying stream */ + istream* m_stream; +}; + +//- IstreamReader Inline Public Methods ---------------------------------------- +inline IstreamReader::IstreamReader(istream* stream) + : m_stream(stream) +{ +} + +inline size_t +IstreamReader::in_avail() const +{ + return (m_stream->rdbuf()->in_avail()); +} + + /*-------------------------------- EventBuffer -------------------------------*/ /** - * \brief Class buffering event data from Devd and splitting it - * into individual event strings. + * \brief Class buffering event data from Devd or a similar source and + * splitting it into individual event strings. * - * Users of this class initialize it with the file descriptor associated - * with the unix domain socket connection with devd. The lifetime of - * an EventBuffer instance should match that of the file descriptor passed - * to it. This is required as data from partially received events is - * retained in the EventBuffer in order to allow reconstruction of these - * events across multiple reads of the Devd file descriptor. + * Users of this class initialize it with a Reader associated with the unix + * domain socket connection with devd or a compatible source. The lifetime of + * an EventBuffer instance should match that of the Reader passed to it. This + * is required as data from partially received events is retained in the + * EventBuffer in order to allow reconstruction of these events across multiple + * reads of the stream. * - * Once the program determines that the Devd file descriptor is ready - * for reading, the EventBuffer::ExtractEvent() should be called in a - * loop until the method returns false. + * Once the program determines that the Reader is ready for reading, the + * EventBuffer::ExtractEvent() should be called in a loop until the method + * returns false. */ class EventBuffer { @@ -96,9 +208,9 @@ /** * Constructor * - * \param fd The file descriptor on which to buffer/parse event data. + * \param reader The data source on which to buffer/parse event data. */ - EventBuffer(int fd); + EventBuffer(Reader& reader); /** * Pull a single event string out of the event buffer. @@ -119,28 +231,34 @@ */ MIN_EVENT_SIZE = 2, + /* + * The maximum event size supported by ZFSD. + * Events larger than this size (minus 1) are + * truncated at the end of the last fully received + * key/value pair. + */ + MAX_EVENT_SIZE = 8192, + /** * The maximum amount of buffer data to read at * a single time from the Devd file descriptor. - * This size matches the largest event size allowed - * in the system. */ - MAX_READ_SIZE = 1024, + MAX_READ_SIZE = MAX_EVENT_SIZE, /** * The size of EventBuffer's buffer of Devd event data. - * This is one larger than the maximum event size which - * alows us to always include a terminating NUL without - * overwriting any received data. + * This is one larger than the maximum supported event + * size, which alows us to always include a terminating + * NUL without overwriting any received data. */ - EVENT_BUFSIZE = MAX_READ_SIZE + /*NUL*/1 + EVENT_BUFSIZE = MAX_EVENT_SIZE + /*NUL*/1 }; /** The amount of data in m_buf we have yet to look at. */ - size_t UnParsed(); + size_t UnParsed() const; /** The amount of data in m_buf available for the next event. */ - size_t NextEventMaxLen(); + size_t NextEventMaxLen() const; /** Fill the event buffer with event data from Devd. 
*/ bool Fill(); @@ -151,11 +269,14 @@ /** Characters we treat as ending an event string. */ static const char s_eventEndTokens[]; + /** Characters found between successive "key=value" strings. */ + static const char s_keyPairSepTokens[]; + /** Temporary space for event data during our parsing. */ char m_buf[EVENT_BUFSIZE]; - /** Copy of the file descriptor linked to devd's domain socket. */ - int m_fd; + /** Reference to the reader linked to devd's domain socket. */ + Reader& m_reader; /** Valid bytes in m_buf. */ size_t m_validLen; @@ -165,17 +286,20 @@ /** Offset to the start token of the next event. */ size_t m_nextEventOffset; + + /** The EventBuffer is aligned and tracking event records. */ + bool m_synchronized; }; //- EventBuffer Inline Private Methods ----------------------------------------- inline size_t -EventBuffer::UnParsed() +EventBuffer::UnParsed() const { return (m_validLen - m_parsedLen); } inline size_t -EventBuffer::NextEventMaxLen() +EventBuffer::NextEventMaxLen() const { return (m_validLen - m_nextEventOffset); } @@ -350,6 +474,11 @@ static int s_devdSockFD; /** + * Reader object used by the EventBuffer + */ + static FDReader* s_reader; + + /** * Pipe file descriptors used to close races with our * signal handlers. */ ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zfsd_exception.cc#2 (text) ==== @@ -152,6 +152,6 @@ } output << m_log << endl; - syslog(LOG_ERR, output.str().c_str()); + syslog(LOG_ERR, "%s", output.str().c_str()); } ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zfsd_exception.h#2 (text) ==== ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zpool_list.cc#2 (text) ==== ==== //depot/branches/redline/projects/cow/cddl/sbin/zfsd/zpool_list.h#2 (text) ==== ==== //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/README#2 (text) ==== @@ -22,7 +22,7 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# +# # ident "@(#)README 1.8 09/05/19 SMI" # @@ -90,7 +90,7 @@ o This method uses the standard STF techniques to create a Solaris package, which will be installed under the base directory "/opt/SUNWstc-fs-zfs". - + Briefly, this build and installation is performed as follows: # set path to STF bin directory @@ -135,7 +135,7 @@ SUNWstc-stf in the global zone % sudo pkgadd -d /ws/onnv-stc2/packages/`uname -p` SUNWstc-stf - + o When testing with NFS, you should set the remote access permission for rsh/rcp on the remote server machine. You can add the permission to ~root/.rhosts file in the server, for example: @@ -151,10 +151,10 @@ 3.2.1 Configure the tests - o You could configure the test on physical disks, that means you'll need - at least one scratch disks. (Above two is recommended) Configure the two + o You could configure the test on physical disks, that means you'll need + at least one scratch disks. (Above two is recommended) Configure the two scratch disks, c0t13d0 and c0t14d0 for example: - + % cd /opt/SUNWstc-fs-zfs; stf_configure -c DISKS="c0t13d0 c0t14d0" o The test suites could also be configured on rawfiles, each of them should @@ -162,7 +162,7 @@ % mkfile 3g /var/tmp/file1 /var/tmp/file2 % cd /opt/SUNWstc-zfs - % stf_configure -c DISKS="/var/tmp/file1 /var/tmp/file2" + % stf_configure -c DISKS="/var/tmp/file1 /var/tmp/file2" o By default the test suite runs all test assertions. 
However, the test suite can be configured for test runs of varying length by @@ -171,7 +171,7 @@ will configure the test suite for the shortest possible runtime: % cd /opt/SUNWstc-fs-zfs; stf_configure -c DISKS="c0t13d0 c0t14d0" \ - -c "RUNTIME=short" + -c "RUNTIME=short" Note that hardware speed is also a significnat contributor to the runtime length of the test suite. @@ -182,7 +182,7 @@ % cd /opt/SUNWstc-fs-zfs; stf_configure -c DISKS="c0t13d0 c0t14d0" \ -c "KEEP=poolA poolB" - + o If you want to run the test suite with remote support, you should assign one or more machines as remote testing hosts. Meanwhile, you also need to specify disks for each remote host. Optionally, you can @@ -219,7 +219,7 @@ specify RHOSTS and RDISKS. Currently, only one value "remote" is supported for iscsi variable. - Here is an example + Here is an example % cd /opt/SUNWstc-fs-zfs % stf_configure -c DISKS="c0t13d0 c0t14d0" -c RHOSTS="host1" \ -c RDISKS="'detect'" -c iscsi="remote" @@ -235,7 +235,7 @@ % export DISKS="c0t13d0 c0t14d0" % export KEEP="poolA poolB" % export RUNTIME="long" - % export RHOSTS="foo1 foo2" + % export RHOSTS="foo1 foo2" % export RDISKS="'c0t1d0 c0t2d0' 'detect'" % export RTEST_ROOT="/export/tmp" % stf_configure @@ -280,7 +280,7 @@ o First, configure in the global zone to create a local zone and export the pool to the local zone. You'll need at least one scratch - disks. (Two above is recommended) You can assign a zone name, zone root + disks. (Two above is recommended) You can assign a zone name, zone root and IP address for the local zone. All parameters are optional. Syntax as, % stf_configure -c DISKS="" -c zone=new [-c zone_name=] [-c zone_root=] [-c zone_ip=] @@ -316,7 +316,7 @@ o To execute all of the modes on current system platform - % cd /opt/SUNWstc-fs-zfs; + % cd /opt/SUNWstc-fs-zfs; % /opt/SUNWstc-stf/bin/`uname -p`/stf_execute o To execute in a specific mode: @@ -332,8 +332,8 @@ 3.4 Unconfigure the suite. - o Use the STF unconfigure tool. + o Use the STF unconfigure tool. - % cd /opt/SUNWstc-fs-zfs; stf_unconfigure + % cd /opt/SUNWstc-fs-zfs; stf_unconfigure ================================================================================ ==== //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/bin/scripts/Makefile#2 (text) ==== @@ -31,7 +31,7 @@ zfs_crypto \ zpool_version zfs_version \ zpool_smi zpool_bsd \ - groupadd groupmod groupdel \ + groupadd groupmod groupdel groupshow \ useradd usermod userdel \ dumpadm swap dircmp bsddisks df \ zonecfg zlogin zoneadm svcs fstyp \ ==== //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/commands.txt#2 (text) ==== @@ -87,6 +87,7 @@ /opt/SUNWstc-fs-zfs/bin/groupadd /opt/SUNWstc-fs-zfs/bin/groupdel /opt/SUNWstc-fs-zfs/bin/groupmod +/opt/SUNWstc-fs-zfs/bin/groupshow /usr/bin/head /bin/hostname /bin/kill ==== //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/poolversion/cleanup.ksh#2 (xtext) ==== @@ -40,7 +40,4 @@ log_must $ZPOOL destroy $TESTPOOL log_must $ZPOOL destroy $TESTPOOL2 -log_must $RM /tmp/zpool_version_1.dat -log_must $RM /tmp/zpool2_version_1.dat - default_cleanup ==== //depot/branches/redline/projects/cow/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/poolversion/setup.ksh#2 (xtext) ==== @@ -37,13 +37,13 @@ log_unsupported "zpool version property not supported on this system." 
fi +verify_disk_count "$DISKS" 2 +DISKS_ARRAY=($DISKS) # create a version 1 pool -log_must $MKFILE 64m /tmp/zpool_version_1.dat -log_must $ZPOOL create -o version=1 $TESTPOOL /tmp/zpool_version_1.dat +log_must $ZPOOL create -o version=1 $TESTPOOL ${DISKS_ARRAY[0]} # create another version 1 pool -log_must $MKFILE 64m /tmp/zpool2_version_1.dat -log_must $ZPOOL create -o version=1 $TESTPOOL2 /tmp/zpool2_version_1.dat +log_must $ZPOOL create -o version=1 $TESTPOOL2 ${DISKS_ARRAY[1]} log_pass ==== //depot/branches/redline/projects/cow/cddl/usr.bin/zstreamdump/Makefile#3 (text) ==== @@ -20,7 +20,7 @@ DPADD= ${LIBM} ${LIBNVPAIR} ${LIBUMEM} ${LIBZPOOL} \ ${LIBPTHREAD} ${LIBZ} ${LIBAVL} -LDADD= -lm -lnvpair -lumem -lzpool -lpthread -lz -lavl +LDADD= -lm -lnvpair -lumem -lzpool -lpthread -lz -lavl -luutil CSTD= c99 ==== //depot/branches/redline/projects/cow/cddl/usr.bin/ztest/Makefile#3 (text) ==== @@ -18,8 +18,12 @@ CFLAGS+= -I${.CURDIR}/../../lib/libumem DPADD= ${LIBM} ${LIBNVPAIR} ${LIBUMEM} ${LIBZPOOL} \ - ${LIBPTHREAD} ${LIBAVL} -LDADD= -lm -lnvpair -lumem -lzpool -lpthread -lavl + ${LIBPTHREAD} ${LIBZ} ${LIBAVL} +LDADD= -lm -lnvpair -lumem -lzpool -lpthread -lz -lavl -luutil + +# Since there are many asserts in this program, it makes no sense to compile +# it without debugging. +CFLAGS+=-g -O0 CSTD= c99 ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c#11 (text) ==== @@ -2938,44 +2938,6 @@ } /** - * \brief Find an ARC buffer using its block pointer. - * - * \param spa The SPA associated with the buffer. - * \param bp The block pointer associated with the buffer. - * \param priv The private data associated with the buffer. - * - * \note Calling this function will place a reference on any found buffer. - * - * XXX This should be folded into arc_read_nolock somehow - */ -arc_buf_t * -arc_buf_find_bp(spa_t *spa, blkptr_t *bp, void *private) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - uint64_t guid = spa_guid(spa); - arc_buf_t *buf = NULL; - - hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), - &hash_lock); - if (hdr != NULL) { - if (hdr->b_datacnt > 0) { - add_reference(hdr, hash_lock, private); - if (HDR_BUF_AVAILABLE(hdr)) { - buf = hdr->b_buf; - ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; - } else { - buf = arc_buf_clone(buf); - } - arc_access(hdr, hash_lock); - } - mutex_exit(hash_lock); - } - return buf; -} - -/** * \brief "Read" the block block at the specified DVA (in bp) via the * cache. * @@ -3036,6 +2998,7 @@ kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = spa_load_guid(spa); + boolean_t cached_only = (*arc_flags & ARC_CACHED_ONLY) != 0; top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), @@ -3046,6 +3009,12 @@ if (HDR_IO_IN_PROGRESS(hdr)) { + /* + * Cache lookups should only occur from consumers + * that do not have any context loaded yet. This + * means that no I/O should be in progress for them. 
+ */ + ASSERT(!cached_only); if (*arc_flags & ARC_WAIT) { cv_wait(&hdr->b_cv, hash_lock); mutex_exit(hash_lock); @@ -3117,6 +3086,13 @@ uint64_t addr; boolean_t devw = B_FALSE; + if (cached_only) { + if (hdr) + mutex_exit(hash_lock); + done(NULL, NULL, private); + return (0); + } + if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists; @@ -4481,7 +4457,7 @@ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { - zio->io_error = EIO; + ZIO_SET_ERROR(zio, EIO); } if (!equal) ARCSTAT_BUMP(arcstat_l2_cksum_bad); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#71 (text) ==== @@ -37,12 +37,36 @@ #include #include #include +#include static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static zio_t *dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); +static arc_evict_func_t dbuf_do_evict; +#define IN_RANGE(x, val, y) ((val) >= (x) && (val) <= (y)) +#ifdef ZFS_DEBUG +#define DEBUG_REFCOUNT_INC(rc) refcount_acquire(&(rc)) +#define DEBUG_REFCOUNT_DEC(rc) do { \ + refcount_release(&(rc)); \ + ASSERT((rc) >= 0); \ +} while (0) +#define DEBUG_COUNTER_INC(counter) atomic_add_64(&(counter), 1) +#else +#define DEBUG_REFCOUNT_INC(rc) do { } while (0) +#define DEBUG_REFCOUNT_DEC(rc) do { } while (0) +#define DEBUG_COUNTER_INC(counter) do { } while (0) +#endif + +#define _DBUF_CONSTANT_FMT \ + " offset %"PRIu64" os %p level %d holds %"PRIi64" dirty %d state %d\n" +#define _DBUF_CONSTANT_FMT_ARGS(db) \ + (db)->db.db_offset, (db)->db_objset, (db)->db_level, \ + refcount_count(&(db)->db_holds), (db)->db_dirtycnt, (db)->db_state + +#define tmpprintf(args...) do { } while (0) + /** * \brief Global data structures and functions for the dbuf cache. */ @@ -246,8 +270,21 @@ atomic_add_64(&dbuf_hash_count, -1); } -static arc_evict_func_t dbuf_do_evict; +static void +dbuf_update_user_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_level == 0 && + db->db_user != NULL && db->db_user->user_data_ptr_ptr != NULL) { + ASSERT(!refcount_is_zero(&db->db_holds)); + *db->db_user->user_data_ptr_ptr = db->db.db_data; + } +} +/** + * DMU buffer user eviction mechanism. + * See dmu_buf_user_t about how this works. + */ static void dbuf_queue_user_evict(dmu_buf_impl_t *db, list_t *evict_list_p) { @@ -256,13 +293,104 @@ if (db->db_level != 0 || db->db_user == NULL) return; -#ifdef ZFS_DEBUG - atomic_add_64(&user_evicts, 1); -#endif + DEBUG_COUNTER_INC(user_evicts); + ASSERT(!list_link_active(&db->db_user->evict_queue_link)); list_insert_head(evict_list_p, db->db_user); db->db_user = NULL; } +/** + * \brief Update the user eviction data for the DMU buffer. + * + * \param db_fake The DMU buffer to set the data for. + * \param old_user The old user's eviction data pointer. + * \param new_user The new user's eviction data pointer. + * + * \returns NULL on success, or the existing user ptr if it's already + * been set. 
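+ *
+ * Callers treat this as a compare-and-swap on the dbuf's user.  A sketch
+ * of the intended calling pattern (caller-side names are illustrative
+ * only):
+ *
+ *	if (dmu_buf_update_user(db, old_user, new_user) != old_user) {
+ *		... another thread installed its user first; release
+ *		... new_user and adopt the winner's state instead.
+ *	}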
+ */ +dmu_buf_user_t * +dmu_buf_update_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(db->db_level == 0); + + mutex_enter(&db->db_mtx); + + if (db->db_user == old_user) { + db->db_user = new_user; + dbuf_update_user_data(db); + } else + old_user = db->db_user; + + mutex_exit(&db->db_mtx); + return (old_user); +} + +dmu_buf_user_t * +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + + return (dmu_buf_update_user(db_fake, NULL, user)); +} + +dmu_buf_user_t * +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_immediate_evict = TRUE; + return (dmu_buf_update_user(db_fake, NULL, user)); +} + +/** + * \return the db_user set with dmu_buf_update_user(), or NULL if not set. + */ +dmu_buf_user_t * +dmu_buf_get_user(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(!refcount_is_zero(&db->db_holds)); + + return (db->db_user); +} + +/** + * Clear the dbuf's ARC buffer. + */ +static void +dbuf_clear_data(dmu_buf_impl_t *db, list_t *evict_list_p) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); + db->db_buf = NULL; + dbuf_queue_user_evict(db, evict_list_p); + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "clear data"); +} + +/** + * Set the dbuf's buffer to the ARC buffer, including any associated state, + * such as db_data. + */ +static void +dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); + ASSERT(buf != NULL); + + db->db_buf = buf; + db->db_buf->b_last_dbuf = db; + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); + dbuf_update_user_data(db); +} + boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { @@ -345,6 +473,7 @@ { dnode_t *dn; dbuf_dirty_record_t *dr; + dbuf_dirty_record_t *dr_next; dbuf_dirty_record_t *pending; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -379,12 +508,15 @@ } pending = NULL; - for (dr = list_head(&db->db_dirty_records); dr != NULL; - dr = list_next(&db->db_dirty_records, dr)) { + for (dr = list_head(&db->db_dirty_records); dr != NULL; dr = dr_next) { + dr_next = list_next(&db->db_dirty_records, dr); ASSERT(dr->dr_dbuf == db); + ASSERT(dr_next == NULL || dr->dr_txg > dr_next->dr_txg); /* This DR happens to be the pending DR. */ - if (dr == db->db_data_pending) + if (dr == db->db_data_pending) { pending = dr; + ASSERT(dr_next == NULL); + } } if (db->db_data_pending != NULL) { /* The pending DR's dbuf is this dbuf. */ @@ -465,51 +597,43 @@ } } } + + /*** Dbuf state checks. */ + /* If a dbuf is partial, it can only have one dirty record. */ + ASSERT((db->db_state & DB_PARTIAL) == 0 || db->db_dirtycnt == 1); + + /* + * Returns 1 if either the bitmask is not set or those are the only + * bits set, with exceptions where they are acceptable. 
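+	 *
+	 * For example, a dbuf in state (DB_PARTIAL|DB_FILL) satisfies
+	 * BITMASK_SET(state, DB_PARTIAL, DB_FILL), since only the mask
+	 * and its allowed exception are set, whereas a dbuf in state
+	 * (DB_PARTIAL|DB_CACHED) would trip the assertion below.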
+ */ +#define BITMASK_SET(val, bitmask, exceptions) \ + (((val) & (bitmask)) == 0 || ((val) & (~(bitmask|exceptions))) == 0) +#define BITMASK_SET_EXCLUSIVE(val, bitmask) BITMASK_SET(val, bitmask, 0) + + ASSERT(BITMASK_SET_EXCLUSIVE(db->db_state, DB_UNCACHED)); + ASSERT(BITMASK_SET_EXCLUSIVE(db->db_state, DB_NOFILL)); + ASSERT(BITMASK_SET_EXCLUSIVE(db->db_state, DB_CACHED)); + ASSERT(BITMASK_SET_EXCLUSIVE(db->db_state, DB_EVICTING)); + ASSERT(BITMASK_SET(db->db_state, DB_PARTIAL, DB_FILL)); + ASSERT(BITMASK_SET(db->db_state, DB_READ, DB_FILL)); + ASSERT(BITMASK_SET(db->db_state, DB_FILL, (DB_PARTIAL|DB_READ))); +#undef BITMASK_SET_EXCLUSIVE +#undef BITMASK_SET + DB_DNODE_EXIT(db); } #endif -static void -dbuf_update_data(dmu_buf_impl_t *db) +static arc_buf_t * +dbuf_alloc_arcbuf(dmu_buf_impl_t *db) { - ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level == 0 && - db->db_user != NULL && db->db_user->user_data_ptr_ptr != NULL) { - ASSERT(!refcount_is_zero(&db->db_holds)); - *db->db_user->user_data_ptr_ptr = db->db.db_data; - } -} + spa_t *spa; + arc_buf_t *buf; -/** - * Clear the dbuf's ARC buffer. - */ -static void -dbuf_clear_data(dmu_buf_impl_t *db, list_t *evict_list_p) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); - db->db_buf = NULL; - dbuf_queue_user_evict(db, evict_list_p); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "clear data"); -} - -/** - * Set the dbuf's buffer to the ARC buffer, including any associated state, - * such as db_data. - */ -static void -dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); - db->db_buf = buf; - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); - dbuf_update_data(db); + DB_GET_SPA(&spa, db); + buf = arc_buf_alloc(spa, db->db.db_size, db, DBUF_GET_BUFC_TYPE(db)); + buf->b_last_dbuf = db; + return (buf); } /** @@ -564,7 +688,7 @@ typedef struct dbuf_dirty_record_hole_itr { /* provided data */ arc_buf_t *src; - dbuf_dirty_leaf_t *dl; + dbuf_dirty_leaf_record_t *dl; /* calculated data */ dbuf_dirty_range_t *range; /* One greater than the last valid offset in the dst buffer */ @@ -582,7 +706,7 @@ */ static inline void dbuf_dirty_record_hole_itr_init(dbuf_dirty_record_hole_itr_t *itr, - dbuf_dirty_leaf_t *dl, arc_buf_t *src_buf) + dbuf_dirty_leaf_record_t *dl, arc_buf_t *src_buf) { itr->src = src_buf; itr->dl = dl; @@ -664,11 +788,9 @@ refcount_count(&db->db_holds) > 1 && syncer_dr->dt.dl.dr_override_state != DR_OVERRIDDEN && syncer_dr->dt.dl.dr_data == db->db_buf) { - dnode_t *dn = DB_DNODE(db); - spa_t *spa = dn->dn_objset->os_spa; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - arc_buf_t *buf = arc_buf_alloc(spa, db->db.db_size, db, type); + arc_buf_t *buf; + buf = dbuf_alloc_arcbuf(db); bcopy(db->db.db_data, buf->b_data, db->db.db_size); if (deferred_split) { /* @@ -709,7 +831,7 @@ * buffer has to be copied over exclusive of those ranges. 
 */
static void
-dbuf_merge_write_ranges(dbuf_dirty_leaf_t *dl, arc_buf_t *old_buf)
+dbuf_merge_write_ranges(dbuf_dirty_leaf_record_t *dl, arc_buf_t *old_buf)
 {
 	dbuf_dirty_record_hole_itr_t itr;
 	dbuf_dirty_record_hole_t *hole;
@@ -745,7 +867,7 @@
 dbuf_resolve_ranges(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	dbuf_dirty_record_t *dr;
-	dbuf_dirty_leaf_t *dl;
+	dbuf_dirty_leaf_record_t *dl;
 	arc_buf_t *old_buf;
 
 	/* No range data is kept for non data blocks. */
@@ -803,11 +925,17 @@
 static void
 dbuf_read_complete(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
+	if (db->db_level == 0 && db->db_dirtycnt > 0) {
+		/*
+		 * Buffers in the FILL state are valid here if the read was
+		 * issued prior to a write that completely filled the buffer.
+		 */
 		ASSERT(db->db_buf != buf);
 		ASSERT(db->db_state == DB_CACHED ||
 		    db->db_state == DB_UNCACHED ||
+		    db->db_state == DB_FILL ||
 		    (db->db_state & DB_READ));
 
 		/*
		 *
@@ -816,7 +944,7 @@
		 */
		dbuf_resolve_ranges(db, buf);

-		if (db->db_state & DB_READ) {
+		if (db->db_state == DB_READ) {
			/*
			 * The most recent version of this block
			 * was waiting on this read.  Transition
@@ -825,14 +953,21 @@
			ASSERT(db->db_buf != NULL);
			DBUF_STATE_CHANGE(db, =, DB_CACHED,
			    "resolve of records in READ state");
+		} else if (db->db_state & DB_READ) {
+			/*
+			 * Clear the READ bit; let fill_done transition us
+			 * to DB_CACHED.
+			 */
+			ASSERT(db->db_state & DB_FILL);
+			DBUF_STATE_CHANGE(db, &=, ~DB_READ,
+			    "resolve of records with READ state bit set");
		}

		/*
		 * The provided buffer is no longer relevant to the
		 * current transaction group.  Discard it.
		 */
-		arc_release(buf, db);
-		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		arc_discard_buf(buf, db);

		/* Dispatch any deferred syncer writes. */
		if (db->db_data_pending != NULL &&
@@ -848,13 +983,14 @@
		    "read completed with no dirty records");
	} else {
		/*
-		 * The block was free'd or filled before this
-		 * read could complete.
+		 * The block was free'd or filled before this read could
+		 * complete.  Note that in this case, it satisfies the reader
+		 * since the frontend must already be populated.
		 */
+		ASSERT(db->db_buf != NULL);
		ASSERT(db->db_state == DB_CACHED ||
		    db->db_state == DB_UNCACHED);
-		arc_release(buf, db);
-		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		arc_discard_buf(buf, db);
	}
	DBUF_PROCESS_BUF_SETS(db, B_FALSE);
}
@@ -916,7 +1052,9 @@
 {
	int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

-	ASSERT(db->db_blkid == DMU_BONUS_BLKID);
+	if (db->db_blkid != DMU_BONUS_BLKID)
+		return B_FALSE;
+
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(DB_DNODE_HELD(db));
	ASSERT3U(bonuslen, <=, db->db.db_size);
@@ -926,9 +1064,9 @@
	bzero(db->db.db_data, DN_MAX_BONUSLEN);
	if (bonuslen)
		bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
-	dbuf_update_data(db);
+	dbuf_update_user_data(db);
	DBUF_STATE_CHANGE(db, =, DB_CACHED, "bonus buffer filled");
-	return (TRUE);
+	return (B_TRUE);
 }

 /**
@@ -944,7 +1082,7 @@
 * \returns whether any action was taken.
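 *
 * For example, a read of a never-written block in a sparse object is
 * satisfied here with a zeroed ARC buffer and no disk I/O: the dbuf
 * briefly enters DB_READ and is then completed inline via
 * dbuf_read_complete().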
*/ static boolean_t -dbuf_read_on_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t *flags) +dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t *flags) { int is_hole; @@ -964,14 +1102,25 @@ if (is_hole) { arc_buf_t *buf; - buf = arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, - db, DBUF_GET_BUFC_TYPE(db)); + buf = dbuf_alloc_arcbuf(db); bzero(buf->b_data, db->db.db_size); DBUF_STATE_CHANGE(db, =, DB_READ, "hole read satisfied"); dbuf_read_complete(db, buf); - return (TRUE); + return (B_TRUE); + } + return (B_FALSE); +} + +static void +dbuf_read_cached_done(zio_t *zio, arc_buf_t *buf, void *priv) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)priv; + + if (buf != NULL) { + ASSERT(arc_buf_frozen(buf) && !arc_released(buf)); + db->db_state = DB_READ; /* for read_complete */ + dbuf_read_complete(db, buf); } - return (FALSE); } /** @@ -1002,15 +1151,8 @@ ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || (db->db_state & DB_PARTIAL)); - /* - * 1. Read without any writes (db_buf == NULL) - * 2. Have dirty records (!list_is_empty(&db->db_dirty_records) - * 3. freed_in_flight == TRUE - */ - //ASSERT(db->db_buf == NULL); - if ((db->db_blkid == DMU_BONUS_BLKID && dbuf_read_bonus(db, dn, flags)) - || dbuf_read_on_hole(db, dn, flags)) { + if (dbuf_read_bonus(db, dn, flags) || dbuf_read_hole(db, dn, flags)) { DB_DNODE_EXIT(db); *flags |= DB_RF_CACHED; if ((*flags & DB_RF_CACHED_ONLY) == 0) @@ -1022,12 +1164,15 @@ /* Check to see if a caller only wants cached buffers. */ if (*flags & DB_RF_CACHED_ONLY) { - arc_buf_t *buf = arc_buf_find_bp(spa, db->db_blkptr, db); - if (buf != NULL) { - db->db_state = DB_READ; /* for read_complete */ - dbuf_read_complete(db, buf); + ASSERT(db->db_state == DB_UNCACHED && db->db_buf == NULL && + db->db_dirtycnt == 0); + aflags = ARC_CACHED_ONLY; + (void) arc_read(/*pio*/NULL, spa, db->db_blkptr, /*pbuf*/NULL, + dbuf_read_cached_done, db, /*priority*/0, /*zio_flags*/0, + &aflags, /*zb*/NULL); + + if (aflags & ARC_CACHED) *flags |= DB_RF_CACHED; - } DB_DNODE_EXIT(db); /* Cache lookups never drop the dbuf mutex. */ return; @@ -1186,78 +1331,6 @@ } /** - * \brief This is our just-in-time copy function. - * - * It makes a copy of buffers that have been modified in a previous transaction - * group, before we modify them in the current active group. - * - * This function is used in two places: when we are dirtying a - * buffer for the first time in a txg, and when we are freeing - * a range in a dnode that includes this buffer. - * - * Note that when we are called from dbuf_free_range() we do - * not put a hold on the buffer, we just traverse the active - * dbuf list for the dnode. - */ -static void -dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg, list_t *evict_list_p) -{ - dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db.db_data != NULL); - ASSERT(db->db_level == 0); - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); - - if (dr == NULL || - (dr->dt.dl.dr_data != - ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) - return; - - /* - * If the most recent dirty record for this dbuf has not yet synced - * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, - * or (if there no active holders) - * just null out the current db_data pointer. 
- */ - ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DMU_BONUS_BLKID) { - /* Note that the data bufs here are zio_bufs */ - dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); - } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - int size = db->db.db_size; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa; - arc_buf_t *buf; - - DB_GET_SPA(&spa, db); - buf = arc_buf_alloc(spa, size, db, type); - - if (db->db_state & (DB_READ|DB_PARTIAL)) { - /* - * Any relevant data from the last dr will be - * copied into the new buffer based on the - * write ranges placed on this buffer once - * the dbuf is resolved. - */ - dbuf_set_data(db, buf); - } else { - /* - * Disassociate the dbuf from future syncer - * operation on the previous dirty record. - */ - bcopy(db->db.db_data, buf->b_data, size); - dr->dt.dl.dr_data = buf; - } - } else { - dbuf_clear_data(db, evict_list_p); - } -} - -/** * \brief Signal that the dirty record is about to be re-dirtied after sync. * * \param dr The dirty record to update. @@ -1303,6 +1376,192 @@ } /** + * \brief Disassociate the frontend for any older transaction groups of a + * dbuf that is inside a range being freed. + * + * \param db Dbuf whose dirty records should be handled. + * \param dn Dnode for the dbuf. + * \param tx Transaction that the free range operation applies to. + * \param evict_list_p Dbuf user eviction list (see dmu_buf_user_t). + * + * This function's primary purpose is to ensure that the state of any dirty + * records affected by the operation remain consistent. + */ +static void +dbuf_free_range_disassociate_frontend(dmu_buf_impl_t *db, dnode_t *dn, + dmu_tx_t *tx, list_t *evict_list_p) +{ + dbuf_dirty_record_t *dr; + + dr = list_head(&db->db_dirty_records); + tmpprintf("%s db %p dr %p holds %d dirties %d txg %"PRIu64"\n", + __func__, db, dr, refcount_count(&db->db_holds), + db->db_dirtycnt, tx->tx_txg); + + if (dr == NULL) + return; + + if (dr->dr_txg == tx->tx_txg) { + /* + * This buffer is "in-use", re-adjust the file size to reflect + * that this buffer may contain new data when we sync. + */ + if (db->db_blkid != DMU_SPILL_BLKID && + db->db_blkid > dn->dn_maxblkid) + dn->dn_maxblkid = db->db_blkid; + /* Handle intermediate dmu_sync() calls. */ + dbuf_unoverride(dr); + + /* + * If this buffer is still waiting on data for a RMW merge, that + * data no longer applies to this buffer. Transition to cached. + */ + dbuf_dirty_record_cleanup_ranges(dr); + } else { + if (db->db_state & DB_PARTIAL) { + /* + * Schedule resolution for the older transaction + * group's dirty record before we change the dbuf's + * state and lose track of the PARTIAL state. + */ + dbuf_transition_to_read(db); + } + /* Disassociate the frontend if necessary. */ + if (dr->dt.dl.dr_data == db->db_buf) { + arc_buf_t *buf; + + buf = dbuf_alloc_arcbuf(db); + if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + + /* + * Frontend being referenced by a user, but + * this dirty record has yet to be processed + * by the syncer. + */ + ASSERT(dr != db->db_data_pending); + if (db->db_state & DB_READ) { + /* + * The reader has yet to access the + * frontend (it must wait for the + * READ->CACHED transition), so it + * is safe to replace the frontend. + */ + dbuf_set_data(db, buf); + } else { + /* + * A reader is accessing the frontend, + * so we cannot replace it. 
+				 * Disassociate by replacing the
+				 * buffer used for future syncer
+				 * operations.
+				 */
+				bcopy(db->db.db_data, buf->b_data,
+				    db->db.db_size);
+				dr->dt.dl.dr_data = buf;
+			}
+		} else {
+			/*
+			 * Foreground is currently unreferenced, but
+			 * a future access that results in a READ
+			 * will confuse in-progress resolution of
+			 * dirty records for older transactions.
+			 * Provide a buffer so any future consumers
+			 * will see a dbuf in the CACHED state.
+			 */
+			dbuf_set_data(db, buf);
+		}
+	}
+}
+
+/**
+ * \brief Dirty level 1 blocks for a free_range operation.
+ *
+ * \returns B_TRUE if an indirect block is processed.
+ */
+static boolean_t
+dbuf_free_range_indirects(dnode_t *dn, dmu_buf_impl_t *db, uint64_t start,
+    uint64_t end, dmu_tx_t *tx)
+{
+	dbuf_dirty_record_t *dr;
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	uint64_t first_l1 = start >> epbs;
+	uint64_t last_l1 = end >> epbs;
+
+	if (db->db_level == 0)
+		return (B_FALSE);
+
+	if (db->db_level == 1 && IN_RANGE(first_l1, db->db_blkid, last_l1)) {
+		mutex_enter(&db->db_mtx);
+		dr = list_head(&db->db_dirty_records);
+		if (dr != NULL && dr->dr_txg < tx->tx_txg) {
+			dbuf_add_ref(db, FTAG);
+			mutex_exit(&db->db_mtx);
+			dbuf_will_dirty(db, tx);
+			dbuf_rele(db, FTAG);
+		} else {
+			mutex_exit(&db->db_mtx);
+		}
+	}
+	return (B_TRUE);
+}
+
+static boolean_t
+dbuf_free_range_already_freed(dmu_buf_impl_t *db)
+{
+	/* XXX add comment about why these are OK */
+	if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL ||
+	    db->db_state == DB_EVICTING) {
+		ASSERT(db->db.db_data == NULL);
+		mutex_exit(&db->db_mtx);
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+static boolean_t
+dbuf_free_range_filler_will_free(dmu_buf_impl_t *db)
+{
+	if (db->db_state & DB_FILL) {
+		/*
+		 * If the buffer is currently being filled, then its
+		 * contents cannot be directly cleared.  Signal the filler
+		 * to have dbuf_fill_done perform the clear just before
+		 * transitioning the buffer to the CACHED state.
+		 */
+		db->db_freed_in_flight = TRUE;
+		mutex_exit(&db->db_mtx);
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/**
+ * \brief If a dbuf has no users, clear it.
+ *
+ * \returns B_TRUE if the dbuf was cleared.
+ */
+static boolean_t
+dbuf_clear_successful(dmu_buf_impl_t *db, list_t *evict_list_p)
+{
+
+	if (refcount_count(&db->db_holds) == 0) {
+		/* All consumers are finished, so evict the buffer */
+		ASSERT(db->db_buf != NULL);
+		dbuf_clear(db, evict_list_p);
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/**
+ * \brief Free a range of data blocks in a dnode.
+ *
+ * \param dn Dnode which the range applies to.
+ * \param start Starting block id of the range, inclusive.
+ * \param end Ending block id of the range, inclusive.
+ * \param tx Transaction to apply the free operation to.
+ *
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 * Also, if we happen across any level-1 dbufs in the
@@ -1313,154 +1572,44 @@
 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db, *db_next;
-	dbuf_dirty_record_t *dr;
-	uint64_t txg = tx->tx_txg;
-	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-	uint64_t first_l1 = start >> epbs;
-	uint64_t last_l1 = end >> epbs;
 	list_t evict_list;
 
 	dmu_buf_create_user_evict_list(&evict_list);
 
-	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
+	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
 		end = dn->dn_maxblkid;
-		last_l1 = end >> epbs;
-	}
-	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
+
+	dprintf_dnode(dn, "start=%"PRIu64" end=%"PRIu64"\n", start, end);
+
 	mutex_enter(&dn->dn_dbufs_mtx);
 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
-		if (db->db_level == 1 &&
-		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
-			mutex_enter(&db->db_mtx);
-			dr = list_head(&db->db_dirty_records);
-			if (dr != NULL && dr->dr_txg < txg) {
-				dbuf_add_ref(db, FTAG);
-				mutex_exit(&db->db_mtx);
-				dbuf_will_dirty(db, tx);
-				dbuf_rele(db, FTAG);
-			} else {
-				mutex_exit(&db->db_mtx);
-			}
-		}
-
-		if (db->db_level != 0)
+		if (dbuf_free_range_indirects(dn, db, start, end, tx))
 			continue;
-		dprintf_dbuf(db, "found buf %s\n", "");
-		if (db->db_blkid < start || db->db_blkid > end)
+		if (!IN_RANGE(start, db->db_blkid, end))
 			continue;
-
-		/* found a level 0 buffer in the range */
 		if (dbuf_undirty(db, tx))
 			continue;
 
 		mutex_enter(&db->db_mtx);
+		DBUF_VERIFY(db);
+		if (dbuf_free_range_already_freed(db) ||
+		    dbuf_free_range_filler_will_free(db) ||
+		    dbuf_clear_successful(db, &evict_list))
+			continue; /* db_mtx already exited */
 
-		if (db->db_state == DB_UNCACHED ||
-		    db->db_state == DB_NOFILL ||
-		    db->db_state == DB_EVICTING) {
-			ASSERT(db->db.db_data == NULL);
-			mutex_exit(&db->db_mtx);
-			continue;
-		}
 		/*
-		 * Our goal is to make the data visible in the current
-		 * transaction group all zeros while preserving the data
+		 * The goal is to make the data that is visible in the current
+		 * transaction group all zeros, while preserving the data
 		 * as seen in any earlier transaction groups.
		 */
-		if (db->db_state & DB_FILL) {
-			/*
-			 * If the buffer is currently being filled then we
-			 * cannot directly clear the buffer's contents.
-			 * Instead, we signal the filler by setting
-			 * db_freed_in_flight and having dbuf_fill_done do
-			 * this work just before transitioning the buffer to
-			 * the CACHED state.
-			 */
-			db->db_freed_in_flight = TRUE;
-			mutex_exit(&db->db_mtx);
-			continue;
-		}
-		/* All consumers are finished, so evict the buffer */
-		if (refcount_count(&db->db_holds) == 0) {
-			ASSERT(db->db_buf);
-			dbuf_clear(db, &evict_list);
-			continue;
-		}
-
-		/* The dbuf is referenced */
-
-		dr = list_head(&db->db_dirty_records);
-		if (dr != NULL) {
-			if (dr->dr_txg == txg) {
-				/*
-				 * This buffer is "in-use", re-adjust the file
-				 * size to reflect that this buffer may
-				 * contain new data when we sync.
-				 */
-				if (db->db_blkid != DMU_SPILL_BLKID &&
-				    db->db_blkid > dn->dn_maxblkid)
-					dn->dn_maxblkid = db->db_blkid;
-				/* Handle intermediate dmu_sync() calls. */
-				dbuf_unoverride(dr);
-
-				/*
-				 * If this buffer is still waiting on data
-				 * for a RMW merge, that data no longer applies
-				 * to this buffer.
-				 */
-				dbuf_dirty_record_cleanup_ranges(dr);
-			} else {
-				/*
-				 * This dbuf is not dirty in the open context.
- * Either uncache it (if its not referenced in - * the open context) or reset its contents to - * empty. - */ - if (db->db_state & DB_PARTIAL) { - /* - * Schedule resolution for the older - * transaction group's dirty record - * before we change the dbuf's state - * and lose track of the PARTIAL state. - */ - dbuf_transition_to_read(db); - } - dbuf_fix_old_data(db, txg, &evict_list); - } - } - /* - * If cached, zero fill the buffer. - * - * Outstanding dirty records may need to be flushed. In - * that case, transition to cached and zero fill the buffer. - */ - if (db->db_state == DB_CACHED || db->db_dirtycnt > 0) { - /* - * If there's only a reader, provide a fresh buffer. - * The reader may be a resolver rather than an user - * initiated reader, so don't assert DB_READ. - */ - if (db->db_buf == NULL) { - arc_buf_t *buf; - spa_t *spa; - - DB_GET_SPA(&spa, db); - buf = arc_buf_alloc(spa, db->db.db_size, db, - DBUF_GET_BUFC_TYPE(db)); - dbuf_set_data(db, buf); - } else - arc_release(db->db_buf, db); - - /* Now clear the contents. */ - bzero(db->db.db_data, db->db.db_size); - arc_buf_freeze(db->db_buf); - DBUF_STATE_CHANGE(db, =, DB_CACHED, - "dbuf has been freed"); - } - + dbuf_free_range_disassociate_frontend(db, dn, tx, &evict_list); + ASSERT(db->db_buf != NULL); + arc_release(db->db_buf, db); + bzero(db->db.db_data, db->db.db_size); + arc_buf_freeze(db->db_buf); + DBUF_STATE_CHANGE(db, =, DB_CACHED, "zeroed by free"); mutex_exit(&db->db_mtx); /* Process one dbuf at a time to reduce memory pressure. */ dmu_buf_process_user_evicts(&evict_list); @@ -1504,7 +1653,7 @@ static void dbuf_dirty_record_truncate_ranges(dbuf_dirty_record_t *dr, int new_size) { - dbuf_dirty_leaf_t *dl; + dbuf_dirty_leaf_record_t *dl; dbuf_dirty_range_t *range; ASSERT(MUTEX_HELD(&dr->dr_dbuf->db_mtx)); @@ -1610,120 +1759,169 @@ db->db_blkptr, os->os_spa, &zb); } -/** - * \brief Create a new dbuf dirty record for this transaction. - * - * \param db The dbuf to create the dirty record for - * \param tx The transaction to create the dirty record on - * - * \invariant The dbuf mutex must be held. - * \invariant The dnode must be referenced. - * \invariant A dirty record must not already exist for the transaction's - * transaction group. +/* + * State of the current dirtying process. Dirtying requires keeping a lot + * of state available, so using a struct to access it keeps the code sane. */ +typedef struct dbuf_dirty_state { + dmu_buf_impl_t *db; /**< Dbuf being dirtied. */ + dmu_tx_t *tx; /**< Transaction to dirty. */ + dnode_t *dn; /**< The dbuf's dnode. */ + dbuf_dirty_record_t *insert_pt; /**< DR to insert new DR after. */ + dbuf_dirty_record_t *txg_dr; /**< Dirty record for this txg. */ + boolean_t txg_already_dirty; /**< This txg already dirty? */ + boolean_t do_free_accounting; /**< Free accounting needed? */ + list_t evict_list; /**< Dbuf user eviction list. */ + + /* The below only apply to leaf blocks. */ + arc_buf_t *fill_buf; /**< Already-filled optional buffer. */ + int offset; /**< Offset of the upcoming write. */ + int size; /**< Size of the upcoming write. */ +} dbuf_dirty_state_t; + +static void +dbuf_new_dirty_record_accounting(dbuf_dirty_state_t *dds) +{ + dnode_t *dn = dds->dn; + dmu_tx_t *tx = dds->tx; + dmu_buf_impl_t *db = dds->db; + objset_t *os = dn->dn_objset; + + /* + * Only valid if not already dirty in this transaction group. 
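+	 *
+	 * In other words, callers reach this point only on the
+	 * !txg_already_dirty path, so the space accounting below is
+	 * charged at most once per dbuf per transaction group.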
+ */ + DNODE_VERIFY_DIRTYCTX(dn, tx); + + ASSERT3U(dn->dn_nlevels, >, db->db_level); + ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || + dn->dn_phys->dn_nlevels > db->db_level || + DN_NEXT_LEVEL(dn, tx->tx_txg) > db->db_level || + DN_NEXT_LEVEL(dn, tx->tx_txg - 1) > db->db_level || + DN_NEXT_LEVEL(dn, tx->tx_txg - 2) > db->db_level); + + /* + * We should only be dirtying in syncing context if it's the + * mos or we're initializing the os or it's a special object. + * However, we are allowed to dirty in syncing context provided + * we already dirtied it in open context. Hence we must make + * this assertion only if we're not already dirty. + */ + ASSERT(!dmu_tx_is_syncing(tx) || + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); + ASSERT(db->db.db_size != 0); + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + if (db->db_blkid != DMU_BONUS_BLKID) { + /* + * Update the accounting. + * Note: we delay "free accounting" until after we drop + * the db_mtx. This keeps us from grabbing other locks + * (and possibly deadlocking) in bp_get_dsize() while + * also holding the db_mtx. + */ + dnode_willuse_space(dn, db->db.db_size, tx); + if (db->db_blkid != DMU_SPILL_BLKID) + dds->do_free_accounting = dbuf_block_freeable(db); + } +} + static dbuf_dirty_record_t * -dbuf_dirty_record_create(dmu_buf_impl_t *db, dmu_tx_t *tx, list_t *evict_list_p) +dbuf_dirty_record_create(dbuf_dirty_state_t *dds) { dbuf_dirty_record_t *dr; - dnode_t *dn; + + ASSERT(MUTEX_HELD(&dds->db->db_mtx)); + ASSERT(DB_DNODE_HELD(dds->db)); + dr = list_head(&dds->db->db_dirty_records); + ASSERT(dr == NULL || dr->dr_txg != dds->tx->tx_txg); + + dbuf_new_dirty_record_accounting(dds); + + ASSERT(dds->txg_dr == NULL); + dr = kmem_zalloc(sizeof(dbuf_dirty_record_t), KM_SLEEP); + dr->dr_dbuf = dds->db; + dr->dr_txg = dds->tx->tx_txg; + dds->txg_dr = dr; + + return (dr); +} + +static void +dbuf_dirty_record_register(dbuf_dirty_state_t *dds) +{ - /* Check the invariants. */ - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(DB_DNODE_HELD(db)); - dr = list_head(&db->db_dirty_records); - ASSERT(dr == NULL || dr->dr_txg != tx->tx_txg); + ASSERT(dds->txg_dr != NULL); + list_insert_after(&dds->db->db_dirty_records, dds->insert_pt, + dds->txg_dr); - dn = DB_DNODE(db); - dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + /* This buffer is now part of this txg */ + dbuf_add_ref(dds->db, (void *)(uintptr_t)dds->tx->tx_txg); + dds->db->db_dirtycnt += 1; + ASSERT3U(dds->db->db_dirtycnt, <=, TXG_CONCURRENT_STATES); +} - if (db->db_level == 0) { - void *data_old = db->db_buf; - dbuf_dirty_leaf_t *dl = &dr->dt.dl; +static void +dbuf_dirty_record_create_indirect(dbuf_dirty_state_t *dds) +{ + dbuf_dirty_record_t *dr; - if (db->db_state != DB_NOFILL) { - if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_fix_old_data(db, tx->tx_txg, evict_list_p); - data_old = db->db.db_data; - } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { - /* - * Release the data buffer from the cache so - * that we can modify it without impacting - * possible other users of this cached data - * block. Note that indirect blocks and - * private objects are not released until the - * syncing state (since they are only modified - * then). - */ - if (db->db_buf != NULL) { - arc_release(db->db_buf, db); - dbuf_fix_old_data(db, tx->tx_txg, - evict_list_p); - data_old = db->db_buf; - } else { - /* - * Buffer hasn't been created yet - - * new dbuf that's just been dirtied. 
- */ - int size = db->db.db_size; - arc_buf_contents_t type; - spa_t *spa; + dr = dbuf_dirty_record_create(dds); + mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&dr->dt.di.dr_children, + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + dbuf_dirty_record_register(dds); +} - DB_GET_SPA(&spa, db); - type = DBUF_GET_BUFC_TYPE(db); - data_old = arc_buf_alloc(spa, size, - db, type); - dbuf_set_data(db, data_old); - } - } - ASSERT(data_old != NULL); - } - dr->dt.dl.dr_data = data_old; - dprintf_dbuf(db, "%s: dr_data=%p\n", __func__, - dr->dt.dl.dr_data); - list_create(&dl->write_ranges, sizeof(dbuf_dirty_range_t), - offsetof(dbuf_dirty_range_t, write_range_link)); - } else { - mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&dr->dt.di.dr_children, - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - } - dr->dr_dbuf = db; - dr->dr_txg = tx->tx_txg; - list_insert_head(&db->db_dirty_records, dr); +static void +dbuf_dirty_record_update_leaf(dbuf_dirty_state_t *dds) +{ + if (dds->db->db_blkid == DMU_BONUS_BLKID) + dds->txg_dr->dt.dl.dr_data = dds->db->db.db_data; + else + dds->txg_dr->dt.dl.dr_data = dds->db->db_buf; +} - /* - * Make sure that if this block was marked to be freed in this - * transaction group, that we revert that change. - */ - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - db->db_blkid != DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, /*nblks*/1, tx); - mutex_exit(&dn->dn_mtx); - db->db_freed_in_flight = FALSE; - } +static void +dbuf_dirty_record_register_as_leaf(dbuf_dirty_state_t *dds) +{ + dbuf_dirty_record_t *dr = dds->txg_dr; + dmu_buf_impl_t *db = dds->db; - /* - * This buffer is now part of this txg - */ - dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); - db->db_dirtycnt += 1; - ASSERT3U(db->db_dirtycnt, <=, TXG_CONCURRENT_STATES); + dbuf_dirty_record_update_leaf(dds); + dprintf_dbuf(db, "%s: dr_data=%p\n", __func__, dr->dt.dl.dr_data); + list_create(&dr->dt.dl.write_ranges, sizeof(dbuf_dirty_range_t), + offsetof(dbuf_dirty_range_t, write_range_link)); + dbuf_dirty_record_register(dds); +} - return (dr); +static void +dbuf_dirty_record_create_nofill(dbuf_dirty_state_t *dds) +{ + dbuf_dirty_record_t *dr; + + (void) dbuf_dirty_record_create(dds); + dbuf_dirty_record_register_as_leaf(dds); } void dbuf_dirty_verify(dmu_buf_impl_t *db, dmu_tx_t *tx) { +#ifdef ZFS_DEBUG dnode_t *dn = DB_DNODE(db); + dbuf_dirty_record_t *dr; /* Ensure that this dbuf has a transaction group and a hold */ ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_VERIFY_DIRTY_BUF(tx, db); + dr = list_head(&db->db_dirty_records); + ASSERT(dr == NULL || dr->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they @@ -1735,30 +1933,206 @@ dn->dn_objset->os_dsl_dataset == NULL); DNODE_VERIFY_DIRTYCTX(dn, tx); +#endif +} + +/** + * \brief Enter a dbuf-dirtying function. + * + * \note This function should only be called once in a dbuf-dirtying function. + * + * This function's primary purpose is to compute state that only needs to be + * computed once per dirty call. Call dbuf_dirty_compute_state if the + * function drops the mutex, for things that require re-computing. 
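+ *
+ * As a result, a dirtying function follows this general shape (a sketch
+ * of the pattern used by the dbuf_dirty_*() variants below, not an
+ * additional API):
+ *
+ *	dbuf_dirty_state_t dds;
+ *
+ *	dbuf_dirty_enter(&dds, db, tx);
+ *	... pre-dirty fixups, which may drop the dbuf mutex ...
+ *	dbuf_dirty_compute_state(&dds);
+ *	... state changes; create or update this txg's dirty record ...
+ *	dbuf_dirty_exit(&dds);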
+ */ +static void +dbuf_dirty_enter(dbuf_dirty_state_t *dds, dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; + + memset(dds, 0, sizeof(*dds)); + dds->db = db; + dds->tx = tx; + + dmu_buf_create_user_evict_list(&dds->evict_list); + DB_DNODE_ENTER(db); + dds->dn = DB_DNODE(db); + + mutex_enter(&db->db_mtx); +} + +/** + * \brief Compute the current dbuf dirty state. + * + * \note See dbuf_dirty for more information. + * \note The dbuf mutex must be held before this function is called, and + * afterwards, must not be dropped except by dbuf_dirty_exit(). + * If this is not possible, the intention was to allow a dbuf_dirty + * function to re-invoke this function after an action that might drop + * the mutex, and before continuing. Additional work may be needed. + */ +static void +dbuf_dirty_compute_state(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + dmu_tx_t *tx = dds->tx; + dbuf_dirty_record_t *dr, *newest; + + /* Only one filler allowed at a time. */ + while (db->db_state & DB_FILL) { + ASSERT(db->db_level == 0); + cv_wait(&db->db_changed, &db->db_mtx); + } + + dbuf_dirty_verify(db, tx); + if (db->db_blkid == DMU_SPILL_BLKID) + dds->dn->dn_have_spill = B_TRUE; + dnode_set_dirtyctx(dds->dn, tx, db); + + newest = list_head(&db->db_dirty_records); + + /* Only the mdn object may dirty an older txg. */ + ASSERT(newest == NULL || newest->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + + dds->insert_pt = NULL; /* Insert at head. */ + for (dr = newest; dr != NULL && dr->dr_txg > tx->tx_txg; + dr = list_next(&db->db_dirty_records, dr)) + dds->insert_pt = dr; + + if (dr != NULL && dr->dr_txg == tx->tx_txg) + dds->txg_dr = dr; + + /* + * Cache whether this TX already has a dirty record, so that upon exit, + * additional work can be done after dropping the dbuf mutex. This + * information is useful elsewhere, too. + */ + dds->txg_already_dirty = (dds->txg_dr != NULL); } +static void dbuf_dirty_parent(dbuf_dirty_state_t *dds); + /** + * \brief Exit a dbuf-dirtying function. See dbuf_dirty. + * + * \note This function should only be called once in a dbuf-dirtying function. + * + * This function's primary purpose is to verify a consistent state upon + * completing a dirty operation, then drop the mutex and dirty parent dbufs. + * It is also a good time to update free accounting. + */ +static void +dbuf_dirty_exit(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + void *front = (db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : + db->db_buf; + + ASSERT(db->db_level != 0 || dds->txg_dr->dt.dl.dr_data == front); + ASSERT(dds->txg_dr->dr_txg == dds->tx->tx_txg); + + mutex_exit(&db->db_mtx); + dmu_buf_destroy_user_evict_list(&dds->evict_list); + + if (!dds->txg_already_dirty) { + if (dds->do_free_accounting) { + /* NB: This only applies to non-SPILL/BONUS blocks. */ + blkptr_t *bp = db->db_blkptr; + objset_t *os = dds->dn->dn_objset; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? + bp_get_dsize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. + */ + ddt_prefetch(os->os_spa, bp); + dnode_willuse_space(dds->dn, -willfree, dds->tx); + } + dbuf_dirty_parent(dds); + } + + DB_DNODE_EXIT(db); +} + +/** + * \brief Dirty a nofill buffer. See dbuf_dirty. 
+ * + * NOFILL buffers are similar to regular leaf buffers only in the sense that + * they create dirty records that contain ARC buffers in each txg. They + * don't need any frontend manipulation. + */ +dbuf_dirty_record_t * +dbuf_dirty_nofill(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_state_t dds; + + ASSERT(db->db_level == 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_state & (DB_UNCACHED|DB_NOFILL|DB_CACHED)); + + dbuf_dirty_enter(&dds, db, tx); + DBUF_STATE_CHANGE(db, =, DB_NOFILL, "allocating NOFILL buffer"); + dbuf_clear_data(db, &dds.evict_list); + dbuf_dirty_compute_state(&dds); + + if (dds.txg_already_dirty) + /* + * Reset immediate write sync state if needed. + * XXX: Is this really needed for NOFILL buffers? + */ + dbuf_unoverride(dds.txg_dr); + else + dbuf_dirty_record_create_nofill(&dds); + + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} + +/** + * \brief Dirty an indirect block. See dbuf_dirty. + * + * Indirect blocks are always completely rewritten, so they don't need any + * complex frontend manipulation. + */ +static dbuf_dirty_record_t * +dbuf_dirty_indirect(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_state_t dds; + + dbuf_dirty_enter(&dds, db, tx); + dbuf_dirty_compute_state(&dds); + + if (!dds.txg_already_dirty) + dbuf_dirty_record_create_indirect(&dds); + + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} + +/** * \brief Dirty the dbuf's parent. * - * \param db The dbuf whose parent needs to be dirtied - * \param tx The transaction to dirty the parent for - * \param dr The applicable dirty record + * \param dds Dbuf dirty state. * - * \invariant The dbuf's dnode must be referenced by the caller. - * - * If the dnode's struct_rwlock is not held, it will be grabbed and dropped - * within this function. + * \note If the dnode's struct_rwlock is not held, it will be grabbed and + * dropped within this function. */ static void -dbuf_dirty_parent(dmu_buf_impl_t *db, dmu_tx_t *tx, dbuf_dirty_record_t *dr) +dbuf_dirty_parent(dbuf_dirty_state_t *dds) { - dnode_t *dn; + dnode_t *dn = dds->dn; + dmu_buf_impl_t *db = dds->db; + dmu_tx_t *tx = dds->tx; + dbuf_dirty_record_t *dr = dds->txg_dr; + int drop_struct_lock = FALSE; int txgoff = tx->tx_txg & TXG_MASK; - ASSERT(DB_DNODE_HELD(db)); - dn = DB_DNODE(db); - if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); @@ -1798,8 +2172,7 @@ rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level+1, ==, parent->db_level); - di = dbuf_dirty(parent, tx, 0, parent->db.db_size, DB_UNCACHED, - NULL); + di = dbuf_dirty_indirect(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); @@ -1846,7 +2219,7 @@ dbuf_dirty_record_check_ranges(dbuf_dirty_record_t *dr) { #ifdef ZFS_DEBUG - dbuf_dirty_leaf_t *dl; + dbuf_dirty_leaf_record_t *dl; dbuf_dirty_range_t *prev, *cur, *next; if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) @@ -1875,7 +2248,7 @@ dbuf_dirty_record_add_range(dbuf_dirty_record_t *dr, int offset, int size) { dbuf_dirty_range_t *next_range, *old_range, *range; - dbuf_dirty_leaf_t *dl; + dbuf_dirty_leaf_record_t *dl; dmu_buf_impl_t *db; dl = &dr->dt.dl; @@ -1888,7 +2261,7 @@ /* Optimization: clear the ranges if the incoming range fills. 
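	 * A full-block write supersedes any ranges recorded so far: the
	 * caller is about to supply all db_size bytes, so nothing from an
	 * older version needs to be merged in for this txg.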
*/ if (offset == 0 && size == db->db.db_size) { dbuf_dirty_record_cleanup_ranges(dr); - return; + goto out; } range = kmem_zalloc(sizeof(dbuf_dirty_range_t), KM_SLEEP); @@ -1912,9 +2285,7 @@ old_range->end = MAX(range->end, old_range->end); old_range->size = old_range->end - old_range->start; list_remove(&dl->write_ranges, old_range); -#ifdef ZFS_DEBUG - refcount_release(&dirty_ranges_in_flight); -#endif + DEBUG_REFCOUNT_DEC(dirty_ranges_in_flight); kmem_free(range, sizeof(dbuf_dirty_range_t)); range = old_range; } @@ -1926,280 +2297,366 @@ } else { /* If old_range is NULL, this does a list_insert_tail(). */ list_insert_before(&dl->write_ranges, old_range, range); -#ifdef ZFS_DEBUG - refcount_acquire(&dirty_ranges_in_flight); - atomic_add_64(&dirty_ranges_total, 1); -#endif + DEBUG_REFCOUNT_INC(dirty_ranges_in_flight); + DEBUG_COUNTER_INC(dirty_ranges_total); } dbuf_dirty_record_check_ranges(dr); + +out: + if (dr->dr_dbuf->db_state & (DB_READ|DB_PARTIAL)) + if (list_is_empty(&dr->dt.dl.write_ranges)) + DBUF_STATE_CHANGE(db, =, DB_FILL, "complete filler"); +} + +static void +dbuf_dirty_set_data(dbuf_dirty_state_t *dds) +{ + arc_buf_t *buf = dds->fill_buf; + if (buf == NULL) + buf = dbuf_alloc_arcbuf(dds->db); + dbuf_set_data(dds->db, buf); +} + +static void +dbuf_dirty_leaf_with_existing_frontend(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + int size = db->db.db_size; + dbuf_dirty_record_t *newest = list_head(&db->db_dirty_records); + boolean_t old_txg_is_frontend = !dds->txg_already_dirty && + newest != NULL && newest->dt.dl.dr_data == db->db_buf; + arc_buf_t *fill_buf = dds->fill_buf; + + ASSERT(fill_buf == NULL || fill_buf != db->db_buf); + ASSERT(refcount_count(&db->db_holds) > db->db_dirtycnt); + + /* Reset any immediate write that has occurred. */ + if (dds->txg_already_dirty) + dbuf_unoverride(dds->txg_dr); + + /* If the old txg's record owns the frontend, give it its own copy. */ + if (old_txg_is_frontend) { + if (newest == db->db_data_pending) { + /* + * The syncer or holder normally disassociate. But if + * the syncer is performing a deferred resolve, then + * it will not disassociate until the resolve + * completes. Since the syncer has already + * scheduled its write with its buffer, we must + * disassociate by replacing the frontend. + */ + ASSERT(db->db_state & (DB_READ|DB_PARTIAL)); + ASSERT(db->db_dirtycnt == 1); + dbuf_dirty_set_data(dds); + } else { + newest->dt.dl.dr_data = dbuf_alloc_arcbuf(db); + bcopy(db->db.db_data, newest->dt.dl.dr_data->b_data, + size); + arc_release(db->db_buf, db); + if (fill_buf) { + bcopy(fill_buf->b_data, db->db.db_data, size); + ASSERT(arc_released(fill_buf)); + VERIFY(arc_buf_remove_ref(fill_buf, db) == 1); + } + } + return; + } + + /* We have a filled buffer and already own the current frontend. */ + if (fill_buf) { + arc_release(db->db_buf, db); + bcopy(fill_buf->b_data, db->db.db_data, size); + ASSERT(arc_released(fill_buf)); + VERIFY(arc_buf_remove_ref(fill_buf, db) == 1); + return; + } + + /* Frontend not owned by anybody. Notify that it will be modified. */ + ASSERT(newest == NULL || fill_buf == NULL); + if (dds->txg_already_dirty) { + /* Already released on initial dirty, so just thaw. 
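+		 * (arc_release() already happened when this txg first
+		 * dirtied the buffer; thawing just discards the frozen
+		 * checksum so the contents may be modified again.)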
*/ + ASSERT(arc_released(db->db_buf)); + arc_buf_thaw(db->db_buf); + } else + arc_release(db->db_buf, db); +} + +static void +dbuf_dirty_record_create_leaf(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + dbuf_dirty_record_t *dr; + + dr = dbuf_dirty_record_create(dds); + + /* + * If this block was marked to be freed in this txg, revert that + * change. Note that db_freed_in_flight may have already been + * processed, so it can't be checked here. + */ + if (db->db_blkid != DMU_SPILL_BLKID) { + mutex_enter(&dds->dn->dn_mtx); + dnode_clear_range(dds->dn, db->db_blkid, /*nblks*/1, dds->tx); + mutex_exit(&dds->dn->dn_mtx); + db->db_freed_in_flight = FALSE; + } + dbuf_dirty_record_register_as_leaf(dds); } +static void +dbuf_dirty_leaf_common(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + + if (db->db_buf == NULL) + dbuf_dirty_set_data(dds); + else + dbuf_dirty_leaf_with_existing_frontend(dds); + ASSERT(arc_released(db->db_buf) && !arc_buf_frozen(db->db_buf)); + + if (!dds->txg_already_dirty) + dbuf_dirty_record_create_leaf(dds); + else + dbuf_dirty_record_update_leaf(dds); + + if (db->db_state != DB_CACHED) + dbuf_dirty_record_add_range(dds->txg_dr, dds->offset, + dds->size); +} + +dbuf_dirty_record_t * +dbuf_dirty_record_create_bonus(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + dbuf_dirty_record_t *newest = list_head(&db->db_dirty_records); + boolean_t last_txg_is_frontend = newest != NULL && + newest->dt.dl.dr_data == db->db.db_data; + dbuf_dirty_record_t *dr; + + if (last_txg_is_frontend) { + newest->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + bcopy(db->db.db_data, newest->dt.dl.dr_data, DN_MAX_BONUSLEN); + } + dr = dbuf_dirty_record_create(dds); + dbuf_dirty_record_register_as_leaf(dds); + return (dr); +} + /** - * \brief Mark a dbuf as dirty. + * \brief Dirty a dbuf belonging to a meta-dnode. See dbuf_dirty. + * + * Dbufs belonging to the meta-dnode object are allowed to dirty in older + * transaction groups. Additionally, they will always be overwritten in + * each transaction group, which means no complex frontend manipulation. + * simplifies the logic considerably compared to normal leaf objects. */ dbuf_dirty_record_t * -dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size, int how, - arc_buf_t *db_buf) +dbuf_dirty_mdn_object(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn; - objset_t *os; - dbuf_dirty_record_t *dr; - boolean_t do_free_accounting = B_FALSE; - boolean_t already_dirty = B_FALSE; - int txgoff = tx->tx_txg & TXG_MASK; - list_t evict_list; + dbuf_dirty_state_t dds; + + ASSERT(db->db_level == 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + dbuf_dirty_enter(&dds, db, tx); + dbuf_dirty_compute_state(&dds); + + if (db->db_buf == NULL) + dbuf_set_data(db, dbuf_alloc_arcbuf(db)); - ASSERT(how == DB_FILL || how == DB_NOFILL || how == DB_UNCACHED); + if (dds.txg_already_dirty) + dbuf_unoverride(dds.txg_dr); + else + (void) dbuf_dirty_record_create_leaf(&dds); - dmu_buf_create_user_evict_list(&evict_list); + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); +/** + * \brief Dirty a bonus dbuf. See dbuf_dirty. + * + * Bonus buffers are special in the sense that they do not use ARC buffers, + * but instead occupy space inside the dnode physical block. The dbuf + * layer's primary role is to provide a transactional mechanism for updating + * this special dnode section. 
+ * Underlying bonus blocks therefore always use
+ * special zio buffers, and never share information between transactions.
+ */
+dbuf_dirty_record_t *
+dbuf_dirty_bonus(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dbuf_dirty_state_t dds;
 
-	mutex_enter(&db->db_mtx);
+	ASSERT(db->db_blkid == DMU_BONUS_BLKID);
+	/* Can't dirty a bonus buffer without first reading it. */
+	ASSERT(db->db_state == DB_CACHED);
+	dbuf_dirty_enter(&dds, db, tx);
+	dbuf_dirty_compute_state(&dds);
 
-	/* Only one filler allowed at a time. */
-	while (db->db_state & DB_FILL) {
-		ASSERT(db->db_level == 0);
-		cv_wait(&db->db_changed, &db->db_mtx);
-	}
+	if (!dds.txg_already_dirty)
+		(void) dbuf_dirty_record_create_bonus(&dds);
 
-	dbuf_dirty_verify(db, tx);
+	dbuf_dirty_exit(&dds);
+	return (dds.txg_dr);
+}
 
-	dr = list_head(&db->db_dirty_records);
-	ASSERT(dr == NULL || dr->dr_txg <= tx->tx_txg ||
-	    db->db.db_object == DMU_META_DNODE_OBJECT);
+/**
+ * \brief Handle potential Copy-On-Write (COW) faults.
+ *
+ * This function's primary purpose is to optimize dirtying behavior in cases
+ * that are likely to involve COW faults.
+ */
+static void
+dbuf_dirty_handle_fault(dbuf_dirty_state_t *dds)
+{
+	dmu_buf_impl_t *db = dds->db;
 
+	ASSERT(db->db_level == 0);
 	if (db->db_state & DB_PARTIAL) {
-		if (dr != NULL) {
-			if (dr->dr_txg != tx->tx_txg) {
-				/*
-				 * Schedule resolution for this older
-				 * transation group before we change the
-				 * dbuf's state and lose track of the
-				 * PARTIAL state.
-				 */
-				dbuf_transition_to_read(db);
-			}
-		} else if (offset != 0 && (offset + size) != db->db.db_size) {
+		dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+		if (dr->dr_txg != dds->tx->tx_txg) {
+			/*
+			 * The newest dirty record's transaction group has
+			 * closed.  Since COW fault resolution can't be
+			 * avoided, there is no benefit to waiting until the
+			 * dirty record reaches the syncer.  Start
+			 * asynchronous fault resolution now.
+			 */
+			dbuf_transition_to_read(db);
+		}
+	} else if (db->db_state == DB_UNCACHED) {
+		int write_end = dds->offset + dds->size;
+
+		if (dds->offset != 0 && write_end != db->db.db_size) {
 			/*
-			 * Immediately issue a read if we start writing
-			 * inside the block rather than either at the
+			 * Immediately start resolving a COW fault if we start
+			 * writing inside the block rather than either at the
 			 * beginning (forward) or end (backward).  Future
 			 * writes are unlikely to fill this dbuf.
 			 */
 			dbuf_transition_to_read(db);
-		}
-	}
-
-	dnode_set_dirtyctx(dn, tx);
-
-	/* Transition to the appropriate state if needed */
-	if (how == DB_NOFILL) {
-		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
-		ASSERT(db->db_state & (DB_UNCACHED|DB_NOFILL|DB_CACHED));
-		dbuf_clear_data(db, &evict_list);
-		DBUF_STATE_CHANGE(db, =, DB_NOFILL, "allocating NOFILL buffer");
-	} else if (how == DB_FILL) {
-		if (db->db_state == DB_UNCACHED) {
-			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-			boolean_t cached = B_FALSE;
-			spa_t *spa;
-			arc_buf_t *fill_buf;
-
-			ASSERT(db->db_buf == NULL);
-			ASSERT(db->db.db_data == NULL);
-			DB_GET_SPA(&spa, db);
-			fill_buf = db_buf;
-
+		} else if (dds->size != db->db.db_size) {
 			/*
 			 * If this dirty won't fill the buffer, see if a
-			 * previous version is in the ARC.
+			 * previous version is in the ARC.  This skips the
+			 * partial buffer bookkeeping that would otherwise
+			 * be necessary.
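+			 * For example, a small rewrite of a block that
+			 * is still in the ARC can copy the cached
+			 * contents up front and dirty the dbuf as
+			 * CACHED, rather than tracking write ranges
+			 * against a PARTIAL dbuf.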
*/ - if (fill_buf == NULL && size != db->db.db_size) - cached = dbuf_read_cached(db, dn); - - if (!cached) { - if (fill_buf == NULL) - fill_buf = arc_buf_alloc(spa, - db->db.db_size, db, type); - dbuf_set_data(db, fill_buf); - if (size != db->db.db_size) - DBUF_STATE_CHANGE(db, =, - (DB_PARTIAL|DB_FILL), - "notifying of an initial " - "partial fill"); - else - DBUF_STATE_CHANGE(db, =, DB_FILL, - "notifying of a complete fill"); - } - } else if (db->db_state & (DB_READ|DB_PARTIAL)) { - DBUF_STATE_CHANGE(db, |=, DB_FILL, - "notifying of a followup partial fill"); - } else { - /* No wait on FILL is done for indirect blocks. */ - ASSERT(db->db_state == DB_CACHED || - (db->db_level != 0 && db->db_state == DB_FILL)); + dbuf_read_cached(db, dds->dn); } } +} - if (db->db_blkid == DMU_SPILL_BLKID) - dn->dn_have_spill = B_TRUE; +/** + * \brief Common dbuf_dirty_enter() replacement for leaf blocks. + */ +void +dbuf_dirty_leaf_enter(dbuf_dirty_state_t *dds, + dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size) +{ + dbuf_dirty_enter(dds, db, tx); + dds->offset = offset; + dds->size = size; /* - * If this buffer is already dirty, we're done. - * - * Find the newest dirty record that is not newer than the - * transaction's group. If there isn't one, dr == NULL. If it is - * older, it will be ignored. + * Handle COW faults prior to computing the dirty state, since + * transitioning to read drops the lock. */ - dr = list_head(&db->db_dirty_records); - while (dr != NULL && dr->dr_txg > tx->tx_txg) - dr = list_next(&db->db_dirty_records, dr); - if (dr && dr->dr_txg == tx->tx_txg) { - /* - * This transaction happens to be occurring in the same - * transaction group as the dirty record found above. - */ - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { - /* Reset immediate write sync state if needed */ - dbuf_unoverride(dr); - if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL) { - /* - * Notify ARC that the buffer will be - * modified, requiring a new checksum. - */ - arc_buf_thaw(db->db_buf); - } - } + dbuf_dirty_handle_fault(dds); + dbuf_dirty_compute_state(dds); +} - /* - * If we are assigning a buffer directly, release the - * old buffer allocated to this transaction group. - */ - if (db_buf != NULL && db_buf != db->db_buf) { - ASSERT(dr->dt.dl.dr_data == db->db_buf); - if (!arc_released(db->db_buf)) { - ASSERT(dr->dt.dl.dr_override_state == - DR_OVERRIDDEN); - arc_release(db->db_buf, db); - } - dr->dt.dl.dr_data = db_buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); - db->db_buf = NULL; - dbuf_set_data(db, db_buf); - } - already_dirty = B_TRUE; - } else { +/** + * \brief Dirty a regular leaf block. See dbuf_dirty. + * + * This function handles dirtying all user data blocks. + */ +dbuf_dirty_record_t * +dbuf_dirty_leaf(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size) +{ + dbuf_dirty_state_t dds; - /* - * Only valid if not already dirty in this transaction group. 
- */ - DNODE_VERIFY_DIRTYCTX(dn, tx); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_level == 0); - ASSERT3U(dn->dn_nlevels, >, db->db_level); - ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || - dn->dn_phys->dn_nlevels > db->db_level || - DN_NEXT_LEVEL(dn, tx->tx_txg) > db->db_level || - DN_NEXT_LEVEL(dn, tx->tx_txg - 1) > db->db_level || - DN_NEXT_LEVEL(dn, tx->tx_txg - 2) > db->db_level); + dbuf_dirty_leaf_enter(&dds, db, tx, offset, size); - /* - * We should only be dirtying in syncing context if it's the - * mos or we're initializing the os or it's a special object. - * However, we are allowed to dirty in syncing context provided - * we already dirtied it in open context. Hence we must make - * this assertion only if we're not already dirty. - */ - os = dn->dn_objset; - ASSERT(!dmu_tx_is_syncing(tx) || - DMU_OBJECT_IS_SPECIAL(dn->dn_object) || - os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); - ASSERT(db->db.db_size != 0); + if (db->db_state == DB_UNCACHED) + DBUF_STATE_CHANGE(db, =, (DB_PARTIAL|DB_FILL), + "notifying of initial partial fill"); + else if (db->db_state & (DB_READ|DB_PARTIAL)) + DBUF_STATE_CHANGE(db, |=, DB_FILL, + "notifying of followup partial fill"); + dbuf_dirty_leaf_common(&dds); - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} - if (db->db_blkid != DMU_BONUS_BLKID) { - /* - * Update the accounting. - * Note: we delay "free accounting" until after we drop - * the db_mtx. This keeps us from grabbing other locks - * (and possibly deadlocking) in bp_get_dsize() while - * also holding the db_mtx. - * - * XXX Shouldn't this conditional ignore SPILL too? - */ - dnode_willuse_space(dn, db->db.db_size, tx); - do_free_accounting = dbuf_block_freeable(db); - } +/** + * \brief Dirty a regular leaf block with a filled ARC buffer. See dbuf_dirty. + * + * This function is identical to dbuf_dirty_leaf, except that it doesn't + * have to handle partial fills, since it is always provided an already + * filled buffer that is the write data for the transaction. + */ +dbuf_dirty_record_t * +dbuf_dirty_with_arcbuf(dmu_buf_impl_t *db, dmu_tx_t *tx, arc_buf_t *fill_buf) +{ + dbuf_dirty_state_t dds; - /* - * If assigning a buffer directly, release any buffers - * that will no longer be referenced. - */ - if (db_buf != NULL && dr != NULL) { - ASSERT(db->db_level == 0); + ASSERT(db->db_level == 0); - /* - * Handle the case of the syncer or dbuf_hold() - * preemptively dissassociating the dirty record - * from the buffer used in the open transaction - * group. - */ - if (dr->dt.dl.dr_data != db->db_buf) { - ASSERT(db->db_state == DB_CACHED); - arc_release(db->db_buf, db); - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); - db->db_buf = NULL; - } - dbuf_set_data(db, db_buf); - } + dbuf_dirty_leaf_enter(&dds, db, tx, 0, db->db.db_size); + dds.fill_buf = fill_buf; - /* - * If this buffer is dirty in an old transaction group we need - * to make a copy of it so that the changes we make in this - * transaction group won't leak out when we sync the older txg. - */ - dr = dbuf_dirty_record_create(db, tx, &evict_list); - } + if (db->db_state != DB_CACHED) + DBUF_STATE_CHANGE(db, =, DB_FILL, "assigning filled buffer"); + dbuf_dirty_leaf_common(&dds); - /* Add the dirty range and do some related bookkeeping. 
*/ - if (db->db_state != DB_CACHED && db->db_level == 0) { - dbuf_dirty_record_add_range(dr, offset, size); - if ((db->db_state & DB_FILL) && - list_is_empty(&dr->dt.dl.write_ranges)) - DBUF_STATE_CHANGE(db, =, DB_FILL, - "writer fully filled"); - } + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} - mutex_exit(&db->db_mtx); - - dmu_buf_destroy_user_evict_list(&evict_list); - - if (!already_dirty) { - if (do_free_accounting && db->db_blkid != DMU_SPILL_BLKID) { - blkptr_t *bp = db->db_blkptr; - int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? - bp_get_dsize(os->os_spa, bp) : db->db.db_size; - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - ddt_prefetch(os->os_spa, bp); - dnode_willuse_space(dn, -willfree, tx); - } - - dbuf_dirty_parent(db, tx, dr); +/** + * \brief Dirty a DMU buffer. + * + * \param db Dbuf to dirty. + * \param tx Transaction to dirty the dbuf in. + * + * This function is merely a dispatcher. Different types of dbufs require + * different actions in different scenarios. However, each dbuf_dirty + * implementing function should follow the same basic order: + * + * 1. dbuf_dirty_enter (grab the dbuf mutex) + * 2. Do any pre-dirty optimizations or fixups needed. + * *** Beyond this point, the dbuf mutex must always be held. *** + * 3. dbuf_dirty_compute_state (compute the basic dbuf_dirty state) + * 4. Change the dbuf state as applicable + * 5. Make the frontend (db->db_buf) usable by the dirty record for this txg. + * 6. Create or update this txg's dirty record, if needed. + * 7. dbuf_dirty_exit, which triggers dirtying parent dbufs if this dbuf was + * not already dirty in this txg. + * + * \note The point of having separate functions is to reduce the difficulty + * of understanding what happens to each type of dbuf in a dirty. + */ +dbuf_dirty_record_t * +dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + if (db->db_blkid == DMU_BONUS_BLKID) { + return (dbuf_dirty_bonus(db, tx)); + } else if (db->db_level == 0) { + if (db->db.db_object == DMU_META_DNODE_OBJECT) + return (dbuf_dirty_mdn_object(db, tx)); + else + return (dbuf_dirty_leaf(db, tx, 0, db->db.db_size)); + } else { + return (dbuf_dirty_indirect(db, tx)); } - - DB_DNODE_EXIT(db); - return (dr); } /** @@ -2212,7 +2669,7 @@ void dbuf_dirty_record_cleanup_ranges(dbuf_dirty_record_t *dr) { - dbuf_dirty_leaf_t *dl; + dbuf_dirty_leaf_record_t *dl; dbuf_dirty_range_t *range; /* Write ranges do not apply to indirect blocks */ @@ -2223,11 +2680,122 @@ dl = &dr->dt.dl; while ((range = list_remove_head(&dl->write_ranges)) != NULL) { kmem_free(range, sizeof(dbuf_dirty_range_t)); + DEBUG_REFCOUNT_DEC(dirty_ranges_in_flight); + } +} + +/* XXX refactor dbuf_undirty_*() into dbuf_undirty(). 
 */
+static void
+dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ if (dr->dt.dl.dr_data != db->db.db_data) {
+ zio_buf_free(dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ }
+ db->db_data_pending = NULL;
+ ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
+ list_remove(&db->db_dirty_records, dr);
+ kmem_free(dr, sizeof(dbuf_dirty_record_t));
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+}
+
+static void
+dbuf_undirty_leaf(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+ if (db->db_state == DB_NOFILL)
+ return;
+
+ if (dr->dt.dl.dr_data != db->db_buf) {
+ /*
+ * What we wrote is already out of date, so
+ * just free the ARC buffer.
+ */
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ } else if (!arc_released(db->db_buf)) {
+ /*
+ * Our dbuf hasn't already been evicted, so
+ * register a callback to clean it up once
+ * its ARC buffer is released.
+ */
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ }
+}
+
+static void
+dbuf_undirty_indirect(dbuf_dirty_record_t *dr)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ /*
+ * The size of an indirect block must match what its
+ * associated dnode thinks it should be.
+ */
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ /*
+ * If the dbuf's block pointer is not a hole, evict it when
+ * its last ARC buffer hold has been released.
+ */
+ if (!BP_IS_HOLE(db->db_blkptr)) {
+ int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size);
+ ASSERT3U(dn->dn_phys->dn_maxblkid >> (db->db_level * epbs), >=,
+ db->db_blkid);
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ }
+ DB_DNODE_EXIT(db);
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+}
+
+static void
+dbuf_undirty_write(dbuf_dirty_record_t *dr, uint64_t txg)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ ASSERT(dr->dr_txg == txg);
+ /* There should be no older dirty records. */
+ ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
+ list_remove(&db->db_dirty_records, dr);
+
 #ifdef ZFS_DEBUG
- refcount_release(&dirty_ranges_in_flight);
- ASSERT(dirty_ranges_in_flight >= 0);
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == &dn->dn_phys->dn_spill);
+ DB_DNODE_EXIT(db);
+ }
 #endif
+
+ /* Clean up the dirty record.
*/ + if (db->db_level == 0) { + dbuf_undirty_leaf(dr); + dbuf_dirty_record_cleanup_ranges(dr); + list_destroy(&dr->dt.dl.write_ranges); + } else { + dbuf_undirty_indirect(dr); } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + cv_broadcast(&db->db_changed); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + db->db_data_pending = NULL; } /** @@ -2358,6 +2926,7 @@ if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { arc_buf_t *buf = db->db_buf; + tmpprintf("%s db %p clearing\n", __func__, db); ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); dbuf_clear_data(db, &evict_list); VERIFY(arc_buf_remove_ref(buf, db) == 1); @@ -2366,6 +2935,7 @@ return (1); } + tmpprintf("%s db %p undirtied\n", __func__, db); mutex_exit(&db->db_mtx); dmu_buf_destroy_user_evict_list(&evict_list); @@ -2395,7 +2965,7 @@ DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); /* Already CACHED or UNCACHED at this point */ - (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL, NULL); + (void) dbuf_dirty(db, tx); } /** @@ -2470,7 +3040,7 @@ } #endif - dbuf_dirty(db, tx, offset, size, DB_FILL, NULL); + dbuf_dirty_leaf(db, tx, offset, size); } void @@ -2478,7 +3048,7 @@ { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dbuf_dirty(db, tx, 0, db->db.db_size, DB_NOFILL, NULL); + dbuf_dirty_nofill(db, tx); } void @@ -2498,7 +3068,7 @@ while (db->db_state & DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); - dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL, NULL); + dbuf_dirty_leaf(db, tx, 0, db->db.db_size); } #pragma weak dmu_buf_fill_done = dbuf_fill_done @@ -2515,7 +3085,8 @@ ASSERT(dr->dr_txg == tx->tx_txg); ASSERT(dr != db->db_data_pending); - if (db->db_level == 0 && db->db_freed_in_flight) { + if (db->db_freed_in_flight) { + ASSERT(db->db_level == 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ @@ -2531,10 +3102,10 @@ * buffer has been fully filled. Otherwise, clear the * FILL bit, so it goes back to the steady state. */ - if (db->db_state == DB_FILL) + if (db->db_state == DB_FILL) { DBUF_STATE_CHANGE(db, =, DB_CACHED, "filler finished, complete buffer"); - else { + } else { DBUF_STATE_CHANGE(db, &=, ~DB_FILL, "filler finished, incomplete buffer"); ASSERT(db->db_state & (DB_PARTIAL|DB_READ)); @@ -2567,29 +3138,7 @@ arc_return_buf(buf, db); ASSERT(arc_released(buf)); - - mutex_enter(&db->db_mtx); - - ASSERT(db->db_state & (DB_CACHED|DB_UNCACHED|DB_PARTIAL|DB_READ)); - - /* - * If the dbuf is cached and the number of holds exceeds the number - * of dirty calls on it, then dirty it again and remove the buffer - * reference, before copying the ARC buffer to the dbuf. 
- */ - if (db->db_state == DB_CACHED && - refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL, NULL); - bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - xuio_stat_wbuf_copied(); - return; - } - - xuio_stat_wbuf_nocopy(); - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_FILL, buf); + (void) dbuf_dirty_with_arcbuf(db, tx, buf); dbuf_fill_done(db, tx); } @@ -3022,18 +3571,14 @@ ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (dr->dt.dl.dr_data == db->db_buf) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - arc_buf_t *buf = arc_buf_alloc(dn->dn_objset->os_spa, - db->db.db_size, db, type); - - dbuf_set_data(db, buf); + dbuf_set_data(db, dbuf_alloc_arcbuf(db)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } (void) refcount_add(&db->db_holds, tag); - dbuf_update_data(db); + dbuf_update_user_data(db); DBUF_VERIFY(db); /* If a reading buffer set is associated, add the callback now. */ if (buf_set != NULL && (buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ)) { @@ -3238,66 +3783,6 @@ } /** - * \brief Update the user eviction data for the DMU buffer. - * - * \param db_fake The DMU buffer to set the data for. - * \param old_user The old user's eviction data pointer. - * \param new_user The new user's eviction data pointer. - * - * \returns NULL on success, or the existing user ptr if it's already - * been set. - * - * dmu_evict_user() will call the evict_func for all buffers in a - * objset with a given pageout func. - */ -dmu_buf_user_t * -dmu_buf_update_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, - dmu_buf_user_t *new_user) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_level == 0); - - mutex_enter(&db->db_mtx); - - if (db->db_user == old_user) { - db->db_user = new_user; - dbuf_update_data(db); - } else - old_user = db->db_user; - - mutex_exit(&db->db_mtx); - return (old_user); -} - -dmu_buf_user_t * -dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - - return (dmu_buf_update_user(db_fake, NULL, user)); -} - -dmu_buf_user_t * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_immediate_evict = TRUE; - return (dmu_buf_update_user(db_fake, NULL, user)); -} - -/** - * \return the db_user set with dmu_buf_update_user(), or NULL if not set. - */ -dmu_buf_user_t * -dmu_buf_get_user(dmu_buf_t *db_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(!refcount_is_zero(&db->db_holds)); - - return (db->db_user); -} - -/** * \brief Tells if the given dbuf is freeable. 
*/ boolean_t @@ -3396,6 +3881,7 @@ /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); mutex_exit(&db->db_mtx); zio = dr->dr_zio = dbuf_write(dr, db->db_buf, tx); @@ -3408,42 +3894,26 @@ } static void -dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx, arc_buf_t **datap) +dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - uint64_t txg = tx->tx_txg; + void *data = dr->dt.dl.dr_data; dnode_t *dn; + ASSERT3U(db->db_level, ==, 0); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(DB_DNODE_HELD(db)); ASSERT(db->db_blkid == DMU_BONUS_BLKID); + ASSERT(data != NULL); + dn = DB_DNODE(db); + ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); - ASSERT(*datap != NULL); - ASSERT3U(db->db_level, ==, 0); - ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); - bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + bcopy(data, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); - if (*datap != db->db.db_data) { - zio_buf_free(*datap, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - } - db->db_data_pending = NULL; - ASSERT(list_next(&db->db_dirty_records, dr) == NULL); - ASSERT(dr->dr_dbuf == db); - list_remove(&db->db_dirty_records, dr); - if (dr->dr_dbuf->db_level != 0) { - list_destroy(&dr->dt.di.dr_children); - mutex_destroy(&dr->dt.di.dr_mtx); - } - dbuf_dirty_record_cleanup_ranges(dr); - if (db->db_level == 0) - list_destroy(&dr->dt.dl.write_ranges); - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_undirty_bonus(dr); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); } static void @@ -3469,7 +3939,6 @@ */ ASSERT(arc_released(*datap)); dbuf_transition_to_read(db); - ASSERT(db->db_state & (DB_CACHED|DB_READ)); } /* @@ -3507,7 +3976,7 @@ * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_sync_bonus(dr, tx, datap); + dbuf_sync_bonus(dr, tx); return; } @@ -3538,12 +4007,13 @@ * Syncer splits must be deferred until the buffer contents * are fully valid. */ - if (resolve_pending == FALSE && + if (resolve_pending == B_FALSE && dn->dn_object != DMU_META_DNODE_OBJECT) dbuf_syncer_split(db, dr, /*deferred_split*/B_FALSE); /* Notify the world that this dirty record is about to write. */ db->db_data_pending = dr; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); mutex_exit(&db->db_mtx); @@ -3723,85 +4193,8 @@ * Now that the write is completed, the dirty record it resolves is * no longer needed, so remove it. */ - dr = db->db_data_pending; - ASSERT(!list_link_active(&dr->dr_dirty_node)); - ASSERT(dr->dr_txg == txg); - ASSERT(dr->dr_dbuf == db); - /* There should be no older dirty records. */ - ASSERT(list_next(&db->db_dirty_records, dr) == NULL); - list_remove(&db->db_dirty_records, dr); - -#ifdef ZFS_DEBUG - if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); - ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && - db->db_blkptr == &dn->dn_phys->dn_spill); - DB_DNODE_EXIT(db); - } -#endif - - /* Clean up the dirty record. */ - if (db->db_level == 0) { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != db->db_buf) { - /* - * What we wrote is already out of date, so - * just free the ARC buffer. 
- */
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
- db) == 1);
- } else if (!arc_released(db->db_buf)) {
- /*
- * Our dbuf hasn't already been evicted, so
- * register a callback to clean it up once
- * its ARC buffer is released.
- */
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- }
- }
- } else {
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- /*
- * The size of an indirect block must match what its
- * associated dnode thinks it should be.
- */
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- /*
- * If the dbuf's block pointer is not a hole, evict it when
- * its last ARC buffer hold has been released.
- */
- if (!BP_IS_HOLE(db->db_blkptr)) {
- int epbs =
- dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
- db->db.db_size);
- ASSERT3U(dn->dn_phys->dn_maxblkid
- >> (db->db_level * epbs), >=, db->db_blkid);
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- }
- DB_DNODE_EXIT(db);
- mutex_destroy(&dr->dt.di.dr_mtx);
- list_destroy(&dr->dt.di.dr_children);
- }
- dbuf_dirty_record_cleanup_ranges(dr);
- if (db->db_level == 0)
- list_destroy(&dr->dt.dl.write_ranges);
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
-
- cv_broadcast(&db->db_changed);
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- db->db_data_pending = NULL;
+ ASSERT(db->db_data_pending->dr_dbuf == db);
+ dbuf_undirty_write(db->db_data_pending, txg);
 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 }

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c#56 (text) ====

@@ -866,6 +866,8 @@
 int
 dmu_thread_context_create(void)
 {
+ int ret = 0;
+#ifdef _KERNEL /* XXX TSD only works in the kernel. FIXME! */
 dmu_cb_state_t *dcs;

 /* This function should never be called more than once in a thread. */
@@ -878,7 +880,16 @@
 dcs = kmem_zalloc(sizeof(dmu_cb_state_t), KM_SLEEP);
 list_create(&dcs->io_list, sizeof(dmu_context_node_t),
 offsetof(dmu_context_node_t, dcn_link));
- return tsd_set(zfs_async_io_key, dcs);
+
+ ret = tsd_set(zfs_async_io_key, dcs);
+#ifdef ZFS_DEBUG
+ {
+ dmu_cb_state_t *check = tsd_get(zfs_async_io_key);
+ ASSERT(check == dcs);
+ }
+#endif
+#endif /* _KERNEL */
+ return (ret);
 }

 void
@@ -1313,7 +1324,7 @@
 ASSERT(reader || ((flags & DMU_CTX_READER_FLAGS) == 0));
 ASSERT(!reader || ((flags & DMU_CTX_WRITER_FLAGS) == 0));
 /* The NOFILL flag and a NULL data_buf go hand in hand. */
- ASSERT((flags & DMU_CTX_FLAG_NOFILL) ^ (data_buf != NULL));
+ ASSERT(((flags & DMU_CTX_FLAG_NOFILL) != 0) ^ (data_buf != NULL));

 /*
 * If the caller is a reader and didn't pass in a dnode, hold it.
@@ -2369,8 +2380,8 @@ void dmu_fini(void) { + arc_fini(); /* arc depends on l2arc */ l2arc_fini(); - arc_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c#13 (text) ==== @@ -450,10 +450,7 @@ dn->dn_assigned_txg = 0; dn->dn_dirtyctx = 0; - if (dn->dn_dirtyctx_firstset != NULL) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } + dn->dn_dirtyctx_firstset = NULL; if (dn->dn_bonus != NULL) { list_t evict_list; dmu_buf_create_user_evict_list(&evict_list); @@ -546,10 +543,7 @@ dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; - if (dn->dn_dirtyctx_firstset) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } + dn->dn_dirtyctx_firstset = NULL; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; @@ -1272,7 +1266,7 @@ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); db = dn->dn_dbuf; - (void) dbuf_dirty(db, tx, 0, db->db.db_size, DB_UNCACHED, NULL); + (void) dbuf_dirty(db, tx); dsl_dataset_dirty(os->os_dsl_dataset, tx); } @@ -1438,7 +1432,7 @@ /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); - new = dbuf_dirty(db, tx, 0, db->db.db_size, DB_UNCACHED, NULL); + new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ @@ -1470,9 +1464,10 @@ * * \param dn Dnode to mark dirty. * \param tx Transaction the dnode is being dirtied in. + * \param tag Tag to track the first dirty of this dnode. */ void -dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx) +dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag) { mutex_enter(&dn->dn_mtx); @@ -1487,6 +1482,7 @@ else dn->dn_dirtyctx = DN_DIRTY_OPEN; } + dn->dn_dirtyctx_firstset = tag; } mutex_exit(&dn->dn_mtx); } ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c#13 (text) ==== @@ -5173,6 +5173,8 @@ vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); + /* Tell userspace that the vdev is gone. */ + zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h#6 (text) ==== @@ -53,6 +53,7 @@ void *b_data; arc_evict_func_t *b_efunc; void *b_private; + void *b_last_dbuf; }; typedef enum arc_buf_contents { @@ -68,6 +69,7 @@ #define ARC_PREFETCH (1 << 3) /**< I/O is a prefetch */ #define ARC_CACHED (1 << 4) /**< I/O was already in cache */ #define ARC_L2CACHE (1 << 5) /**< cache in L2ARC */ +#define ARC_CACHED_ONLY (1 << 6) /**< cache lookup only */ /** * The following breakdows of arc_size exist for kstat only. @@ -105,6 +107,13 @@ int arc_referenced(arc_buf_t *buf); #endif +static inline void +arc_discard_buf(arc_buf_t *buf, void *tag) +{ + arc_release(buf, tag); + VERIFY(arc_buf_remove_ref(buf, tag) == 1); +} + int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *priv, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb); ==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h#30 (text) ==== @@ -263,12 +263,12 @@ * have child DRs, each associated with its child dbufs. Finally, the leaf * DRs contain the ARC buffer containing the data to be written. */ -typedef struct dbuf_dirty_indirect { +typedef struct dbuf_dirty_indirect_record { kmutex_t dr_mtx; /* Protects the children. */ list_t dr_children; /* List of our dirty children. 
 */
-} dbuf_dirty_indirect_t;
+} dbuf_dirty_indirect_record_t;

-typedef struct dbuf_dirty_leaf {
+typedef struct dbuf_dirty_leaf_record {
 /*
 * dr_data is set when we dirty the buffer so that we can retain the
 * pointer even if it gets COW'd in a subsequent transaction group.
@@ -284,12 +284,12 @@
 * only cover part of it, and no read has filled in the gaps yet.
 */
 list_t write_ranges;
-} dbuf_dirty_leaf_t;
+} dbuf_dirty_leaf_record_t;

-typedef union dbuf_dirty_types {
- struct dbuf_dirty_indirect di;
- struct dbuf_dirty_leaf dl;
-} dbuf_dirty_types_t;
+typedef union dbuf_dirty_record_types {
+ struct dbuf_dirty_indirect_record di;
+ struct dbuf_dirty_leaf_record dl;
+} dbuf_dirty_record_types_t;

 typedef struct dbuf_dirty_record {
 /** link on our parents dirty list */
@@ -310,7 +310,7 @@
 /** pointer to parent dirty record */
 struct dbuf_dirty_record *dr_parent;

- union dbuf_dirty_types dt;
+ union dbuf_dirty_record_types dt;
 } dbuf_dirty_record_t;

 typedef struct dbuf_dirty_range {
@@ -487,8 +487,7 @@
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
-dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset,
- int size, int how, arc_buf_t *db_buf);
+dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);

 void dbuf_clear(dmu_buf_impl_t *db, list_t *evict_list);

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h#34 (text) ====

@@ -180,11 +180,11 @@
 /**
 * \brief Artificial blkid for bonus blocks
 */
-#define DMU_BONUS_BLKID (-1ULL)
+#define DMU_BONUS_BLKID (ULLONG_MAX)
 /**
 * \brief Artificial blkid for spill blocks
 */
-#define DMU_SPILL_BLKID (-2ULL)
+#define DMU_SPILL_BLKID (ULLONG_MAX - 1)

 /*
 * Public routines to create, destroy, open, and close objsets.
 */
@@ -373,10 +373,21 @@

 typedef void dmu_buf_evict_func_t(struct dmu_buf_user *);

-/*
- * Consumers are expected to allocate and free space for this structure.
- * Consequently, if any additional context is needed, another struct that
- * includes this one at the start should be passed in.
+/**
+ * The DMU buffer user eviction data container. This is used to allow users
+ * of a dbuf to register with the dbuf layer that they wish to be notified
+ * when the backing store for a dbuf they're using has been evicted. The
+ * backing store is an ARC buffer that corresponds to the transaction group
+ * that the user is currently operating in.
+ *
+ * Whenever a dirty record's ARC buffer is removed, the context in which the
+ * removal occurs must queue a user eviction. This queue must then be
+ * processed while not holding any dbuf locks. In this way, the user can
+ * perform any work needed in their eviction function.
+ *
+ * Implementation Note: Users are expected to allocate and free space for
+ * this structure. Consequently, if any additional context is needed, another
+ * struct that includes this one at the start should be passed in.
 */
 typedef struct dmu_buf_user {
 /**

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h#8 (text) ====

@@ -310,7 +310,7 @@
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
-void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx);
+void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag);
 void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
@@ -336,7 +336,7 @@
 void dnode_evict_dbufs(dnode_t *dn);

 #define DNODE_VERIFY_DIRTYCTX(dn, tx) \
- ASSERT((dn)->dn_object == 0 || \
+ ASSERT((dn)->dn_object == DMU_META_DNODE_OBJECT || \
 (dn)->dn_dirtyctx == DN_UNDIRTIED || \
 (dn)->dn_dirtyctx == \
 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN))

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h#4 (text) ====

@@ -342,9 +342,11 @@
 extern void vdev_set_min_asize(vdev_t *vd);

 /*
- * zdb uses this tunable, so it must be declared here to make lint happy.
+ * Global variables
 */
+/* zdb uses this tunable, so it must be declared here to make lint happy. */
 extern int zfs_vdev_cache_size;
+extern uint_t zfs_geom_probe_vdev;

 #ifdef __cplusplus
 }

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h#3 (text) ====

@@ -430,7 +430,16 @@
 enum zio_flag io_orig_flags;
 enum zio_stage io_orig_stage;
 enum zio_stage io_orig_pipeline;
+
 int io_error;
+#ifdef ZFS_DEBUG
+ struct {
+ int err;
+ int lineno;
+ const char *filename;
+ } io_last_errno;
+#endif
+
 int io_child_error[ZIO_CHILD_TYPES];
 uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
 uint64_t io_child_count;
@@ -463,6 +472,26 @@
 #endif
 };

+/*
+ * NB: Mostly useful for tracing ZIO errors. Perhaps it could be extended
+ * to include a full history for each zio. Skip ECKSUM, because it can
+ * happen a lot.
+ */
+#ifdef ZFS_DEBUG
+#define ZIO_SET_ERROR(zio, error) do { \
+ (zio)->io_error = (error); \
+ if (error != 0 && error != ECKSUM) { \
+ dprintf("zio %p error %d at %s:%d\n", \
+ zio, (error), __FILE__, __LINE__); \
+ (zio)->io_last_errno.err = error; \
+ (zio)->io_last_errno.filename = __FILE__; \
+ (zio)->io_last_errno.lineno = __LINE__; \
+ } \
+} while (0)
+#else
+#define ZIO_SET_ERROR(zio, error) (zio)->io_error = error
+#endif /* ZFS_DEBUG */
+
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
 zio_done_func_t *done, void *priv, enum zio_flag flags);

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c#8 (text) ====

@@ -66,6 +66,13 @@
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
 &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");

+/**
+ * Thread local storage used to indicate when a thread is probing geoms
+ * for their GUIDs. If NULL, this thread is not tasting geoms. If non-NULL,
+ * it is looking for a replacement for the vdev_t* that is its value.
+ */
+uint_t zfs_geom_probe_vdev;
+
 static void
 vdev_geom_orphan(struct g_consumer *cp)
 {
@@ -93,7 +100,6 @@
 * async removal support to invoke a close on this
 * vdev once it is safe to do so.
 */
- zfs_post_remove(vd->vdev_spa, vd);
 vd->vdev_remove_wanted = B_TRUE;
 spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
 }

@@ -374,9 +380,8 @@
 static void
 vdev_geom_taste_orphan(struct g_consumer *cp)
 {
-
- KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
- cp->provider->name));
+ ZFS_LOG(0, "WARNING: Orphan %s while tasting its VDev GUID.",
+ cp->provider->name);
 }

 static struct g_consumer *
@@ -392,7 +397,6 @@
 g_topology_assert();

 zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
- /* This orphan function should be never called. */
 zgp->orphan = vdev_geom_taste_orphan;
 zcp = g_new_consumer(zgp);

@@ -535,6 +539,9 @@
 g_topology_lock();
 error = 0;

+ /* Set the TLS to indicate downstack that we should not access zvols. */
+ VERIFY(tsd_set(zfs_geom_probe_vdev, vd) == 0);
+
 /*
 * Try using the recorded path for this device, but only
 * accept it if its label data contains the expected GUIDs.
@@ -569,6 +576,9 @@
 cp = vdev_geom_open_by_path(vd, 0);
 }

+ /* Clear the TLS now that tasting is done. */
+ VERIFY(tsd_set(zfs_geom_probe_vdev, NULL) == 0);
+
 if (cp == NULL) {
 ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
 error = ENOENT;

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c#11 (text) ====

@@ -5306,6 +5306,7 @@
 uint_t zfs_fsyncer_key;
 extern uint_t rrw_tsd_key;
 extern uint_t zfs_async_io_key;
+extern uint_t zfs_geom_probe_vdev;

 #ifdef sun
 int
@@ -5390,6 +5391,7 @@
 tsd_create(&zfs_fsyncer_key, NULL);
 tsd_create(&rrw_tsd_key, NULL);
 tsd_create(&zfs_async_io_key, dmu_thread_context_destroy);
+ tsd_create(&zfs_geom_probe_vdev, NULL);

 printf("ZFS storage pool version " SPA_VERSION_STRING "\n");
 root_mount_rel(zfs_root_token);

==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c#36 (text) ====

@@ -967,6 +967,17 @@
 mutex_exit(&spa_namespace_lock);
 return (ENXIO);
 }
+ if (tsd_get(zfs_geom_probe_vdev) != NULL) {
+ /*
+ * If zfs_geom_probe_vdev is set, that means that zfs is
+ * attempting to probe geom providers while looking for a
+ * replacement for a missing VDEV. In this case, the
+ * spa_namespace_lock will not be held, but it is still illegal
+ * to use a zvol as a vdev. Deadlocks can result if another
+ * thread has spa_namespace_lock.
+ */
+ return (EOPNOTSUPP);
+ }

 if (zv->zv_total_opens == 0)
 err = zvol_first_open(zv);

Change 541430 by willa@willa_repo on 2012/05/10 22:29:56

 Fix a racy mis-refire of the pending write in dbuf_read_complete().

 At the start, use the same check the syncer uses, to determine whether
 to issue the write. dbuf_read_complete() may be entered more than once
 between the time the syncer sets db_data_pending and the time the write
 completes, but it will only actually resolve ranges once. Make it so
 that whichever reader resolves first wins this race. A simplified model
 of the new check is sketched below, after the file list.

 This bug was only caught by running xdd with 10 threads. ztest, the STF
 suite, and xdd with 1 thread didn't trip it. In a non-debug build, it
 manifests as the syncer being hung waiting for a zio. In a debug build,
 dbuf_read_complete()->zio_nowait() tripped the zio->io_executor == NULL
 assert in zio_nowait().

Affected files ...

... //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#72 edit

Differences ...
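For readers tracing the fix, here is a minimal model of the dispatch check described above. It is a sketch under stated assumptions, not the shipped code: model_dr_t and model_should_dispatch() are hypothetical names invented for illustration, while the real test runs under db_mtx against a dbuf_dirty_record_t's write_ranges list and dr_zio field, as the diff below shows.

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-in for the leaf dirty record state. */
typedef struct model_dr {
	bool	write_ranges_empty;	/* Ranges already resolved by a reader? */
	void	*dr_zio;		/* Deferred syncer write, if any. */
} model_dr_t;

/*
 * Latch the syncer's own criteria once, at entry, while the (modeled)
 * dbuf lock is held.  Only the reader that still observes unresolved
 * write ranges may dispatch the deferred write; any later reader sees
 * an empty range list and backs off, so the zio is never handed to
 * zio_nowait() twice.
 */
static bool
model_should_dispatch(const model_dr_t *pending)
{
	return (pending != NULL &&
	    !pending->write_ranges_empty &&
	    pending->dr_zio != NULL);
}

The design point is that the decision is computed before any state transition can drop and re-take the dbuf lock; whichever reader latches the check first is the only one that issues the write.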
==== //depot/branches/redline/projects/cow/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c#72 (text) ==== @@ -927,6 +927,10 @@ { if (db->db_level == 0 && db->db_dirtycnt > 0) { + dbuf_dirty_record_t *pending = db->db_data_pending; + boolean_t resolving_write_pending = pending != NULL && + !list_is_empty(&pending->dt.dl.write_ranges) && + pending->dr_zio != NULL; /* * Buffers in the FILL state are valid here if the read was @@ -970,9 +974,8 @@ arc_discard_buf(buf, db); /* Dispatch any deferred syncer writes. */ - if (db->db_data_pending != NULL && - db->db_data_pending->dr_zio != NULL) - zio_nowait(db->db_data_pending->dr_zio); + if (resolving_write_pending) + zio_nowait(pending->dr_zio); } else if (db->db_state == DB_READ) { /* * Read with no dirty data. Use the buffer we