--- old/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ new/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -104,7 +104,6 @@ #include #include #include -#include #include #include #include @@ -223,6 +222,7 @@ typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ + const char *zi_name; /* string name of test function */ uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every seconds */ uint64_t zi_call_count; /* per-pass count */ @@ -268,37 +268,39 @@ uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ +#define ZI(name, iters, interval) { name, #name, iters, interval } + ztest_info_t ztest_info[] = { - { ztest_dmu_read_write, 1, &zopt_always }, - { ztest_dmu_write_parallel, 10, &zopt_always }, - { ztest_dmu_object_alloc_free, 1, &zopt_always }, - { ztest_dmu_commit_callbacks, 1, &zopt_always }, - { ztest_zap, 30, &zopt_always }, - { ztest_zap_parallel, 100, &zopt_always }, - { ztest_split_pool, 1, &zopt_always }, - { ztest_zil_commit, 1, &zopt_incessant }, - { ztest_zil_remount, 1, &zopt_sometimes }, - { ztest_dmu_read_write_zcopy, 1, &zopt_often }, - { ztest_dmu_objset_create_destroy, 1, &zopt_often }, - { ztest_dsl_prop_get_set, 1, &zopt_often }, - { ztest_spa_prop_get_set, 1, &zopt_sometimes }, + ZI(ztest_dmu_read_write, 1, &zopt_always ), + ZI(ztest_dmu_write_parallel, 10, &zopt_always ), + ZI(ztest_dmu_object_alloc_free, 1, &zopt_always ), + ZI(ztest_dmu_commit_callbacks, 1, &zopt_always ), + ZI(ztest_zap, 30, &zopt_always ), + ZI(ztest_zap_parallel, 100, &zopt_always ), + ZI(ztest_split_pool, 1, &zopt_always ), + ZI(ztest_zil_commit, 1, &zopt_incessant ), + ZI(ztest_zil_remount, 1, &zopt_sometimes ), + ZI(ztest_dmu_read_write_zcopy, 1, &zopt_often ), + ZI(ztest_dmu_objset_create_destroy, 1, &zopt_often ), + ZI(ztest_dsl_prop_get_set, 1, &zopt_often ), + ZI(ztest_spa_prop_get_set, 1, &zopt_sometimes ), #if 0 - { ztest_dmu_prealloc, 1, &zopt_sometimes }, + ZI(ztest_dmu_prealloc, 1, &zopt_often ), #endif - { ztest_fzap, 1, &zopt_sometimes }, - { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, - { ztest_spa_create_destroy, 1, &zopt_sometimes }, - { ztest_fault_inject, 1, &zopt_sometimes }, - { ztest_ddt_repair, 1, &zopt_sometimes }, - { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, - { ztest_reguid, 1, &zopt_sometimes }, - { ztest_spa_rename, 1, &zopt_rarely }, - { ztest_scrub, 1, &zopt_rarely }, - { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, - { ztest_vdev_attach_detach, 1, &zopt_rarely }, - { ztest_vdev_LUN_growth, 1, &zopt_rarely }, - { ztest_vdev_add_remove, 1, &zopt_vdevtime }, - { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime }, + ZI(ztest_fzap, 1, &zopt_sometimes ), + ZI(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes ), + ZI(ztest_spa_create_destroy, 1, &zopt_sometimes ), + ZI(ztest_fault_inject, 1, &zopt_sometimes ), + ZI(ztest_ddt_repair, 1, &zopt_sometimes ), + ZI(ztest_dmu_snapshot_hold, 1, &zopt_sometimes ), + ZI(ztest_reguid, 1, &zopt_often ), + ZI(ztest_spa_rename, 1, &zopt_rarely ), + ZI(ztest_scrub, 1, &zopt_rarely ), + ZI(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely ), + ZI(ztest_vdev_attach_detach, 1, &zopt_rarely ), + ZI(ztest_vdev_LUN_growth, 1, &zopt_rarely ), + ZI(ztest_vdev_add_remove, 1, &zopt_vdevtime ), + ZI(ztest_vdev_aux_add_remove, 1, &zopt_vdevtime ), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -404,6 +406,7 @@ } (void) fprintf(stderr, "%s\n", buf); fatal_msg = buf; /* to ease 
debugging */ + fflush(NULL); if (ztest_dump_core) abort(); exit(3); @@ -622,9 +625,12 @@ static void ztest_kill(ztest_shared_t *zs) { + pid_t curpid = getpid(); + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa)); - (void) kill(getpid(), SIGKILL); + printf("*** Crashing the current test process (pid %d)\n", curpid); + (void) kill(curpid, SIGKILL); } static uint64_t @@ -1427,7 +1433,6 @@ * but not always, because we also want to verify correct * behavior when the data was not recently read into cache. */ - ASSERT(offset % doi.doi_data_block_size == 0); if (ztest_random(4) != 0) { int prefetch = ztest_random(2) ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; @@ -1505,6 +1510,9 @@ return (ENOSPC); } + if (zopt_verbose >= 7) + printf("%s: freeing obj %d offset 0x%lx length 0x%lx tx %p\n", + __func__, lr->lr_foid, lr->lr_offset, lr->lr_length, tx); VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, lr->lr_length, tx) == 0); @@ -1942,7 +1950,7 @@ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg != 0) { - dmu_prealloc(os, object, offset, size, tx); + (void) dmu_prealloc(os, object, offset, size, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); } else { @@ -3642,14 +3650,14 @@ * We've verified all the old bufwads, and made new ones. * Now write them out. */ - dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (zopt_verbose >= 7) { - (void) printf("writing offset %llx size %llx" - " txg %llx\n", + (void) printf("writing obj %d offset %llx size %llx" + " txg %llx\n", packobj, (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } + dmu_write(os, packobj, packoff, packsize, packbuf, tx); for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; if (i != 5) { @@ -3669,6 +3677,13 @@ VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } + if (zopt_verbose >= 7) { + (void) printf("assigning obj %d offset %llx " + "size %llx txg %llx\n", bigobj, + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } if (i != 5) { dmu_assign_arcbuf(bonus_db, off, bigbuf_arcbufs[j], tx); @@ -4506,11 +4521,17 @@ vdev_file_t *vf = vd0->vdev_tsd; if (vf != NULL && ztest_random(3) == 0) { + printf("Closing fd %d for path '%s'\n", + vf->vf_vnode->v_fd, vd0->vdev_path); (void) close(vf->vf_vnode->v_fd); vf->vf_vnode->v_fd = -1; } else if (ztest_random(2) == 0) { + printf("Marking vdev '%s' not readable\n", + vd0->vdev_path); vd0->vdev_cant_read = B_TRUE; } else { + printf("Marking vdev '%s' not writable\n", + vd0->vdev_path); vd0->vdev_cant_write = B_TRUE; } guid0 = vd0->vdev_guid; @@ -4557,11 +4578,13 @@ if (islog) (void) rw_wrlock(&ztest_shared->zs_name_lock); + printf("Offlining vdev '%s'\n", vd0->vdev_path); VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); if (islog) (void) rw_unlock(&ztest_shared->zs_name_lock); } else { + printf("Onlining vdev '%s'\n", vd0->vdev_path); (void) vdev_online(spa, guid0, 0, NULL); } } @@ -4737,7 +4760,7 @@ return; if (zopt_verbose >= 3) { - (void) printf("Changed guid old %llu -> %llu\n", + (void) printf("Changed spa %p guid old %llu -> %llu\n", spa, (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); } @@ -4808,30 +4831,14 @@ int status; char zdb[MAXPATHLEN + MAXNAMELEN + 20]; char zbuf[1024]; - char *bin; - char *ztest; - char *isa; - int isalen; FILE *fp; - strlcpy(zdb, "/usr/bin/ztest", sizeof(zdb)); - - /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */ - bin = strstr(zdb, "/usr/bin/"); - ztest 
= strstr(bin, "/ztest"); - isa = bin + 8; - isalen = ztest - isa; - isa = strdup(isa); - /* LINTED */ - (void) sprintf(bin, - "/usr/sbin%.*s/zdb -bcc%s%s -U %s %s", - isalen, - isa, + (void) sprintf(zdb, + "zdb -bcc%s%s -U %s %s", zopt_verbose >= 3 ? "s" : "", zopt_verbose >= 4 ? "v" : "", spa_config_path, pool); - free(isa); if (zopt_verbose >= 5) (void) printf("Executing %s\n", strstr(zdb, "zdb ")); @@ -4977,7 +4984,7 @@ ztest_deadman_thread(void *arg) { ztest_shared_t *zs = arg; - int grace = 300; + int grace = 600; hrtime_t delta; delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace; @@ -5005,10 +5012,8 @@ atomic_add_64(&zi->zi_call_time, functime); if (zopt_verbose >= 4) { - Dl_info dli; - (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%6.2f sec in %s\n", - (double)functime / NANOSEC, dli.dli_sname); + (double)functime / NANOSEC, zi->zi_name); } } @@ -5639,14 +5644,12 @@ (void) printf("%7s %9s %s\n", "-----", "----", "--------"); for (int f = 0; f < ZTEST_FUNCS; f++) { - Dl_info dli; zi = &zs->zs_info[f]; print_time(zi->zi_call_time, timebuf); - (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%7llu %9s %s\n", (u_longlong_t)zi->zi_call_count, timebuf, - dli.dli_sname); + zi->zi_name); } (void) printf("\n"); } --- old/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h +++ new/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h @@ -212,6 +212,9 @@ */ #define curthread ((void *)(uintptr_t)thr_self()) +#define tsd_get(key) NULL +#define tsd_set(key, value) 0 + typedef struct kthread kthread_t; #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ @@ -362,6 +365,7 @@ typedef struct taskq taskq_t; typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); +typedef void (*taskq_callback_fn)(void *); #define TASKQ_PREPOPULATE 0x0001 #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ @@ -377,8 +381,11 @@ extern taskq_t *system_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); -#define taskq_create_proc(a, b, c, d, e, p, f) \ - (taskq_create(a, b, c, d, e, f)) +extern taskq_t *taskq_create_with_callbacks(const char *, int, pri_t, int, int, + uint_t, taskq_callback_fn, taskq_callback_fn); + +#define taskq_create_proc(a, b, c, d, e, p, f, g, h) \ + (taskq_create_with_callbacks(a, b, c, d, e, f, g, h)) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ (taskq_create(a, b, maxclsyspri, d, e, f)) extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); --- old/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c +++ new/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c @@ -53,6 +53,8 @@ int tq_maxalloc_wait; task_t *tq_freelist; task_t tq_task; + taskq_callback_fn tq_ctor; + taskq_callback_fn tq_dtor; }; static task_t * @@ -161,6 +163,9 @@ task_t *t; mutex_enter(&tq->tq_lock); + if (tq->tq_ctor != NULL) + tq->tq_ctor(tq); + while (tq->tq_flags & TASKQ_ACTIVE) { if ((t = tq->tq_task.task_next) == &tq->tq_task) { if (--tq->tq_active == 0) @@ -182,14 +187,16 @@ } tq->tq_nthreads--; cv_broadcast(&tq->tq_wait_cv); + if (tq->tq_dtor != NULL) + tq->tq_dtor(tq); mutex_exit(&tq->tq_lock); return (NULL); } -/*ARGSUSED*/ taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, - int minalloc, int maxalloc, uint_t flags) +taskq_create_with_callbacks(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, taskq_callback_fn ctor, + taskq_callback_fn dtor) { taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); int t; @@ -220,6 +227,8 @@ 
tq->tq_task.task_next = &tq->tq_task; tq->tq_task.task_prev = &tq->tq_task; tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP); + tq->tq_ctor = ctor; + tq->tq_dtor = dtor; if (flags & TASKQ_PREPOPULATE) { mutex_enter(&tq->tq_lock); @@ -235,6 +244,15 @@ return (tq); } +/*ARGSUSED*/ +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + return (taskq_create_with_callbacks(name, nthreads, pri, minalloc, + maxalloc, flags, NULL, NULL)); +} + void taskq_destroy(taskq_t *tq) { --- old/cddl/sbin/zfsd/callout.cc +++ new/cddl/sbin/zfsd/callout.cc @@ -167,3 +167,37 @@ setitimer(ITIMER_REAL, &timerval, NULL); } } + +timeval +Callout::TimeRemaining() const +{ + /* + * Outline: Add the m_interval for each callout in s_activeCallouts + * ahead of this, except for the first callout. Add to that the result + * of getitimer (That's because the first callout stores its original + * interval setting while the timer is ticking). + */ + itimerval timervalToAlarm; + timeval timeToExpiry; + std::list::iterator it; + + if (! IsPending() ) { + timeToExpiry.tv_sec = INT_MAX; + timeToExpiry.tv_usec = 999999; /*maximum normalized value*/ + return (timeToExpiry); + } + + timerclear(&timeToExpiry); + getitimer(ITIMER_REAL, &timervalToAlarm); + timeval& timeToAlarm = timervalToAlarm.it_value; + timeradd(&timeToExpiry, &timeToAlarm, &timeToExpiry); + + it =s_activeCallouts.begin(); + it++; /*skip the first callout in the list*/ + for (; it != s_activeCallouts.end(); it++) { + timeradd(&timeToExpiry, &(*it)->m_interval, &timeToExpiry); + if ((*it) == this) + break; + } + return (timeToExpiry); +} --- old/cddl/sbin/zfsd/callout.h +++ new/cddl/sbin/zfsd/callout.h @@ -117,6 +117,17 @@ */ bool Reset(const timeval &interval, CalloutFunc_t *func, void *arg); + /** + * \brief Calculate the remaining time until this Callout's timer + * expires. + * + * The return value will be slightly greater than the actual time to + * expiry. + * + * If the callout is not pending, returns INT_MAX. + */ + timeval TimeRemaining() const; + private: /** * All active callouts sorted by expiration time. The callout --- old/cddl/sbin/zfsd/case_file.cc +++ new/cddl/sbin/zfsd/case_file.cc @@ -40,6 +40,7 @@ */ #include #include +#include #include #include #include @@ -53,6 +54,7 @@ /*============================ Namespace Control =============================*/ using std::auto_ptr; using std::hex; +using std::ifstream; using std::stringstream; using std::setfill; using std::setw; @@ -116,8 +118,12 @@ int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles, DeSerializeSelector, /*compar*/NULL)); - if (numCaseFiles == 0 || numCaseFiles == -1) + if (numCaseFiles == -1) + return; + if (numCaseFiles == 0) { + free(caseFiles); return; + } for (int i = 0; i < numCaseFiles; i++) { @@ -138,9 +144,17 @@ void CaseFile::PurgeAll() { - /* CaseFiles remove themselves from this list on destruction. */ - while (s_activeCases.size() != 0) - delete s_activeCases.front(); + /* + * Serialize casefiles before deleting them so that they can be reread + * and revalidated during BuildCaseFiles. + * CaseFiles remove themselves from this list on destruction. 
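+	 * Tentative events are written with a "tentative " prefix so that
+	 * DeSerialize() can restore them to m_tentativeEvents rather than
+	 * m_events on the next startup.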
+ */ + while (s_activeCases.size() != 0) { + CaseFile *casefile = s_activeCases.front(); + casefile->Serialize(); + delete casefile; + } + } //- CaseFile Public Methods ---------------------------------------------------- @@ -382,9 +396,7 @@ || event.Value("class") == "ereport.fs.zfs.checksum") { m_tentativeEvents.push_front(event.DeepCopy()); - if (!m_tentativeTimer.IsPending()) - m_tentativeTimer.Reset(s_removeGracePeriod, - OnGracePeriodEnded, this); + RegisterCallout(event); consumed = true; } @@ -393,6 +405,33 @@ return (consumed || closed); } + +void +CaseFile::RegisterCallout(const DevCtlEvent &event) +{ + timeval now, countdown, elapsed, timestamp, zero, remaining; + gettimeofday(&now, 0); + timestamp = event.GetTimestamp(); + timersub(&now, ×tamp, &elapsed); + timersub(&s_removeGracePeriod, &elapsed, &countdown); + /* + * If countdown is <= zero, Reset the timer to the + * smallest positive time value instead + */ + timerclear(&zero); + if (timercmp(&countdown, &zero, <=)) { + timerclear(&countdown); + countdown.tv_usec = 1; + } + + remaining = m_tentativeTimer.TimeRemaining(); + + if (!m_tentativeTimer.IsPending() + || timercmp(&countdown, &remaining, <)) + m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this); +} + + bool CaseFile::CloseIfSolved() { @@ -472,7 +511,6 @@ string evString; CaseFile *existingCaseFile(NULL); CaseFile *caseFile(NULL); - int fd(-1); try { uintmax_t poolGUID; @@ -505,8 +543,8 @@ .Find(vdevGUID)) == NULL) { /* * Either the pool no longer exists - * of this vdev is no longer a member of - * the pool. + * or this vdev is no longer a member of + * the pool. */ unlink(fullName.c_str()); return; @@ -519,27 +557,59 @@ */ caseFile = new CaseFile(Vdev(zpl.front(), vdevConf)); } - - fd = open(fullName.c_str(), O_RDONLY); - if (fd == -1) { + + ifstream caseStream(fullName.c_str()); + if (! caseStream) { throw ZfsdException("CaseFile::DeSerialize: Unable to " "read %s.\n", fileName); return; } + stringstream fakeDevdSocket(stringstream::in + | stringstream::out); + IstreamReader caseReader(&fakeDevdSocket); /* Re-load EventData */ - EventBuffer eventBuffer(fd); - while (eventBuffer.ExtractEvent(evString)) { - DevCtlEvent *event(DevCtlEvent::CreateEvent(evString)); - caseFile->m_events.push_back(event); + EventBuffer eventBuffer(caseReader); + caseStream >> std::noskipws >> std::ws; + while (!caseStream.eof()) { + /* + * Outline: + * read the beginning of a line and check it for + * "tentative". If found, discard "tentative". + * Shove into fakeDevdSocket. 
+ * call ExtractEvent + * continue + */ + DevCtlEventList* destEvents; + string tentFlag("tentative "); + string line; + std::stringbuf lineBuf; + caseStream.get(lineBuf); + caseStream.ignore(); /*discard the newline character*/ + line = lineBuf.str(); + if (line.compare(0, tentFlag.size(), tentFlag) == 0) { + line.erase(0, tentFlag.size()); + destEvents = &caseFile->m_tentativeEvents; + } else { + destEvents = &caseFile->m_events; + } + fakeDevdSocket << line; + fakeDevdSocket << '\n'; + while (eventBuffer.ExtractEvent(evString)) { + DevCtlEvent *event(DevCtlEvent::CreateEvent( + evString)); + if (event != NULL) { + destEvents->push_back(event); + caseFile->RegisterCallout(*event); + } + } } - close(fd); + } catch (const ParseException &exp) { exp.Log(evString); if (caseFile != existingCaseFile) delete caseFile; - close(fd); /* * Since we can't parse the file, unlink it so we don't @@ -603,6 +673,24 @@ m_tentativeEvents.clear(); } + +void +CaseFile::SerializeEvList(const DevCtlEventList events, int fd, + const char* prefix) const +{ + if (events.empty()) + return; + for (DevCtlEventList::const_iterator curEvent = events.begin(); + curEvent != events.end(); curEvent++) { + const string &eventString((*curEvent)->GetEventString()); + + if (prefix) + write(fd, prefix, strlen(prefix)); + write(fd, eventString.c_str(), eventString.length()); + } +} + + void CaseFile::Serialize() { @@ -614,7 +702,7 @@ << "_vdev_" << VdevGUIDString() << ".case"; - if (m_events.empty()) { + if (m_events.empty() && m_tentativeEvents.empty()) { unlink(saveFile.str().c_str()); return; } @@ -625,12 +713,9 @@ saveFile.str().c_str()); return; } - for (DevCtlEventList::const_iterator curEvent = m_events.begin(); - curEvent != m_events.end(); curEvent++) { - const string &eventString((*curEvent)->GetEventString()); - - write(fd, eventString.c_str(), eventString.length()); - } + SerializeEvList(m_events, fd); + SerializeEvList(m_tentativeEvents, fd, "tentative "); + close(fd); } void --- old/cddl/sbin/zfsd/case_file.h +++ new/cddl/sbin/zfsd/case_file.h @@ -171,7 +171,12 @@ bool ReEvaluate(const ZfsEvent &event); /** - * \breif Close a case if it is no longer relevant. + * \brief Register an itimer callout for the given event, if necessary + */ + void RegisterCallout(const DevCtlEvent &event); + + /** + * \brief Close a case if it is no longer relevant. * * This method deals with cases tracking soft errors. Soft errors * will be discarded should a remove event occur within a short period @@ -210,12 +215,12 @@ static int DeSerializeSelector(const struct dirent *dirEntry); /** - * \brief Given the name of a file containing a serialized CaseFile - * object, create/update an in-core CaseFile object + * \brief Given the name of a file containing serialized events from a + * CaseFile object, create/update an in-core CaseFile object * representing the serialized data. * - * \param fileName The name of a file containing a serialized - * CaseFile object. + * \param fileName The name of a file containing serialized events + * from a CaseFile object. */ static void DeSerializeFile(const char *fileName); @@ -248,6 +253,15 @@ */ void Serialize(); + /** + * \brief Serializes the supplied event list and writes it to fd + * + * \param prefix If not NULL, this prefix will be prepended to + * every event in the file. + */ + void SerializeEvList(const DevCtlEventList events, int fd, + const char* prefix=NULL) const; + /** * \brief Unconditionally close a CaseFile. 
*/ --- old/cddl/sbin/zfsd/dev_ctl_event.cc +++ new/cddl/sbin/zfsd/dev_ctl_event.cc @@ -250,6 +250,23 @@ { } +timeval +DevCtlEvent::GetTimestamp() const +{ + timeval tv_timestamp; + struct tm tm_timestamp; + + if ( ! Contains("timestamp") ) { + throw ZfsdException("Event contains no timestamp: %s", + m_eventString.c_str()); + } + strptime(Value(string("timestamp")).c_str(), "%s", &tm_timestamp); + tv_timestamp.tv_sec = mktime(&tm_timestamp); + tv_timestamp.tv_usec = 0; + return (tv_timestamp); +} + + //- DevCtlEvent Protected Methods ---------------------------------------------- DevCtlEvent::DevCtlEvent(Type type, NVPairMap &map, const string &eventString) : m_type(type), --- old/cddl/sbin/zfsd/dev_ctl_event.h +++ new/cddl/sbin/zfsd/dev_ctl_event.h @@ -158,7 +158,7 @@ * * All name => value data for events can be accessed via the Contains() * and Value() methods. name => value pairs for data not explicitly - * recieved as a a name => value pair are synthesized during parsing. For + * recieved as a name => value pair are synthesized during parsing. For * example, ATTACH and DETACH events have "device-name" and "parent" * name => value pairs added. */ @@ -276,6 +276,11 @@ */ virtual void Process() const; + /** + * Get the time that the event was created + */ + timeval GetTimestamp() const; + protected: /** Table entries used to map a type to a user friendly string. */ struct EventTypeRecord @@ -353,7 +358,8 @@ /** * Ingest event data from the supplied string. * - * \param eventString The string of devd event data to parse. + * \param[in] eventString The string of devd event data to parse. + * \param[out] nvpairs Returns the parsed data */ static void ParseEventString(Type type, const string &eventString, NVPairMap &nvpairs); --- old/cddl/sbin/zfsd/zfsd.cc +++ new/cddl/sbin/zfsd/zfsd.cc @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -86,6 +87,32 @@ int g_debug = 0; libzfs_handle_t *g_zfsHandle; +/*-------------------------------- FDReader -------------------------------*/ +//- FDReader Public Methods ---------------------------------------------------- +size_t +FDReader::in_avail() const +{ + int bytes; + if (ioctl(m_fd, FIONREAD, &bytes)) { + syslog(LOG_ERR, "ioctl FIONREAD: %s", strerror(errno)); + return (0); + } + return (bytes); +} + + +/*-------------------------------- IstreamReader ---------------------------*/ +//- IstreamReader Public Methods ---------------------------------------------- +ssize_t +IstreamReader::read(char* buf, size_t count) +{ + m_stream->read(buf, count); + if (m_stream->fail()) + return (-1); + return (m_stream->gcount()); +} + + /*-------------------------------- EventBuffer -------------------------------*/ //- EventBuffer Static Data ---------------------------------------------------- /** @@ -104,17 +131,23 @@ const char EventBuffer::s_keyPairSepTokens[] = " \t\n"; //- EventBuffer Public Methods ------------------------------------------------- -EventBuffer::EventBuffer(int fd) - : m_fd(fd), +EventBuffer::EventBuffer(Reader& reader) + : m_reader(reader), m_validLen(0), m_parsedLen(0), - m_nextEventOffset(0) + m_nextEventOffset(0), + m_synchronized(true) { } bool EventBuffer::ExtractEvent(string &eventString) { + stringstream tsField; + timeval now; + + gettimeofday(&now, NULL); + tsField << " timestamp=" << now.tv_sec; while (UnParsed() > 0 || Fill()) { @@ -127,24 +160,18 @@ continue; } - char *nextEvent(m_buf + m_nextEventOffset); - size_t startLen(strcspn(nextEvent, s_eventStartTokens)); - bool aligned(startLen == 0); - if 
(aligned == false) { - syslog(LOG_WARNING, - "Re-synchronizing with devd event stream"); - m_nextEventOffset += startLen; + char *nextEvent(m_buf + m_nextEventOffset); + bool truncated(true); + size_t eventLen(strcspn(nextEvent, s_eventEndTokens)); + + if (!m_synchronized) { + /* Discard data until an end token is read. */ + if (nextEvent[eventLen] != '\0') + m_synchronized = true; + m_nextEventOffset += eventLen; m_parsedLen = m_nextEventOffset; continue; - } - - /* - * Start tokens may be end tokens too, so skip the start - * token when trying to find the end of the event. - */ - bool truncated(true); - size_t eventLen(strcspn(nextEvent + 1, s_eventEndTokens) + 1); - if (nextEvent[eventLen] == '\0') { + } else if (nextEvent[eventLen] == '\0') { m_parsedLen += eventLen; if (m_parsedLen < MAX_EVENT_SIZE) { @@ -156,9 +183,6 @@ } syslog(LOG_WARNING, "Event exceeds event size limit of %d bytes."); - } else if (nextEvent[eventLen] != '\n') { - syslog(LOG_WARNING, - "Improperly terminated event encountered."); } else { /* * Include the normal terminator in the extracted @@ -175,12 +199,28 @@ if (truncated) { size_t fieldEnd; + /* Break cleanly at the end of a key<=>value pair. */ fieldEnd = eventString.find_last_of(s_keyPairSepTokens); - eventString.erase(fieldEnd); + if (fieldEnd != string::npos) + eventString.erase(fieldEnd); + eventString += '\n'; + + m_synchronized = false; syslog(LOG_WARNING, "Truncated %d characters from event.", eventLen - fieldEnd); } + + /* + * Add a timestamp as the final field of the event if it is + * not already present. + */ + if ( eventString.find("timestamp=") == string::npos) { + eventString.insert( + eventString.find_last_not_of('\n') + 1, + tsField.str()); + } + return (true); } return (false); @@ -190,7 +230,8 @@ bool EventBuffer::Fill() { - ssize_t result; + size_t avail; + ssize_t consumed(0); /* Compact the buffer. */ if (m_nextEventOffset != 0) { @@ -202,19 +243,26 @@ } /* Fill any empty space. */ - result = read(m_fd, m_buf + m_validLen, MAX_READ_SIZE - m_validLen); - if (result == -1) { - if (errno == EINTR || errno == EAGAIN) { - return (false); - } else { - err(1, "Read from devd socket failed"); + avail = m_reader.in_avail(); + if (avail) { + size_t want; + + want = std::min(avail, MAX_READ_SIZE - m_validLen); + consumed = m_reader.read(m_buf + m_validLen, want); + if (consumed == -1) { + if (errno == EINTR) { + return (false); + } else { + err(1, "EventBuffer::Fill(): Read failed"); + } } } - m_validLen += result; + + m_validLen += consumed; /* Guarantee our buffer is always NUL terminated. */ m_buf[m_validLen] = '\0'; - return (result > 0); + return (consumed > 0); } /*--------------------------------- ZfsDaemon --------------------------------*/ @@ -223,6 +271,7 @@ bool ZfsDaemon::s_terminateEventLoop; char ZfsDaemon::s_pidFilePath[] = "/var/run/zfsd.pid"; pidfh *ZfsDaemon::s_pidFH; +FDReader* ZfsDaemon::s_reader; int ZfsDaemon::s_devdSockFD = -1; int ZfsDaemon::s_signalPipeFD[2]; bool ZfsDaemon::s_systemRescanRequested(false); @@ -309,6 +358,7 @@ void ZfsDaemon::Fini() { + PurgeCaseFiles(); ClosePIDFile(); } @@ -390,10 +440,9 @@ return (false); } - /* Don't block on reads. 
*/ - if (fcntl(s_devdSockFD, F_SETFL, O_NONBLOCK) == -1) - err(1, "Unable to enable nonblocking behavior on devd socket"); + /* Connect the stream to the file descriptor */ + s_reader = new FDReader(s_devdSockFD); syslog(LOG_INFO, "Connection to devd successful"); return (true); } @@ -401,7 +450,10 @@ void ZfsDaemon::DisconnectFromDevd() { + delete s_reader; + s_reader = NULL; close(s_devdSockFD); + s_devdSockFD = -1; } void @@ -451,8 +503,8 @@ { char discardBuf[256]; - while (read(s_devdSockFD, discardBuf, sizeof(discardBuf)) > 0) - ; + while (s_reader->in_avail()) + s_reader->read(discardBuf, sizeof(discardBuf)); } bool @@ -531,6 +583,7 @@ event = DevCtlEvent::CreateEvent(evString); if (event != NULL) event->Process(); + delete event; } } } @@ -564,7 +617,7 @@ void ZfsDaemon::EventLoop() { - EventBuffer eventBuffer(s_devdSockFD); + EventBuffer eventBuffer(*s_reader); while (s_terminateEventLoop == false) { struct pollfd fds[2]; @@ -616,13 +669,13 @@ RescanSystem(); } - if ((fds->revents & POLLERR) != 0) { + if ((fds[0].revents & POLLERR) != 0) { /* Try reconnecting. */ syslog(LOG_INFO, "Error on socket. Disconnecting."); break; } - if ((fds->revents & POLLHUP) != 0) { + if ((fds[0].revents & POLLHUP) != 0) { /* Try reconnecting. */ syslog(LOG_INFO, "Hup on socket. Disconnecting."); break; --- old/cddl/sbin/zfsd/zfsd.h +++ new/cddl/sbin/zfsd/zfsd.h @@ -42,6 +42,7 @@ #define _ZFSD_H_ #include +#include #include #include #include @@ -57,6 +58,7 @@ using std::auto_ptr; using std::map; using std::pair; +using std::istream; using std::string; /*================================ Global Data ===============================*/ @@ -74,21 +76,131 @@ #define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x)) /*============================= Class Definitions ============================*/ + +/*-------------------------------- Reader -------------------------------*/ +/** + * \brief A class that presents a common interface to both file descriptors and + * istreams . + * + * Standard C++ provides no way to create an iostream from a file descriptor or + * a FILE. The GNU, Apache, HPUX, and Solaris C++ libraries all provide + * non-standard ways to construct such a stream using similar semantics, but + * LLVM does not. Therefore this class is needed to ensure that zfsd can + * compile under LLVM. This class supports only the functionality needed by + * ZFSD; it does not implement the iostream API. + */ +class Reader +{ +public: + /** + * \brief Return the number of bytes immediately available for reading + */ + virtual size_t in_avail() const = 0; + + /** + * \brief Reads up to count bytes + * + * Whether this call blocks depends on the underlying input source. + * On error, -1 is returned, and errno will be set by the underlying + * source. + * + * \param buf Destination for the data + * \param count Maximum amount of data to read + * \returns Amount of data that was actually read + */ + virtual ssize_t read(char* buf, size_t count) = 0; +}; + + +/*-------------------------------- FDReader -------------------------------*/ +/** + * \brief Specialization of Reader that uses a file descriptor + */ +class FDReader : public Reader +{ +public: + /** + * \brief Constructor + * + * \param fd An open file descriptor. It will not be garbage + * collected by the destructor. 
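+	 *			The caller retains ownership of the
+	 *			descriptor and remains responsible for
+	 *			closing it when it is no longer needed.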
+ */ + FDReader(int fd); + + virtual size_t in_avail() const; + + virtual ssize_t read(char* buf, size_t count); + +protected: + /** Copy of the underlying file descriptor */ + int m_fd; +}; + +//- FDReader Inline Public Methods ----------------------------------------- +inline FDReader::FDReader(int fd) + : m_fd(fd) +{ +} + +inline ssize_t +FDReader::read(char* buf, size_t count) +{ + return (::read(m_fd, buf, count)); +} + + +/*-------------------------------- IstreamReader------------------------------*/ +/** + * \brief Specialization of Reader that uses a std::istream + */ +class IstreamReader : public Reader +{ +public: + /** + * Constructor + * + * \param stream Pointer to an open istream. It will not be + * garbage collected by the destructor. + */ + IstreamReader(istream* stream); + + virtual size_t in_avail() const; + + virtual ssize_t read(char* buf, size_t count); + +protected: + /** Copy of the underlying stream */ + istream* m_stream; +}; + +//- IstreamReader Inline Public Methods ---------------------------------------- +inline IstreamReader::IstreamReader(istream* stream) + : m_stream(stream) +{ +} + +inline size_t +IstreamReader::in_avail() const +{ + return (m_stream->rdbuf()->in_avail()); +} + + /*-------------------------------- EventBuffer -------------------------------*/ /** - * \brief Class buffering event data from Devd and splitting it - * into individual event strings. + * \brief Class buffering event data from Devd or a similar source and + * splitting it into individual event strings. * - * Users of this class initialize it with the file descriptor associated - * with the unix domain socket connection with devd. The lifetime of - * an EventBuffer instance should match that of the file descriptor passed - * to it. This is required as data from partially received events is - * retained in the EventBuffer in order to allow reconstruction of these - * events across multiple reads of the Devd file descriptor. + * Users of this class initialize it with a Reader associated with the unix + * domain socket connection with devd or a compatible source. The lifetime of + * an EventBuffer instance should match that of the Reader passed to it. This + * is required as data from partially received events is retained in the + * EventBuffer in order to allow reconstruction of these events across multiple + * reads of the stream. * - * Once the program determines that the Devd file descriptor is ready - * for reading, the EventBuffer::ExtractEvent() should be called in a - * loop until the method returns false. + * Once the program determines that the Reader is ready for reading, the + * EventBuffer::ExtractEvent() should be called in a loop until the method + * returns false. */ class EventBuffer { @@ -96,9 +208,9 @@ /** * Constructor * - * \param fd The file descriptor on which to buffer/parse event data. + * \param reader The data source on which to buffer/parse event data. */ - EventBuffer(int fd); + EventBuffer(Reader& reader); /** * Pull a single event string out of the event buffer. @@ -143,10 +255,10 @@ }; /** The amount of data in m_buf we have yet to look at. */ - size_t UnParsed(); + size_t UnParsed() const; /** The amount of data in m_buf available for the next event. */ - size_t NextEventMaxLen(); + size_t NextEventMaxLen() const; /** Fill the event buffer with event data from Devd. */ bool Fill(); @@ -163,8 +275,8 @@ /** Temporary space for event data during our parsing. */ char m_buf[EVENT_BUFSIZE]; - /** Copy of the file descriptor linked to devd's domain socket. 
*/ - int m_fd; + /** Reference to the reader linked to devd's domain socket. */ + Reader& m_reader; /** Valid bytes in m_buf. */ size_t m_validLen; @@ -174,17 +286,20 @@ /** Offset to the start token of the next event. */ size_t m_nextEventOffset; + + /** The EventBuffer is aligned and tracking event records. */ + bool m_synchronized; }; //- EventBuffer Inline Private Methods ----------------------------------------- inline size_t -EventBuffer::UnParsed() +EventBuffer::UnParsed() const { return (m_validLen - m_parsedLen); } inline size_t -EventBuffer::NextEventMaxLen() +EventBuffer::NextEventMaxLen() const { return (m_validLen - m_nextEventOffset); } @@ -359,6 +474,11 @@ static int s_devdSockFD; /** + * Reader object used by the EventBuffer + */ + static FDReader* s_reader; + + /** * Pipe file descriptors used to close races with our * signal handlers. */ --- old/cddl/tools/regression/stc/src/suites/fs/zfs/README +++ new/cddl/tools/regression/stc/src/suites/fs/zfs/README @@ -22,7 +22,7 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# +# # ident "@(#)README 1.8 09/05/19 SMI" # @@ -90,7 +90,7 @@ o This method uses the standard STF techniques to create a Solaris package, which will be installed under the base directory "/opt/SUNWstc-fs-zfs". - + Briefly, this build and installation is performed as follows: # set path to STF bin directory @@ -135,7 +135,7 @@ SUNWstc-stf in the global zone % sudo pkgadd -d /ws/onnv-stc2/packages/`uname -p` SUNWstc-stf - + o When testing with NFS, you should set the remote access permission for rsh/rcp on the remote server machine. You can add the permission to ~root/.rhosts file in the server, for example: @@ -151,10 +151,10 @@ 3.2.1 Configure the tests - o You could configure the test on physical disks, that means you'll need - at least one scratch disks. (Above two is recommended) Configure the two + o You could configure the test on physical disks, that means you'll need + at least one scratch disks. (Above two is recommended) Configure the two scratch disks, c0t13d0 and c0t14d0 for example: - + % cd /opt/SUNWstc-fs-zfs; stf_configure -c DISKS="c0t13d0 c0t14d0" o The test suites could also be configured on rawfiles, each of them should @@ -162,7 +162,7 @@ % mkfile 3g /var/tmp/file1 /var/tmp/file2 % cd /opt/SUNWstc-zfs - % stf_configure -c DISKS="/var/tmp/file1 /var/tmp/file2" + % stf_configure -c DISKS="/var/tmp/file1 /var/tmp/file2" o By default the test suite runs all test assertions. However, the test suite can be configured for test runs of varying length by @@ -171,7 +171,7 @@ will configure the test suite for the shortest possible runtime: % cd /opt/SUNWstc-fs-zfs; stf_configure -c DISKS="c0t13d0 c0t14d0" \ - -c "RUNTIME=short" + -c "RUNTIME=short" Note that hardware speed is also a significnat contributor to the runtime length of the test suite. @@ -182,7 +182,7 @@ % cd /opt/SUNWstc-fs-zfs; stf_configure -c DISKS="c0t13d0 c0t14d0" \ -c "KEEP=poolA poolB" - + o If you want to run the test suite with remote support, you should assign one or more machines as remote testing hosts. Meanwhile, you also need to specify disks for each remote host. Optionally, you can @@ -219,7 +219,7 @@ specify RHOSTS and RDISKS. Currently, only one value "remote" is supported for iscsi variable. 
- Here is an example + Here is an example % cd /opt/SUNWstc-fs-zfs % stf_configure -c DISKS="c0t13d0 c0t14d0" -c RHOSTS="host1" \ -c RDISKS="'detect'" -c iscsi="remote" @@ -235,7 +235,7 @@ % export DISKS="c0t13d0 c0t14d0" % export KEEP="poolA poolB" % export RUNTIME="long" - % export RHOSTS="foo1 foo2" + % export RHOSTS="foo1 foo2" % export RDISKS="'c0t1d0 c0t2d0' 'detect'" % export RTEST_ROOT="/export/tmp" % stf_configure @@ -280,7 +280,7 @@ o First, configure in the global zone to create a local zone and export the pool to the local zone. You'll need at least one scratch - disks. (Two above is recommended) You can assign a zone name, zone root + disks. (Two above is recommended) You can assign a zone name, zone root and IP address for the local zone. All parameters are optional. Syntax as, % stf_configure -c DISKS="" -c zone=new [-c zone_name=] [-c zone_root=] [-c zone_ip=] @@ -316,7 +316,7 @@ o To execute all of the modes on current system platform - % cd /opt/SUNWstc-fs-zfs; + % cd /opt/SUNWstc-fs-zfs; % /opt/SUNWstc-stf/bin/`uname -p`/stf_execute o To execute in a specific mode: @@ -332,8 +332,8 @@ 3.4 Unconfigure the suite. - o Use the STF unconfigure tool. + o Use the STF unconfigure tool. - % cd /opt/SUNWstc-fs-zfs; stf_unconfigure + % cd /opt/SUNWstc-fs-zfs; stf_unconfigure ================================================================================ --- old/cddl/tools/regression/stc/src/suites/fs/zfs/bin/file_trunc.c +++ new/cddl/tools/regression/stc/src/suites/fs/zfs/bin/file_trunc.c @@ -171,6 +171,7 @@ off_t roffset = 0; char *buf = NULL; char *rbuf = NULL; + int i; buf = (char *)calloc(1, bsize); rbuf = (char *)calloc(1, bsize); @@ -203,7 +204,16 @@ } if (memcmp(buf, rbuf, bsize) != 0) { - perror("memcmp"); + (void) fprintf(stderr, + "Read back of data written offset %x " + "isn't what we wrote:\n", (offset + roffset)); + for (i = 0;i < bsize; i++) { + if (buf[i] == rbuf[i]) + continue; + fprintf(stderr, "%04x: %02x | %02x\n", + i, buf[i], rbuf[i]); + } + (void) fprintf(stderr, "%s", rbuf); exit(9); } } --- old/cddl/tools/regression/stc/src/suites/fs/zfs/bin/scripts/Makefile +++ new/cddl/tools/regression/stc/src/suites/fs/zfs/bin/scripts/Makefile @@ -31,7 +31,7 @@ zfs_crypto \ zpool_version zfs_version \ zpool_smi zpool_bsd \ - groupadd groupmod groupdel \ + groupadd groupmod groupdel groupshow \ useradd usermod userdel \ dumpadm swap dircmp bsddisks df \ zonecfg zlogin zoneadm svcs fstyp \ --- old/cddl/tools/regression/stc/src/suites/fs/zfs/commands.txt +++ new/cddl/tools/regression/stc/src/suites/fs/zfs/commands.txt @@ -87,6 +87,7 @@ /opt/SUNWstc-fs-zfs/bin/groupadd /opt/SUNWstc-fs-zfs/bin/groupdel /opt/SUNWstc-fs-zfs/bin/groupmod +/opt/SUNWstc-fs-zfs/bin/groupshow /usr/bin/head /bin/hostname /bin/kill --- old/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/poolversion/cleanup.ksh +++ new/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/poolversion/cleanup.ksh @@ -40,7 +40,4 @@ log_must $ZPOOL destroy $TESTPOOL log_must $ZPOOL destroy $TESTPOOL2 -log_must $RM /tmp/zpool_version_1.dat -log_must $RM /tmp/zpool2_version_1.dat - default_cleanup --- old/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/poolversion/setup.ksh +++ new/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/poolversion/setup.ksh @@ -37,13 +37,13 @@ log_unsupported "zpool version property not supported on this system." 
fi +verify_disk_count "$DISKS" 2 +DISKS_ARRAY=($DISKS) # create a version 1 pool -log_must $MKFILE 64m /tmp/zpool_version_1.dat -log_must $ZPOOL create -o version=1 $TESTPOOL /tmp/zpool_version_1.dat +log_must $ZPOOL create -o version=1 $TESTPOOL ${DISKS_ARRAY[0]} # create another version 1 pool -log_must $MKFILE 64m /tmp/zpool2_version_1.dat -log_must $ZPOOL create -o version=1 $TESTPOOL2 /tmp/zpool2_version_1.dat +log_must $ZPOOL create -o version=1 $TESTPOOL2 ${DISKS_ARRAY[1]} log_pass --- old/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/scrub_mirror/scrub_mirror_common.kshlib +++ new/cddl/tools/regression/stc/src/suites/fs/zfs/tests/functional/scrub_mirror/scrub_mirror_common.kshlib @@ -61,7 +61,7 @@ typeset scrubbed="false" while [[ "$scrubbed" != "true" ]]; do $ZPOOL status $POOL | $GREP -s "scrub" \ - | $GREP -i "completed" + | $GREP -i "repaired" if [[ $? -eq 0 ]]; then scrubbed="true" fi --- old/cddl/usr.bin/zstreamdump/Makefile +++ new/cddl/usr.bin/zstreamdump/Makefile @@ -20,7 +20,7 @@ DPADD= ${LIBM} ${LIBNVPAIR} ${LIBUMEM} ${LIBZPOOL} \ ${LIBPTHREAD} ${LIBZ} ${LIBAVL} -LDADD= -lm -lnvpair -lumem -lzpool -lpthread -lz -lavl +LDADD= -lm -lnvpair -lumem -lzpool -lpthread -lz -lavl -luutil CSTD= c99 --- old/cddl/usr.bin/ztest/Makefile +++ new/cddl/usr.bin/ztest/Makefile @@ -18,8 +18,12 @@ CFLAGS+= -I${.CURDIR}/../../lib/libumem DPADD= ${LIBM} ${LIBNVPAIR} ${LIBUMEM} ${LIBZPOOL} \ - ${LIBPTHREAD} ${LIBAVL} -LDADD= -lm -lnvpair -lumem -lzpool -lpthread -lavl + ${LIBPTHREAD} ${LIBZ} ${LIBAVL} +LDADD= -lm -lnvpair -lumem -lzpool -lpthread -lz -lavl -luutil + +# Since there are many asserts in this program, it makes no sense to compile +# it without debugging. +CFLAGS+=-g -O0 CSTD= c99 --- old/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c +++ new/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c @@ -2,6 +2,8 @@ * Copyright (c) 2009 Pawel Jakub Dawidek * All rights reserved. * + * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -61,9 +63,10 @@ } SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini, NULL); -taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused, - int maxalloc __unused, uint_t flags) +static taskq_t * +taskq_create_with_init(const char *name, int nthreads, pri_t pri, + int minalloc __unused, int maxalloc __unused, uint_t flags, + taskq_callback_fn ctor, taskq_callback_fn dtor) { taskq_t *tq; @@ -73,17 +76,34 @@ tq = kmem_alloc(sizeof(*tq), KM_SLEEP); tq->tq_queue = taskqueue_create(name, M_WAITOK, taskqueue_thread_enqueue, &tq->tq_queue); + if (ctor != NULL) + taskqueue_set_callback(tq->tq_queue, + TASKQUEUE_CALLBACK_TYPE_INIT, ctor, NULL); + if (dtor != NULL) + taskqueue_set_callback(tq->tq_queue, + TASKQUEUE_CALLBACK_TYPE_SHUTDOWN, dtor, NULL); (void) taskqueue_start_threads(&tq->tq_queue, nthreads, pri, "%s", name); return ((taskq_t *)tq); } taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused, + int maxalloc __unused, uint_t flags) +{ + + return (taskq_create_with_init(name, nthreads, pri, minalloc, maxalloc, + flags, NULL, NULL)); +} + +taskq_t * taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc, - int maxalloc, proc_t *proc __unused, uint_t flags) + int maxalloc, proc_t *proc __unused, uint_t flags, taskq_callback_fn ctor, + taskq_callback_fn dtor) { - return (taskq_create(name, nthreads, pri, minalloc, maxalloc, flags)); + return (taskq_create_with_init(name, nthreads, pri, minalloc, maxalloc, + flags, ctor, dtor)); } void --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ /** @@ -1049,7 +1050,7 @@ } fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) - panic("buffer modified while frozen!"); + panic("buffer %p modified while frozen!", buf); mutex_exit(&buf->b_hdr->b_freeze_lock); } @@ -1084,6 +1085,20 @@ mutex_exit(&buf->b_hdr->b_freeze_lock); } +boolean_t +arc_buf_frozen(arc_buf_t *buf) +{ + boolean_t frozen = B_TRUE; + + /* + * NB: Does not grab or assert the mutex because the caller more + * than likely cannot use the results in an atomic fashion. 
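+	 * The result is advisory and intended for debugging assertions
+	 * (e.g. the ZFS_DEBUG_MODIFY check when merging dbuf write ranges),
+	 * not for synchronization decisions.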
+ */ + if (buf->b_hdr->b_freeze_cksum == NULL) + frozen = B_FALSE; + return (frozen); +} + void arc_buf_thaw(arc_buf_t *buf) { @@ -1121,6 +1136,15 @@ hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); +#ifdef ZFS_DEBUG + if (buf->b_hdr->b_freeze_cksum == NULL && buf->b_hdr->b_state != arc_anon) { + printf("%s: invalid state: freeze_cksum=%p, b_state=%p\n", + __func__, buf->b_hdr->b_freeze_cksum, buf->b_hdr->b_state); + printf("arc_anon=%p arc_mru=%p arc_mru_ghost=%p arc_mfu=%p " + "arc_mfu_ghost=%p arc_l2c_only=%p\n", arc_anon, arc_mru, + arc_mru_ghost, arc_mfu, arc_mfu_ghost, arc_l2c_only); + } +#endif ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); @@ -2974,6 +2998,7 @@ kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = spa_load_guid(spa); + boolean_t cached_only = (*arc_flags & ARC_CACHED_ONLY) != 0; top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), @@ -2984,6 +3009,12 @@ if (HDR_IO_IN_PROGRESS(hdr)) { + /* + * Cache lookups should only occur from consumers + * that do not have any context loaded yet. This + * means that no I/O should be in progress for them. + */ + ASSERT(!cached_only); if (*arc_flags & ARC_WAIT) { cv_wait(&hdr->b_cv, hash_lock); mutex_exit(hash_lock); @@ -3055,6 +3086,13 @@ uint64_t addr; boolean_t devw = B_FALSE; + if (cached_only) { + if (hdr) + mutex_exit(hash_lock); + done(NULL, NULL, private); + return (0); + } + if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists; @@ -3330,7 +3368,7 @@ } /** - * \brief Release this buffer from the cache. + * \brief Convert to an anonymous buffer. * * This must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make a new hdr for the @@ -4419,7 +4457,7 @@ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { - zio->io_error = EIO; + ZIO_SET_ERROR(zio, EIO); } if (!equal) ARCSTAT_BUMP(arcstat_l2_cksum_bad); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ #include @@ -36,10 +37,35 @@ #include #include #include +#include static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); +static zio_t *dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, + dmu_tx_t *tx); +static arc_evict_func_t dbuf_do_evict; + +#define IN_RANGE(x, val, y) ((val) >= (x) && (val) <= (y)) +#ifdef ZFS_DEBUG +#define DEBUG_REFCOUNT_INC(rc) refcount_acquire(&(rc)) +#define DEBUG_REFCOUNT_DEC(rc) do { \ + refcount_release(&(rc)); \ + ASSERT((rc) >= 0); \ +} while (0) +#define DEBUG_COUNTER_INC(counter) atomic_add_64(&(counter), 1) +#else +#define DEBUG_REFCOUNT_INC(rc) do { } while (0) +#define DEBUG_REFCOUNT_DEC(rc) do { } while (0) +#define DEBUG_COUNTER_INC(counter) do { } while (0) +#endif + +#define _DBUF_CONSTANT_FMT \ + " offset %"PRIu64" os %p level %d holds %"PRIi64" dirty %d state %d\n" +#define _DBUF_CONSTANT_FMT_ARGS(db) \ + (db)->db.db_offset, (db)->db_objset, (db)->db_level, \ + refcount_count(&(db)->db_holds), (db)->db_dirtycnt, (db)->db_state + +#define tmpprintf(args...) 
do { } while (0) /** * \brief Global data structures and functions for the dbuf cache. @@ -73,7 +99,25 @@ * \brief dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS DBUF"); +#define SYSCTL_REFCOUNT(name, desc) \ + int name; \ + SYSCTL_QUAD(_vfs_zfs_dbuf, OID_AUTO, name, CTLFLAG_RD, \ + &name, 0, desc) +#define SYSCTL_COUNTER_U(name, desc) \ + uint64_t name; \ + SYSCTL_QUAD(_vfs_zfs_dbuf, OID_AUTO, name, CTLFLAG_RD, \ + &name, 0, desc) + +#ifdef ZFS_DEBUG +SYSCTL_REFCOUNT(dirty_ranges_in_flight, "number of dirty ranges in flight"); +SYSCTL_COUNTER_U(dirty_ranges_total, "number of total dirty ranges"); +SYSCTL_COUNTER_U(user_evicts, "number of user evicts performed"); +#endif +SYSCTL_COUNTER_U(dirty_writes_lost, "dirty writes lost"); + static uint64_t dbuf_hash_count; static uint64_t @@ -95,6 +139,30 @@ return (crc); } +#ifdef ZFS_DEBUG +#define DBUF_STATE_CHANGE(db, op, state, why) do { \ + (db)->db_state op state; \ + if (zfs_flags & ZFS_DEBUG_DBUF_STATE) { \ + uint64_t __db_obj = (db)->db.db_object; \ + char __db_buf[32]; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof(__db_buf), \ + "%lld", (u_longlong_t)__db_obj); \ + __dprintf(__FILE__, __func__, __LINE__, \ + "%s: dbp=%p arc=%p obj=%s, lvl=%u blkid=%lld " \ + "state change (" #op " " #state "): %s\n", \ + __func__, db, (db)->db_buf, __db_buf, \ + (db)->db_level, (u_longlong_t)(db)->db_blkid, why); \ + } \ +} while(0) +#else +#define DBUF_STATE_CHANGE(db, op, state, why) do { \ + (db)->db_state op state; \ +} while(0) +#endif + #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ @@ -202,22 +270,125 @@ atomic_add_64(&dbuf_hash_count, -1); } -static arc_evict_func_t dbuf_do_evict; +static void +dbuf_update_user_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_level == 0 && + db->db_user != NULL && db->db_user->user_data_ptr_ptr != NULL) { + ASSERT(!refcount_is_zero(&db->db_holds)); + *db->db_user->user_data_ptr_ptr = db->db.db_data; + } +} +/** + * DMU buffer user eviction mechanism. + * See dmu_buf_user_t about how this works. + */ static void -dbuf_evict_user(dmu_buf_impl_t *db) +dbuf_queue_user_evict(dmu_buf_impl_t *db, list_t *evict_list_p) { ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level != 0 || db->db_evict_func == NULL) + if (db->db_level != 0 || db->db_user == NULL) return; - if (db->db_user_data_ptr_ptr) - *db->db_user_data_ptr_ptr = db->db.db_data; - db->db_evict_func(&db->db, db->db_user_ptr); - db->db_user_ptr = NULL; - db->db_user_data_ptr_ptr = NULL; - db->db_evict_func = NULL; + DEBUG_COUNTER_INC(user_evicts); + ASSERT(!list_link_active(&db->db_user->evict_queue_link)); + list_insert_head(evict_list_p, db->db_user); + db->db_user = NULL; +} + +/** + * \brief Update the user eviction data for the DMU buffer. + * + * \param db_fake The DMU buffer to set the data for. + * \param old_user The old user's eviction data pointer. + * \param new_user The new user's eviction data pointer. + * + * \returns NULL on success, or the existing user ptr if it's already + * been set. 
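+ *	Callers that pass a non-NULL old_user can detect success by the
+ *	return value being equal to old_user.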
+ */ +dmu_buf_user_t * +dmu_buf_update_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(db->db_level == 0); + + mutex_enter(&db->db_mtx); + + if (db->db_user == old_user) { + db->db_user = new_user; + dbuf_update_user_data(db); + } else + old_user = db->db_user; + + mutex_exit(&db->db_mtx); + return (old_user); +} + +dmu_buf_user_t * +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + + return (dmu_buf_update_user(db_fake, NULL, user)); +} + +dmu_buf_user_t * +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_immediate_evict = TRUE; + return (dmu_buf_update_user(db_fake, NULL, user)); +} + +/** + * \return the db_user set with dmu_buf_update_user(), or NULL if not set. + */ +dmu_buf_user_t * +dmu_buf_get_user(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(!refcount_is_zero(&db->db_holds)); + + return (db->db_user); +} + +/** + * Clear the dbuf's ARC buffer. + */ +static void +dbuf_clear_data(dmu_buf_impl_t *db, list_t *evict_list_p) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); + db->db_buf = NULL; + dbuf_queue_user_evict(db, evict_list_p); + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "clear data"); +} + +/** + * Set the dbuf's buffer to the ARC buffer, including any associated state, + * such as db_data. + */ +static void +dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); + ASSERT(buf != NULL); + + db->db_buf = buf; + db->db_buf->b_last_dbuf = db; + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); + dbuf_update_user_data(db); } boolean_t @@ -237,13 +408,13 @@ } void -dbuf_evict(dmu_buf_impl_t *db) +dbuf_evict(dmu_buf_impl_t *db, list_t *evict_list_p) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_buf == NULL); ASSERT(db->db_data_pending == NULL); - dbuf_clear(db); + dbuf_clear(db, evict_list_p); dbuf_destroy(db); } @@ -302,6 +473,8 @@ { dnode_t *dn; dbuf_dirty_record_t *dr; + dbuf_dirty_record_t *dr_next; + dbuf_dirty_record_t *pending; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -334,11 +507,23 @@ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } - for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) + pending = NULL; + for (dr = list_head(&db->db_dirty_records); dr != NULL; dr = dr_next) { + dr_next = list_next(&db->db_dirty_records, dr); ASSERT(dr->dr_dbuf == db); - - for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) - ASSERT(dr->dr_dbuf == db); + ASSERT(dr_next == NULL || dr->dr_txg > dr_next->dr_txg); + /* This DR happens to be the pending DR. */ + if (dr == db->db_data_pending) { + pending = dr; + ASSERT(dr_next == NULL); + } + } + if (db->db_data_pending != NULL) { + /* The pending DR's dbuf is this dbuf. */ + ASSERT(db->db_data_pending->dr_dbuf == db); + /* The pending DR should be on the list. */ + ASSERT(pending == db->db_data_pending); + } /* * We can't assert that db_size matches dn_datablksz because it @@ -384,6 +569,16 @@ } } } + /* + * XXX + * We may need to modify the state check here if something may be + * in DB_FILL and have dirty parts, depending on how db_state + * semantics are changed. 
+ * + * XXX + * Why does this ignore DB_FILL in the first place? DB_FILL + * still dirties the buffer and must be sunk too. + */ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && @@ -402,42 +597,43 @@ } } } + + /*** Dbuf state checks. */ + /* If a dbuf is partial, it can only have one dirty record. */ + ASSERT((db->db_state & DB_PARTIAL) == 0 || db->db_dirtycnt == 1); + + /* + * Returns 1 if either the bitmask is not set or those are the only + * bits set, with exceptions where they are acceptable. + */ +#define BITMASK_SET(val, bitmask, exceptions) \ + (((val) & (bitmask)) == 0 || ((val) & (~(bitmask|exceptions))) == 0) +#define BITMASK_SET_EXCLUSIVE(val, bitmask) BITMASK_SET(val, bitmask, 0) + + ASSERT(BITMASK_SET_EXCLUSIVE(db->db_state, DB_UNCACHED)); + ASSERT(BITMASK_SET_EXCLUSIVE(db->db_state, DB_NOFILL)); + ASSERT(BITMASK_SET_EXCLUSIVE(db->db_state, DB_CACHED)); + ASSERT(BITMASK_SET_EXCLUSIVE(db->db_state, DB_EVICTING)); + ASSERT(BITMASK_SET(db->db_state, DB_PARTIAL, DB_FILL)); + ASSERT(BITMASK_SET(db->db_state, DB_READ, DB_FILL)); + ASSERT(BITMASK_SET(db->db_state, DB_FILL, (DB_PARTIAL|DB_READ))); +#undef BITMASK_SET_EXCLUSIVE +#undef BITMASK_SET + DB_DNODE_EXIT(db); } #endif -static void -dbuf_update_data(dmu_buf_impl_t *db) +static arc_buf_t * +dbuf_alloc_arcbuf(dmu_buf_impl_t *db) { - ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level == 0 && db->db_user_data_ptr_ptr) { - ASSERT(!refcount_is_zero(&db->db_holds)); - *db->db_user_data_ptr_ptr = db->db.db_data; - } -} + spa_t *spa; + arc_buf_t *buf; -/** - * Set the dbuf's buffer to the ARC buffer, including any associated state, - * such as db_data. - */ -static void -dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); - db->db_buf = buf; - if (buf != NULL) { - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); - dbuf_update_data(db); - } else { - dbuf_evict_user(db); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; - } + DB_GET_SPA(&spa, db); + buf = arc_buf_alloc(spa, db->db.db_size, db, DBUF_GET_BUFC_TYPE(db)); + buf->b_last_dbuf = db; + return (buf); } /** @@ -449,6 +645,9 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; + list_t evict_list; + + dmu_buf_create_user_evict_list(&evict_list); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { @@ -462,9 +661,10 @@ } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); - dbuf_set_data(db, NULL); + dbuf_clear_data(db, &evict_list); mutex_exit(&db->db_mtx); } + dmu_buf_destroy_user_evict_list(&evict_list); return (abuf); } @@ -479,35 +679,364 @@ } } +typedef struct dbuf_dirty_record_hole { + caddr_t src; + caddr_t dst; + int size; +} dbuf_dirty_record_hole_t; + +typedef struct dbuf_dirty_record_hole_itr { + /* provided data */ + arc_buf_t *src; + dbuf_dirty_leaf_record_t *dl; + /* calculated data */ + dbuf_dirty_range_t *range; + /* One greater than the last valid offset in the dst buffer */ + int max_offset; + int hole_start; + dbuf_dirty_record_hole_t hole; +} dbuf_dirty_record_hole_itr_t; + +/** + * \brief Initialize a dirty record hole iterator. + * + * \param itr Iterator context to initialize. + * \param dl Dirty leaf to merge. + * \param src_buf ARC buffer containing initial data. 
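+ *
+ * For example, given write ranges [0,512) and [1024,1536) in a 2048-byte
+ * buffer, the iterator yields the holes [512,1024) and [1536,2048), each
+ * of which is then filled from src_buf.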
+ */ +static inline void +dbuf_dirty_record_hole_itr_init(dbuf_dirty_record_hole_itr_t *itr, + dbuf_dirty_leaf_record_t *dl, arc_buf_t *src_buf) +{ + itr->src = src_buf; + itr->dl = dl; + itr->max_offset = MIN(arc_buf_size(src_buf), arc_buf_size(dl->dr_data)); + itr->range = list_head(&dl->write_ranges); + ASSERT((zfs_flags & ZFS_DEBUG_MODIFY) == 0 || + !arc_buf_frozen(dl->dr_data)); + itr->hole.src = NULL; + itr->hole.dst = NULL; + itr->hole.size = 0; + /* If no ranges exist, the dirty buffer is entirely valid. */ + if (itr->range == NULL) { + /* Set to the end to return no holes */ + itr->hole_start = itr->max_offset; + } else if (itr->range->start == 0) { + itr->hole_start = itr->range->size; + itr->range = list_next(&itr->dl->write_ranges, itr->range); + } else + itr->hole_start = 0; +} + +/** + * \brief Iterate a dirty record, providing the next hole. + * + * \param itr Dirty record hole iterator context. + * + * The hole returned provides direct pointers to the source, destination, + * and the target size. A hole is a portion of the dirty record's ARC + * buffer that does not contain valid data and must be filled in using the + * initial ARC buffer, which should be entirely valid. + * + * \return NULL If there are no more holes. + */ +static inline dbuf_dirty_record_hole_t * +dbuf_dirty_record_hole_itr_next(dbuf_dirty_record_hole_itr_t *itr) +{ + + if (itr->hole_start >= itr->max_offset) + return (NULL); + + itr->hole.src = (caddr_t)(itr->src->b_data) + itr->hole_start; + itr->hole.dst = (caddr_t)(itr->dl->dr_data->b_data) + itr->hole_start; + if (itr->range != NULL) { + itr->hole.size = MIN(itr->max_offset, itr->range->start) - + itr->hole_start; + itr->hole_start = itr->range->end; + itr->range = list_next(&itr->dl->write_ranges, itr->range); + } else { + itr->hole.size = itr->max_offset - itr->hole_start; + itr->hole_start = itr->max_offset; + } + return (&itr->hole); +} + +/* + * Perform any dbuf arc buffer splits required to guarantee + * the syncer operates on a stable buffer. + * + * \param db The dbuf to potentially split. + * \param syncer_dr The dirty record being processed by the syncer. + * \param deferred_split True if this check is being performed after a + * resolving read. + * + * If the syncer's buffer is currently "in use" in the + * open transaction group (i.e., there are active holds + * and db_data still references it), then make a copy + * before we start the write, so that any modifications + * from the open txg will not leak into this write. + * + * \note This copy does not need to be made for objects + * only modified in the syncing context (e.g. + * DNONE_DNODE blocks). + */ +static void +dbuf_syncer_split(dmu_buf_impl_t *db, dbuf_dirty_record_t *syncer_dr, + boolean_t deferred_split) +{ + if (syncer_dr && (db->db_state & DB_NOFILL) == 0 && + refcount_count(&db->db_holds) > 1 && + syncer_dr->dt.dl.dr_override_state != DR_OVERRIDDEN && + syncer_dr->dt.dl.dr_data == db->db_buf) { + arc_buf_t *buf; + + buf = dbuf_alloc_arcbuf(db); + bcopy(db->db.db_data, buf->b_data, db->db.db_size); + if (deferred_split) { + /* + * In the case of a deferred split, the + * syncer has already generated a zio that + * references the syncer's arc buffer. + * Replace the open txg buffer instead. + * No activity in the open txg can be + * occurring yet. A reader is waiting + * for the resolve to complete, and a + * writer hasn't gotten around to creating + * a dirty record. Otherwise this dbuf + * would have already have been split. 
+ */ + dbuf_set_data(db, buf); + } else { + /* + * The syncer has yet to create a write + * zio and since the dbuf may be in the + * CACHED state, activity in the open + * txg may be occurring. Switch out + * the syncer's dbuf, since it can tolerate + * the change. + */ + syncer_dr->dt.dl.dr_data = buf; + } + } +} + +/** + * \brief Merge write ranges for a dirty record. + * + * \param dl Dirty leaf record to merge the old buffer to. + * \param buf The old ARC buffer to use for missing data. + * + * This function performs an inverse merge. The write ranges provided + * indicate valid data in the dirty leaf's buffer, which means the old + * buffer has to be copied over exclusive of those ranges. + */ +static void +dbuf_merge_write_ranges(dbuf_dirty_leaf_record_t *dl, arc_buf_t *old_buf) +{ + dbuf_dirty_record_hole_itr_t itr; + dbuf_dirty_record_hole_t *hole; + + ASSERT3P(dl, !=, NULL); + /* If there are no write ranges, we're done. */ + if (list_is_empty(&dl->write_ranges)) + return; + /* If there are write ranges, there must be an ARC buffer. */ + ASSERT(dl->dr_data != NULL); + + /* + * We use an iterator here because it simplifies the logic + * considerably for this function. + */ + dbuf_dirty_record_hole_itr_init(&itr, dl, old_buf); + + while ((hole = dbuf_dirty_record_hole_itr_next(&itr)) != NULL) + memcpy(hole->dst, hole->src, hole->size); +} + +/** + * \brief Resolve a dbuf using its ranges and the filled ARC buffer provided. + * + * \param db Dbuf to resolve. + * \param buf ARC buffer to use to resolve. + * + * This routine is called after a read completes. The results of the read + * are stored in the ARC buffer. It will then merge writes in the order + * that they occurred, cleaning up write ranges as it goes. + */ +static void +dbuf_resolve_ranges(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + dbuf_dirty_record_t *dr; + dbuf_dirty_leaf_record_t *dl; + arc_buf_t *old_buf; + + /* No range data is kept for non data blocks. */ + ASSERT3U(db->db_level, ==, 0); + + /* + * Start with the oldest dirty record, merging backwards. For the + * first dirty record, the provided ARC buffer is the "old" buffer. + * + * In turn, the older buffer is copied to the newer one, using an + * inverse of the newer one's write ranges. + */ + dr = list_tail(&db->db_dirty_records); + old_buf = buf; + while (dr != NULL) { + dl = &dr->dt.dl; + ASSERT(dl->dr_data); + dbuf_merge_write_ranges(dl, old_buf); + /* + * Now that we have updated the buffer, freeze it. However, + * if the FILL bit is set, someone else is actively + * modifying the current buffer, and will be responsible for + * freezing that buffer. + */ + if (dl->dr_data != db->db_buf || !(db->db_state & DB_FILL)) + arc_buf_freeze(dl->dr_data); + dbuf_dirty_record_cleanup_ranges(dr); + old_buf = dl->dr_data; + dr = list_prev(&db->db_dirty_records, dr); + } + + /* + * Process any deferred syncer splits now that the buffer contents + * are fully valid. 
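+	 * The deferred_split argument is B_TRUE because this call is made
+	 * after the resolving read completes, when the syncer may already
+	 * have issued a zio against its own copy of the buffer.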
+ */ + dbuf_syncer_split(db, db->db_data_pending, /*deferred_split*/B_TRUE); +} + +static void +dbuf_process_buf_sets(dmu_buf_impl_t *db, boolean_t err) +{ + dmu_context_node_t *dcn, *next; + + for (dcn = list_head(&db->db_dmu_buf_sets); dcn != NULL; dcn = next) { + next = list_next(&db->db_dmu_buf_sets, dcn); + dmu_buf_set_rele(dcn->buf_set, err); + dmu_context_node_remove(&db->db_dmu_buf_sets, dcn); + } +} +#define DBUF_PROCESS_BUF_SETS(db, err) do { \ + if (!list_is_empty(&(db)->db_dmu_buf_sets)) \ + dbuf_process_buf_sets(db, err); \ +} while (0) + +static void +dbuf_read_complete(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + + if (db->db_level == 0 && db->db_dirtycnt > 0) { + dbuf_dirty_record_t *pending = db->db_data_pending; + boolean_t resolving_write_pending = pending != NULL && + !list_is_empty(&pending->dt.dl.write_ranges) && + pending->dr_zio != NULL; + + /* + * Buffers in the FILL state are valid here if the read was + * issued prior to a write that completely filled. + */ + ASSERT(db->db_buf != buf); + ASSERT(db->db_state == DB_CACHED || + db->db_state == DB_UNCACHED || + db->db_state == DB_FILL || + (db->db_state & DB_READ)); + + /* + * Fill any holes in the dbuf's dirty records + * with the original block we read. + */ + dbuf_resolve_ranges(db, buf); + + if (db->db_state == DB_READ) { + /* + * The most recent version of this block + * was waiting on this read. Transition + * to cached. + */ + ASSERT(db->db_buf != NULL); + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "resolve of records in READ state"); + } else if (db->db_state & DB_READ) { + /* + * Clear the READ bit; let fill_done transition us + * to DB_CACHED. + */ + ASSERT(db->db_state & DB_FILL); + DBUF_STATE_CHANGE(db, &=, ~DB_READ, + "resolve of records with READ state bit set"); + } + + /* + * The provided buffer is no longer relevant to the + * current transaction group. Discard it. + */ + arc_discard_buf(buf, db); + + /* Dispatch any deferred syncer writes. */ + if (resolving_write_pending) + zio_nowait(pending->dr_zio); + } else if (db->db_state == DB_READ) { + /* + * Read with no dirty data. Use the buffer we + * read and transition to DB_CACHED. + */ + dbuf_set_data(db, buf); + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "read completed with no dirty records"); + } else { + /* + * The block was free'd or filled before this read could + * complete. Note that in this case, it satisfies the reader + * since the frontend must already be populated. 
+ */ + ASSERT(db->db_buf != NULL); + ASSERT(db->db_state == DB_CACHED || + db->db_state == DB_UNCACHED); + arc_discard_buf(buf, db); + } + DBUF_PROCESS_BUF_SETS(db, B_FALSE); +} + static void dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; + dbuf_dirty_record_t *dr; + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + mutex_enter(&db->db_mtx); - ASSERT3U(db->db_state, ==, DB_READ); - /* - * All reads are synchronous, so we must have a hold on the dbuf - */ + + dprintf_dbuf(db, "%s: zio=%p arc=%p\n", __func__, zio, buf); + + /* Any reads or writes must have a hold on this dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - if (db->db_level == 0 && db->db_freed_in_flight) { - /* we were freed in flight; disregard any error */ - arc_release(buf, db); - bzero(buf->b_data, db->db.db_size); - arc_buf_freeze(buf); - db->db_freed_in_flight = FALSE; - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; - } else if (zio == NULL || zio->io_error == 0) { - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; + + if (zio == NULL || zio->io_error == 0) { + /* Read succeeded. */ + dbuf_read_complete(db, buf); } else { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); + /* Read failed. */ + if (db->db_dirtycnt > 0) { + /* + * The failure of this read has already been + * communicated to the user by the zio pipeline. + * Limit our losses to just the data we can't + * read by filling any holes in our dirty records + * with zeros. + */ + bzero(buf->b_data, arc_buf_size(buf)); + arc_buf_freeze(buf); + dbuf_read_complete(db, buf); + atomic_add_64(&dirty_writes_lost, 1); + } else { + ASSERT3P(db->db_buf, ==, NULL); + db->db_state = DB_UNCACHED; + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "read failed"); + DBUF_PROCESS_BUF_SETS(db, B_TRUE); + } VERIFY(arc_buf_remove_ref(buf, db) == 1); - db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL); @@ -522,12 +1051,12 @@ * \returns whether any action was taken. */ static boolean_t -dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn) +dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t *flags) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); if (db->db_blkid != DMU_BONUS_BLKID) - return (FALSE); + return B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(DB_DNODE_HELD(db)); @@ -538,11 +1067,9 @@ bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); - DB_DNODE_EXIT(db); - dbuf_update_data(db); - db->db_state = DB_CACHED; - mutex_exit(&db->db_mtx); - return (TRUE); + dbuf_update_user_data(db); + DBUF_STATE_CHANGE(db, =, DB_CACHED, "bonus buffer filled"); + return (B_TRUE); } /** @@ -552,13 +1079,18 @@ * \param dn Dnode for the dbuf. * \param flags Dbuf read flags pointer. * + * \invariant The dbuf's mutex must be held. + * \note If any action was taken, this function drops db_mtx. + * * \returns whether any action was taken. 
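+ *
+ * \note A "hole" here means the dbuf has no allocated on-disk block
+ *       (db_blkptr is NULL or BP_IS_HOLE()), so the request is satisfied
+ *       with a zero-filled buffer and no I/O is issued.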
*/ static boolean_t -dbuf_read_on_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t *flags) +dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t *flags) { int is_hole; + ASSERT(MUTEX_HELD(&db->db_mtx)); + is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr); /* * For level 0 blocks only, if the above check fails: @@ -571,20 +1103,43 @@ BP_IS_HOLE(db->db_blkptr); if (is_hole) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + arc_buf_t *buf; + + buf = dbuf_alloc_arcbuf(db); + bzero(buf->b_data, db->db.db_size); + DBUF_STATE_CHANGE(db, =, DB_READ, "hole read satisfied"); + dbuf_read_complete(db, buf); + return (B_TRUE); + } + return (B_FALSE); +} + +static void +dbuf_read_cached_done(zio_t *zio, arc_buf_t *buf, void *priv) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)priv; - dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, - db->db.db_size, db, type)); - DB_DNODE_EXIT(db); - bzero(db->db.db_data, db->db.db_size); - db->db_state = DB_CACHED; - *flags |= DB_RF_CACHED; - mutex_exit(&db->db_mtx); - return (TRUE); + if (buf != NULL) { + ASSERT(arc_buf_frozen(buf) && !arc_released(buf)); + db->db_state = DB_READ; /* for read_complete */ + dbuf_read_complete(db, buf); } - return (FALSE); } +/** + * \brief Actually read (or issue I/O for) a dbuf's block. + * + * \param db The dbuf to read. + * \param zio The parent zio to associate with. + * \param flags Pointer to the read flags. + * + * \note Flags will be modified to include DB_RF_CACHED if the call + * returns with the dbuf cached. + * \note The dbuf mutex will be dropped in all cases except if the + * DB_RF_CACHED flag is set. + * \note The DB_RF_CACHED flag has the effect of performing a + * cached-only read. + */ static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { @@ -597,24 +1152,47 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); - /* We need the struct_rwlock to prevent db_blkptr from changing. */ - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_state == DB_UNCACHED); - ASSERT(db->db_buf == NULL); + ASSERT(db->db_state == DB_UNCACHED || (db->db_state & DB_PARTIAL)); - if (dbuf_read_bonus(db, dn)) + if (dbuf_read_bonus(db, dn, flags) || dbuf_read_hole(db, dn, flags)) { + DB_DNODE_EXIT(db); + *flags |= DB_RF_CACHED; + if ((*flags & DB_RF_CACHED_ONLY) == 0) + mutex_exit(&db->db_mtx); return; + } + + spa = dn->dn_objset->os_spa; - if (dbuf_read_on_hole(db, dn, flags)) + /* Check to see if a caller only wants cached buffers. */ + if (*flags & DB_RF_CACHED_ONLY) { + ASSERT(db->db_state == DB_UNCACHED && db->db_buf == NULL && + db->db_dirtycnt == 0); + aflags = ARC_CACHED_ONLY; + (void) arc_read(/*pio*/NULL, spa, db->db_blkptr, /*pbuf*/NULL, + dbuf_read_cached_done, db, /*priority*/0, /*zio_flags*/0, + &aflags, /*zb*/NULL); + + if (aflags & ARC_CACHED) + *flags |= DB_RF_CACHED; + DB_DNODE_EXIT(db); + /* Cache lookups never drop the dbuf mutex. */ return; + } - spa = dn->dn_objset->os_spa; DB_DNODE_EXIT(db); - db->db_state = DB_READ; + DBUF_STATE_CHANGE(db, =, DB_READ, "read issued"); mutex_exit(&db->db_mtx); + /* + * db_blkptr is protected by both the dbuf mutex and the associated + * struct_rwlock. The caller must acquire struct_rwlock before + * reads that may sleep without the dbuf mutex held. + */ + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_L2CACHE; @@ -638,6 +1216,34 @@ *flags |= DB_RF_CACHED; } +/** + * \brief Find a dbuf's block in the ARC, if it's there. 
+ * + * \param db Dbuf to find the block for. + * \param dn Dnode for the dbuf. + * + * \note Calling this function is equivalent to calling dbuf_read, + * but only if the block is already in the cache. + * \note This function only applies to level 0 blocks. + * + * \returns whether it was there. + */ +static boolean_t +dbuf_read_cached(dmu_buf_impl_t *db, dnode_t *dn) +{ + int rflags = DB_RF_CACHED_ONLY; + boolean_t held = RW_WRITE_HELD(&dn->dn_struct_rwlock); + + ASSERT(DB_DNODE_HELD(db)); + + /* Make sure read_impl doesn't change its contract with us. */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + dbuf_read_impl(db, NULL, &rflags); + ASSERT(MUTEX_HELD(&db->db_mtx)); + + return (db->db_state == DB_CACHED); +} + int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { @@ -673,7 +1279,7 @@ if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); - } else if (db->db_state == DB_UNCACHED) { + } else if (db->db_state & (DB_UNCACHED|DB_PARTIAL)) { spa_t *spa = dn->dn_objset->os_spa; if (zio == NULL) @@ -712,8 +1318,7 @@ /* Skip the wait on the caller's request. */ if ((flags & DB_RF_NEVERWAIT) == 0) { mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { + while (db->db_state & (DB_READ|DB_FILL)) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); cv_wait(&db->db_changed, &db->db_mtx); @@ -728,86 +1333,6 @@ return (err); } -static void -dbuf_noread(dmu_buf_impl_t *db) -{ - ASSERT(!refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_UNCACHED) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa; - - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - DB_GET_SPA(&spa, db); - dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); - db->db_state = DB_FILL; - } else if (db->db_state == DB_NOFILL) { - dbuf_set_data(db, NULL); - } else { - ASSERT3U(db->db_state, ==, DB_CACHED); - } - mutex_exit(&db->db_mtx); -} - -/** - * \brief This is our just-in-time copy function. - * - * It makes a copy of buffers that have been modified in a previous transaction - * group, before we modify them in the current active group. - * - * This function is used in two places: when we are dirtying a - * buffer for the first time in a txg, and when we are freeing - * a range in a dnode that includes this buffer. - * - * Note that when we are called from dbuf_free_range() we do - * not put a hold on the buffer, we just traverse the active - * dbuf list for the dnode. - */ -static void -dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) -{ - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db.db_data != NULL); - ASSERT(db->db_level == 0); - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); - - if (dr == NULL || - (dr->dt.dl.dr_data != - ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) - return; - - /* - * If the last dirty record for this dbuf has not yet synced - * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, - * or (if there a no active holders) - * just null out the current db_data pointer. 
- */ - ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DMU_BONUS_BLKID) { - /* Note that the data bufs here are zio_bufs */ - dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); - } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - int size = db->db.db_size; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa; - - DB_GET_SPA(&spa, db); - dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); - } else { - dbuf_set_data(db, NULL); - } -} - /** * \brief Signal that the dirty record is about to be re-dirtied after sync. * @@ -833,7 +1358,7 @@ ASSERT(db->db_data_pending != dr); - /* free this block */ + /* Free this block. */ if (!BP_IS_HOLE(bp)) { spa_t *spa; @@ -841,6 +1366,7 @@ zio_free(spa, txg, bp); } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are @@ -853,6 +1379,192 @@ } /** + * \brief Disassociate the frontend for any older transaction groups of a + * dbuf that is inside a range being freed. + * + * \param db Dbuf whose dirty records should be handled. + * \param dn Dnode for the dbuf. + * \param tx Transaction that the free range operation applies to. + * \param evict_list_p Dbuf user eviction list (see dmu_buf_user_t). + * + * This function's primary purpose is to ensure that the state of any dirty + * records affected by the operation remain consistent. + */ +static void +dbuf_free_range_disassociate_frontend(dmu_buf_impl_t *db, dnode_t *dn, + dmu_tx_t *tx, list_t *evict_list_p) +{ + dbuf_dirty_record_t *dr; + + dr = list_head(&db->db_dirty_records); + tmpprintf("%s db %p dr %p holds %d dirties %d txg %"PRIu64"\n", + __func__, db, dr, refcount_count(&db->db_holds), + db->db_dirtycnt, tx->tx_txg); + + if (dr == NULL) + return; + + if (dr->dr_txg == tx->tx_txg) { + /* + * This buffer is "in-use", re-adjust the file size to reflect + * that this buffer may contain new data when we sync. + */ + if (db->db_blkid != DMU_SPILL_BLKID && + db->db_blkid > dn->dn_maxblkid) + dn->dn_maxblkid = db->db_blkid; + /* Handle intermediate dmu_sync() calls. */ + dbuf_unoverride(dr); + + /* + * If this buffer is still waiting on data for a RMW merge, that + * data no longer applies to this buffer. Transition to cached. + */ + dbuf_dirty_record_cleanup_ranges(dr); + } else { + if (db->db_state & DB_PARTIAL) { + /* + * Schedule resolution for the older transaction + * group's dirty record before we change the dbuf's + * state and lose track of the PARTIAL state. + */ + dbuf_transition_to_read(db); + } + /* Disassociate the frontend if necessary. */ + if (dr->dt.dl.dr_data == db->db_buf) { + arc_buf_t *buf; + + buf = dbuf_alloc_arcbuf(db); + if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + + /* + * Frontend being referenced by a user, but + * this dirty record has yet to be processed + * by the syncer. + */ + ASSERT(dr != db->db_data_pending); + if (db->db_state & DB_READ) { + /* + * The reader has yet to access the + * frontend (it must wait for the + * READ->CACHED transition), so it + * is safe to replace the frontend. + */ + dbuf_set_data(db, buf); + } else { + /* + * A reader is accessing the frontend, + * so we cannot replace it. + * Disassociate by replacing the + * buffer used for future syncer + * operations. 
+ */ + bcopy(db->db.db_data, buf->b_data, + db->db.db_size); + dr->dt.dl.dr_data = buf; + } + } else { + /* + * Foreground is currently unreferenced, but + * a future access that results in a READ + * will confuse in-progress resolution of + * dirty records for older transactions. + * Provide a buffer so any future consumers + * will see a dbuf in the CACHED state. + */ + dbuf_set_data(db, buf); + } + } + } +} + +/** + * \brief Dirty level 1 blocks for a free_range operation. + * + * \returns B_TRUE if an indirect block is processed. + */ +static boolean_t +dbuf_free_range_indirects(dnode_t *dn, dmu_buf_impl_t *db, uint64_t start, + uint64_t end, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t first_l1 = start >> epbs; + uint64_t last_l1 = end >> epbs; + + if (db->db_level == 0) + return (B_FALSE); + + if (db->db_level == 1 && IN_RANGE(first_l1, db->db_blkid, last_l1)) { + mutex_enter(&db->db_mtx); + dr = list_head(&db->db_dirty_records); + if (dr != NULL && dr->dr_txg < tx->tx_txg) { + dbuf_add_ref(db, FTAG); + mutex_exit(&db->db_mtx); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } else { + mutex_exit(&db->db_mtx); + } + } + return (B_TRUE); +} + +static boolean_t +dbuf_free_range_already_freed(dmu_buf_impl_t *db) +{ + /* XXX add comment about why these are OK */ + if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || + db->db_state == DB_EVICTING) { + ASSERT(db->db.db_data == NULL); + mutex_exit(&db->db_mtx); + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +dbuf_free_range_filler_will_free(dmu_buf_impl_t *db) +{ + if (db->db_state & DB_FILL) { + /* + * If the buffer is currently being filled, then its + * contents cannot be directly cleared. Signal the filler + * to have dbuf_fill_done perform the clear just before + * transitioning the buffer to the CACHED state. + */ + db->db_freed_in_flight = TRUE; + mutex_exit(&db->db_mtx); + return (B_TRUE); + } + return (B_FALSE); +} + +/** + * \brief If a dbuf has no users, clear it. + * + * \returns B_TRUE if the dbuf was cleared. + */ +static boolean_t +dbuf_clear_successful(dmu_buf_impl_t *db, list_t *evict_list_p) +{ + + if (refcount_count(&db->db_holds) == 0) { + /* All consumers are finished, so evict the buffer */ + ASSERT(db->db_buf != NULL); + dbuf_clear(db, evict_list_p); + return (B_TRUE); + } + return (B_FALSE); +} + +/** + * \brief Free a range of data blocks in a dnode. + * + * \param dn Dnode which the range applies to. + * \param start Starting block id of the range, inclusive. + * \param end Ending block id of the range, inclusive. + * \param tx Transaction to apply the free operation too. + * * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. 
Also, if we happen accross any level-1 dbufs in the @@ -863,116 +1575,68 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; - uint64_t txg = tx->tx_txg; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - uint64_t first_l1 = start >> epbs; - uint64_t last_l1 = end >> epbs; + list_t evict_list; + + dmu_buf_create_user_evict_list(&evict_list); - if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { + if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) end = dn->dn_maxblkid; - last_l1 = end >> epbs; - } - dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + + dprintf_dnode(dn, "start=%"PRIu64" end=%"PRIu64"\n", start, end); mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); - if (db->db_level == 1 && - db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { - mutex_enter(&db->db_mtx); - if (db->db_last_dirty && - db->db_last_dirty->dr_txg < txg) { - dbuf_add_ref(db, FTAG); - mutex_exit(&db->db_mtx); - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } else { - mutex_exit(&db->db_mtx); - } - } - - if (db->db_level != 0) + if (dbuf_free_range_indirects(dn, db, start, end, tx)) continue; - dprintf_dbuf(db, "found buf %s\n", ""); - if (db->db_blkid < start || db->db_blkid > end) + if (!IN_RANGE(start, db->db_blkid, end)) continue; - - /* found a level 0 buffer in the range */ if (dbuf_undirty(db, tx)) continue; mutex_enter(&db->db_mtx); - if (db->db_state == DB_UNCACHED || - db->db_state == DB_NOFILL || - db->db_state == DB_EVICTING) { - ASSERT(db->db.db_data == NULL); - mutex_exit(&db->db_mtx); - continue; - } - if (db->db_state == DB_READ || db->db_state == DB_FILL) { - /* will be handled in dbuf_read_done or dbuf_rele */ - db->db_freed_in_flight = TRUE; - mutex_exit(&db->db_mtx); - continue; - } - if (refcount_count(&db->db_holds) == 0) { - ASSERT(db->db_buf); - dbuf_clear(db); - continue; - } - /* The dbuf is referenced */ + DBUF_VERIFY(db); + if (dbuf_free_range_already_freed(db) || + dbuf_free_range_filler_will_free(db) || + dbuf_clear_successful(db, &evict_list)) + continue; /* db_mtx already exited */ - if (db->db_last_dirty != NULL) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - - if (dr->dr_txg == txg) { - /* - * This buffer is "in-use", re-adjust the file - * size to reflect that this buffer may - * contain new data when we sync. - */ - if (db->db_blkid != DMU_SPILL_BLKID && - db->db_blkid > dn->dn_maxblkid) - dn->dn_maxblkid = db->db_blkid; - dbuf_unoverride(dr); - } else { - /* - * This dbuf is not dirty in the open context. - * Either uncache it (if its not referenced in - * the open context) or reset its contents to - * empty. - */ - dbuf_fix_old_data(db, txg); - } - } - /* clear the contents if its cached */ - if (db->db_state == DB_CACHED) { - ASSERT(db->db.db_data != NULL); - arc_release(db->db_buf, db); - bzero(db->db.db_data, db->db.db_size); - arc_buf_freeze(db->db_buf); - } - + /* + * The goal is to make the data that is visible in the current + * transaction group all zeros, while preserving the data + * as seen in any earlier transaction groups. + */ + dbuf_free_range_disassociate_frontend(db, dn, tx, &evict_list); + ASSERT(db->db_buf != NULL); + arc_release(db->db_buf, db); + bzero(db->db.db_data, db->db.db_size); + arc_buf_freeze(db->db_buf); + DBUF_STATE_CHANGE(db, =, DB_CACHED, "zeroed by free"); mutex_exit(&db->db_mtx); + /* Process one dbuf at a time to reduce memory pressure. 
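+		 * Deferring every eviction until the whole dn_dbufs list has
+		 * been walked could otherwise leave a large backlog of queued
+		 * user evictions pending at once.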
+		 */
+		dmu_buf_process_user_evicts(&evict_list);
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
+	dmu_buf_destroy_user_evict_list(&evict_list);
 }
 
 static int
 dbuf_block_freeable(dmu_buf_impl_t *db)
 {
 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
+	dbuf_dirty_record_t *dr;
 	uint64_t birth_txg = 0;
 
 	/*
 	 * We don't need any locking to protect db_blkptr:
-	 * If it's syncing, then db_last_dirty will be set
-	 * so we'll ignore db_blkptr.
+	 * If it's syncing, then db_dirty_records will have
+	 * entries, so we'll ignore db_blkptr.
 	 */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	if (db->db_last_dirty)
-		birth_txg = db->db_last_dirty->dr_txg;
+	dr = list_head(&db->db_dirty_records);
+	if (dr != NULL)
+		birth_txg = dr->dr_txg;
 	else if (db->db_blkptr)
 		birth_txg = db->db_blkptr->blk_birth;
@@ -989,10 +1653,41 @@
 	return (FALSE);
 }
 
+static void
+dbuf_dirty_record_truncate_ranges(dbuf_dirty_record_t *dr, int new_size)
+{
+	dbuf_dirty_leaf_record_t *dl;
+	dbuf_dirty_range_t *range;
+
+	ASSERT(MUTEX_HELD(&dr->dr_dbuf->db_mtx));
+	if (dr->dr_dbuf->db_level != 0)
+		return;
+
+	dl = &dr->dt.dl;
+	for (;;) {
+		range = list_tail(&dl->write_ranges);
+
+		if (range->start >= new_size) {
+			list_remove(&dl->write_ranges, range);
+			kmem_free(range, sizeof(dbuf_dirty_range_t));
+			continue;
+		}
+
+		/*
+		 * Update the last range that could be affected by
+		 * this truncation. Its size changes only if it
+		 * extends past the end of the buffer's new size.
+		 */
+		range->end = MIN(new_size, range->end);
+		range->size = range->end - range->start;
+		break;
+	}
+}
+
 void
 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 {
-	arc_buf_t *buf, *obuf;
+	arc_buf_t *buf, *old_buf;
 	int osize = db->db.db_size;
 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 	dnode_t *dn;
@@ -1015,26 +1710,30 @@
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
 	 */
+	/* XXX this needs to be made nonblocking */
 	dbuf_will_dirty(db, tx);
 
 	/* create the data buffer for the new block */
 	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
 
 	/* copy old block data to the new block */
-	obuf = db->db_buf;
-	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
+	old_buf = db->db_buf;
+	bcopy(old_buf->b_data, buf->b_data, MIN(osize, size));
 	/* zero the remainder */
 	if (size > osize)
 		bzero((uint8_t *)buf->b_data + osize, size - osize);
 
 	mutex_enter(&db->db_mtx);
 	dbuf_set_data(db, buf);
-	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
+	VERIFY(arc_buf_remove_ref(old_buf, db) == 1);
 	db->db.db_size = size;
 
 	if (db->db_level == 0) {
-		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
-		db->db_last_dirty->dt.dl.dr_data = buf;
+		dbuf_dirty_record_t *dr;
+
+		dr = list_head(&db->db_dirty_records);
+		ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+		dr->dt.dl.dr_data = buf;
 	}
 	mutex_exit(&db->db_mtx);
@@ -1063,109 +1762,45 @@
 	    db->db_blkptr, os->os_spa, &zb);
 }
 
-/**
- * \brief Mark a dbuf as dirty.
+/*
+ * State of the current dirtying process. Dirtying requires keeping a lot
+ * of state available, so using a struct to access it keeps the code sane.
  */
-dbuf_dirty_record_t *
-dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
-	dnode_t *dn;
-	objset_t *os;
-	dbuf_dirty_record_t **drp, *dr;
-	int drop_struct_lock = FALSE;
-	boolean_t do_free_accounting = B_FALSE;
-	int txgoff = tx->tx_txg & TXG_MASK;
+typedef struct dbuf_dirty_state {
+	dmu_buf_impl_t *db;		/**< Dbuf being dirtied. */
+	dmu_tx_t *tx;			/**< Transaction to dirty. */
+	dnode_t *dn;			/**< The dbuf's dnode. */
+	dbuf_dirty_record_t *insert_pt;	/**< DR to insert new DR after.
*/ + dbuf_dirty_record_t *txg_dr; /**< Dirty record for this txg. */ + boolean_t txg_already_dirty; /**< This txg already dirty? */ + boolean_t do_free_accounting; /**< Free accounting needed? */ + list_t evict_list; /**< Dbuf user eviction list. */ - /* Ensure that this dbuf has no transaction groups or holds */ - ASSERT(tx->tx_txg != 0); - ASSERT(!refcount_is_zero(&db->db_holds)); - DMU_TX_DIRTY_BUF(tx, db); + /* The below only apply to leaf blocks. */ + arc_buf_t *fill_buf; /**< Already-filled optional buffer. */ + int offset; /**< Offset of the upcoming write. */ + int size; /**< Size of the upcoming write. */ +} dbuf_dirty_state_t; - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - /* - * Shouldn't dirty a regular buffer in syncing context. Private - * objects may be dirtied in syncing context, but only if they - * were already pre-dirtied in open context. - */ - ASSERT(!dmu_tx_is_syncing(tx) || - BP_IS_HOLE(dn->dn_objset->os_rootbp) || - DMU_OBJECT_IS_SPECIAL(dn->dn_object) || - dn->dn_objset->os_dsl_dataset == NULL); - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); +static void +dbuf_new_dirty_record_accounting(dbuf_dirty_state_t *dds) +{ + dnode_t *dn = dds->dn; + dmu_tx_t *tx = dds->tx; + dmu_buf_impl_t *db = dds->db; + objset_t *os = dn->dn_objset; - mutex_enter(&db->db_mtx); /* - * XXX make this true for indirects too? The problem is that - * transactions created with dmu_tx_create_assigned() from - * syncing context don't bother holding ahead. + * Only valid if not already dirty in this transaction group. */ - ASSERT(db->db_level != 0 || - db->db_state == DB_CACHED || db->db_state == DB_FILL || - db->db_state == DB_NOFILL); + DNODE_VERIFY_DIRTYCTX(dn, tx); - mutex_enter(&dn->dn_mtx); - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. - */ - if (dn->dn_dirtyctx == DN_UNDIRTIED && - !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - dn->dn_dirtyctx = - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); - ASSERT(dn->dn_dirtyctx_firstset == NULL); - dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); - } - mutex_exit(&dn->dn_mtx); - - if (db->db_blkid == DMU_SPILL_BLKID) - dn->dn_have_spill = B_TRUE; - - /* - * If this buffer is already dirty, we're done. - */ - drp = &db->db_last_dirty; - ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || - db->db.db_object == DMU_META_DNODE_OBJECT); - while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) - drp = &dr->dr_next; - if (dr && dr->dr_txg == tx->tx_txg) { - DB_DNODE_EXIT(db); - - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this buffer has already been written out, - * we now need to reset its state. - */ - dbuf_unoverride(dr); - if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL) - arc_buf_thaw(db->db_buf); - } - mutex_exit(&db->db_mtx); - return (dr); - } - - /* - * Only valid if not already dirty. - */ - ASSERT(dn->dn_object == 0 || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - ASSERT3U(dn->dn_nlevels, >, db->db_level); ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || - dn->dn_next_nlevels[txgoff] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); + DN_NEXT_LEVEL(dn, tx->tx_txg) > db->db_level || + DN_NEXT_LEVEL(dn, tx->tx_txg - 1) > db->db_level || + DN_NEXT_LEVEL(dn, tx->tx_txg - 2) > db->db_level); /* * We should only be dirtying in syncing context if it's the @@ -1174,8 +1809,8 @@ * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ - os = dn->dn_objset; - ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + ASSERT(!dmu_tx_is_syncing(tx) || + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); @@ -1190,71 +1825,316 @@ * also holding the db_mtx. */ dnode_willuse_space(dn, db->db.db_size, tx); - do_free_accounting = dbuf_block_freeable(db); + if (db->db_blkid != DMU_SPILL_BLKID) + dds->do_free_accounting = dbuf_block_freeable(db); } +} + +static dbuf_dirty_record_t * +dbuf_dirty_record_create(dbuf_dirty_state_t *dds) +{ + dbuf_dirty_record_t *dr; + + ASSERT(MUTEX_HELD(&dds->db->db_mtx)); + ASSERT(DB_DNODE_HELD(dds->db)); + dr = list_head(&dds->db->db_dirty_records); + ASSERT(dr == NULL || dr->dr_txg != dds->tx->tx_txg); + + dbuf_new_dirty_record_accounting(dds); + + ASSERT(dds->txg_dr == NULL); + dr = kmem_zalloc(sizeof(dbuf_dirty_record_t), KM_SLEEP); + dr->dr_dbuf = dds->db; + dr->dr_txg = dds->tx->tx_txg; + dds->txg_dr = dr; + + return (dr); +} +static void +dbuf_dirty_record_register(dbuf_dirty_state_t *dds) +{ + + ASSERT(dds->txg_dr != NULL); + list_insert_after(&dds->db->db_dirty_records, dds->insert_pt, + dds->txg_dr); + + /* This buffer is now part of this txg */ + dbuf_add_ref(dds->db, (void *)(uintptr_t)dds->tx->tx_txg); + dds->db->db_dirtycnt += 1; + ASSERT3U(dds->db->db_dirtycnt, <=, TXG_CONCURRENT_STATES); +} + +static void +dbuf_dirty_record_create_indirect(dbuf_dirty_state_t *dds) +{ + dbuf_dirty_record_t *dr; + + dr = dbuf_dirty_record_create(dds); + mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&dr->dt.di.dr_children, + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + dbuf_dirty_record_register(dds); +} + +static void +dbuf_dirty_record_update_leaf(dbuf_dirty_state_t *dds) +{ + if (dds->db->db_blkid == DMU_BONUS_BLKID) + dds->txg_dr->dt.dl.dr_data = dds->db->db.db_data; + else + dds->txg_dr->dt.dl.dr_data = dds->db->db_buf; +} + +static void +dbuf_dirty_record_register_as_leaf(dbuf_dirty_state_t *dds) +{ + dbuf_dirty_record_t *dr = dds->txg_dr; + dmu_buf_impl_t *db = dds->db; + + dbuf_dirty_record_update_leaf(dds); + dprintf_dbuf(db, "%s: dr_data=%p\n", __func__, dr->dt.dl.dr_data); + list_create(&dr->dt.dl.write_ranges, sizeof(dbuf_dirty_range_t), + offsetof(dbuf_dirty_range_t, write_range_link)); + dbuf_dirty_record_register(dds); +} + +static void +dbuf_dirty_record_create_nofill(dbuf_dirty_state_t *dds) +{ + dbuf_dirty_record_t *dr; + + (void) dbuf_dirty_record_create(dds); + dbuf_dirty_record_register_as_leaf(dds); +} + +void +dbuf_dirty_verify(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ +#ifdef ZFS_DEBUG + dnode_t *dn = DB_DNODE(db); + dbuf_dirty_record_t *dr; + + /* Ensure that this dbuf has a transaction group and a hold */ + ASSERT(tx->tx_txg != 
0); + ASSERT(!refcount_is_zero(&db->db_holds)); + DMU_TX_VERIFY_DIRTY_BUF(tx, db); + + dr = list_head(&db->db_dirty_records); + ASSERT(dr == NULL || dr->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + /* - * If this buffer is dirty in an old transaction group we need - * to make a copy of it so that the changes we make in this - * transaction group won't leak out when we sync the older txg. + * Shouldn't dirty a regular buffer in syncing context. Private + * objects may be dirtied in syncing context, but only if they + * were already pre-dirtied in open context. */ - dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); - if (db->db_level == 0) { - void *data_old = db->db_buf; + ASSERT(!dmu_tx_is_syncing(tx) || + BP_IS_HOLE(dn->dn_objset->os_rootbp) || + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + dn->dn_objset->os_dsl_dataset == NULL); + + DNODE_VERIFY_DIRTYCTX(dn, tx); +#endif +} + +/** + * \brief Enter a dbuf-dirtying function. + * + * \note This function should only be called once in a dbuf-dirtying function. + * + * This function's primary purpose is to compute state that only needs to be + * computed once per dirty call. Call dbuf_dirty_compute_state if the + * function drops the mutex, for things that require re-computing. + */ +static void +dbuf_dirty_enter(dbuf_dirty_state_t *dds, dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; + + memset(dds, 0, sizeof(*dds)); + dds->db = db; + dds->tx = tx; + + dmu_buf_create_user_evict_list(&dds->evict_list); + DB_DNODE_ENTER(db); + dds->dn = DB_DNODE(db); + + mutex_enter(&db->db_mtx); +} + +/** + * \brief Compute the current dbuf dirty state. + * + * \note See dbuf_dirty for more information. + * \note The dbuf mutex must be held before this function is called, and + * afterwards, must not be dropped except by dbuf_dirty_exit(). + * If this is not possible, the intention was to allow a dbuf_dirty + * function to re-invoke this function after an action that might drop + * the mutex, and before continuing. Additional work may be needed. + */ +static void +dbuf_dirty_compute_state(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + dmu_tx_t *tx = dds->tx; + dbuf_dirty_record_t *dr, *newest; - if (db->db_state != DB_NOFILL) { - if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db.db_data; - } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { - /* - * Release the data buffer from the cache so - * that we can modify it without impacting - * possible other users of this cached data - * block. Note that indirect blocks and - * private objects are not released until the - * syncing state (since they are only modified - * then). - */ - arc_release(db->db_buf, db); - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db_buf; - } - ASSERT(data_old != NULL); - } - dr->dt.dl.dr_data = data_old; - } else { - mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&dr->dt.di.dr_children, - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); + /* Only one filler allowed at a time. */ + while (db->db_state & DB_FILL) { + ASSERT(db->db_level == 0); + cv_wait(&db->db_changed, &db->db_mtx); } - dr->dr_dbuf = db; - dr->dr_txg = tx->tx_txg; - dr->dr_next = *drp; - *drp = dr; + + dbuf_dirty_verify(db, tx); + if (db->db_blkid == DMU_SPILL_BLKID) + dds->dn->dn_have_spill = B_TRUE; + dnode_set_dirtyctx(dds->dn, tx, db); + + newest = list_head(&db->db_dirty_records); + + /* Only the mdn object may dirty an older txg. 
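+	 * (The meta-dnode may be dirtied in syncing context, so a dirty
+	 * call against it can target a txg older than its newest record.)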
*/ + ASSERT(newest == NULL || newest->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + + dds->insert_pt = NULL; /* Insert at head. */ + for (dr = newest; dr != NULL && dr->dr_txg > tx->tx_txg; + dr = list_next(&db->db_dirty_records, dr)) + dds->insert_pt = dr; + + if (dr != NULL && dr->dr_txg == tx->tx_txg) + dds->txg_dr = dr; /* - * We could have been freed_in_flight between the dbuf_noread - * and dbuf_dirty. We win, as though the dbuf_noread() had - * happened after the free. + * Cache whether this TX already has a dirty record, so that upon exit, + * additional work can be done after dropping the dbuf mutex. This + * information is useful elsewhere, too. */ - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - db->db_blkid != DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, 1, tx); - mutex_exit(&dn->dn_mtx); - db->db_freed_in_flight = FALSE; + dds->txg_already_dirty = (dds->txg_dr != NULL); +} + +static void dbuf_dirty_parent(dbuf_dirty_state_t *dds); + +/** + * \brief Exit a dbuf-dirtying function. See dbuf_dirty. + * + * \note This function should only be called once in a dbuf-dirtying function. + * + * This function's primary purpose is to verify a consistent state upon + * completing a dirty operation, then drop the mutex and dirty parent dbufs. + * It is also a good time to update free accounting. + */ +static void +dbuf_dirty_exit(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + void *front = (db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : + db->db_buf; + + ASSERT(db->db_level != 0 || dds->txg_dr->dt.dl.dr_data == front); + ASSERT(dds->txg_dr->dr_txg == dds->tx->tx_txg); + + mutex_exit(&db->db_mtx); + dmu_buf_destroy_user_evict_list(&dds->evict_list); + + if (!dds->txg_already_dirty) { + if (dds->do_free_accounting) { + /* NB: This only applies to non-SPILL/BONUS blocks. */ + blkptr_t *bp = db->db_blkptr; + objset_t *os = dds->dn->dn_objset; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? + bp_get_dsize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. + */ + ddt_prefetch(os->os_spa, bp); + dnode_willuse_space(dds->dn, -willfree, dds->tx); + } + dbuf_dirty_parent(dds); } - /* - * This buffer is now part of this txg - */ - dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); - db->db_dirtycnt += 1; - ASSERT3U(db->db_dirtycnt, <=, 3); + DB_DNODE_EXIT(db); +} + +/** + * \brief Dirty a nofill buffer. See dbuf_dirty. + * + * NOFILL buffers are similar to regular leaf buffers only in the sense that + * they create dirty records that contain ARC buffers in each txg. They + * don't need any frontend manipulation. + */ +dbuf_dirty_record_t * +dbuf_dirty_nofill(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_state_t dds; + + ASSERT(db->db_level == 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_state & (DB_UNCACHED|DB_NOFILL|DB_CACHED)); + + dbuf_dirty_enter(&dds, db, tx); + DBUF_STATE_CHANGE(db, =, DB_NOFILL, "allocating NOFILL buffer"); + dbuf_clear_data(db, &dds.evict_list); + dbuf_dirty_compute_state(&dds); + + if (dds.txg_already_dirty) + /* + * Reset immediate write sync state if needed. + * XXX: Is this really needed for NOFILL buffers? 
+ */ + dbuf_unoverride(dds.txg_dr); + else + dbuf_dirty_record_create_nofill(&dds); + + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} + +/** + * \brief Dirty an indirect block. See dbuf_dirty. + * + * Indirect blocks are always completely rewritten, so they don't need any + * complex frontend manipulation. + */ +static dbuf_dirty_record_t * +dbuf_dirty_indirect(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_state_t dds; + + dbuf_dirty_enter(&dds, db, tx); + dbuf_dirty_compute_state(&dds); + + if (!dds.txg_already_dirty) + dbuf_dirty_record_create_indirect(&dds); + + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} + +/** + * \brief Dirty the dbuf's parent. + * + * \param dds Dbuf dirty state. + * + * \note If the dnode's struct_rwlock is not held, it will be grabbed and + * dropped within this function. + */ +static void +dbuf_dirty_parent(dbuf_dirty_state_t *dds) +{ + dnode_t *dn = dds->dn; + dmu_buf_impl_t *db = dds->db; + dmu_tx_t *tx = dds->tx; + dbuf_dirty_record_t *dr = dds->txg_dr; - mutex_exit(&db->db_mtx); + int drop_struct_lock = FALSE; + int txgoff = tx->tx_txg & TXG_MASK; if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { @@ -1263,22 +2143,7 @@ list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); - DB_DNODE_EXIT(db); - return (dr); - } else if (do_free_accounting) { - blkptr_t *bp = db->db_blkptr; - int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? - bp_get_dsize(os->os_spa, bp) : db->db.db_size; - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - ddt_prefetch(os->os_spa, bp); - dnode_willuse_space(dn, -willfree, tx); + return; } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { @@ -1292,10 +2157,12 @@ } if (db->db_level+1 < dn->dn_nlevels) { + /* The dbuf's parent is an indirect block */ dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; + /* Get a hold on the parent before dropping struct_rwlock */ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -1306,27 +2173,40 @@ } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); + ASSERT3U(db->db_level+1, ==, parent->db_level); - di = dbuf_dirty(parent, tx); + di = dbuf_dirty_indirect(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); + /* + * Update the dirty record to add this dbuf to its parent's + * dirty record's list of dirty children. The indirect + * mutex could be conditionally acquired, but doing so is + * unlikely to save any effort in most cases. Acquiring it + * unconditionally keeps this path clean of apparent LORs. 
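+	 * (The order taken below is the parent's dr_mtx first, then this
+	 * dbuf's db_mtx.)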
+ */ + mutex_enter(&di->dt.di.dr_mtx); mutex_enter(&db->db_mtx); /* possible race with dbuf_undirty() */ - if (db->db_last_dirty == dr || + if (list_head(&db->db_dirty_records) == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { - mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); - mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); + mutex_exit(&di->dt.di.dr_mtx); } else { + /* The dbuf's parent is the dnode */ ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); + /* + * Update the dnode's list of dirty records to include this + * dbuf's dirty record. + */ mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); @@ -1336,37 +2216,642 @@ } dnode_setdirty(dn, tx); +} + +static void +dbuf_dirty_record_check_ranges(dbuf_dirty_record_t *dr) +{ +#ifdef ZFS_DEBUG + dbuf_dirty_leaf_record_t *dl; + dbuf_dirty_range_t *prev, *cur, *next; + + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + return; + + dl = &dr->dt.dl; + + prev = next = NULL; + for (cur = list_head(&dl->write_ranges); cur != NULL; + prev = cur, cur = next) { + next = list_next(&dl->write_ranges, cur); + ASSERT(prev == NULL || cur->start > prev->end); + ASSERT(next == NULL || cur->end < next->start); + } +#endif +} + +/** + * \brief Record a write range for the associated dirty record. + * + * \param dr The dirty record to record the write range for. + * \param offset The offset of the new write range. + * \param size The size of the new write range. + */ +static void +dbuf_dirty_record_add_range(dbuf_dirty_record_t *dr, int offset, int size) +{ + dbuf_dirty_range_t *next_range, *old_range, *range; + dbuf_dirty_leaf_record_t *dl; + dmu_buf_impl_t *db; + + dl = &dr->dt.dl; + db = dr->dr_dbuf; + + /* Write ranges do not apply to indirect blocks. */ + ASSERT(db->db_level == 0); + ASSERT(MUTEX_HELD(&db->db_mtx)); + + /* Optimization: clear the ranges if the incoming range fills. */ + if (offset == 0 && size == db->db.db_size) { + dbuf_dirty_record_cleanup_ranges(dr); + goto out; + } + + range = kmem_zalloc(sizeof(dbuf_dirty_range_t), KM_SLEEP); + range->start = offset; + range->size = size; + range->end = offset + size; + + /* + * This loop acts as an accumulator, merging dirty ranges if they + * overlap or are adjacent, and in so doing leaving behind only one + * range. But if the new range must be inserted separately, it will + * do so using the old range as a marker. + */ + for (old_range = list_head(&dl->write_ranges); + old_range != NULL && old_range->start <= range->end; + old_range = next_range) { + next_range = list_next(&dl->write_ranges, old_range); + if (range->start <= old_range->end && + range->end >= old_range->start) { + old_range->start = MIN(range->start, old_range->start); + old_range->end = MAX(range->end, old_range->end); + old_range->size = old_range->end - old_range->start; + list_remove(&dl->write_ranges, old_range); + DEBUG_REFCOUNT_DEC(dirty_ranges_in_flight); + kmem_free(range, sizeof(dbuf_dirty_range_t)); + range = old_range; + } + } + + /* If the writer will finish filling, go directly to DB_FILL. */ + if (range->start == 0 && range->size == db->db.db_size) { + kmem_free(range, sizeof(dbuf_dirty_range_t)); + } else { + /* If old_range is NULL, this does a list_insert_tail(). 
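+		 * old_range is NULL when no existing range starts beyond the
+		 * new range's end, so the new range belongs at the tail.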
*/ + list_insert_before(&dl->write_ranges, old_range, range); + DEBUG_REFCOUNT_INC(dirty_ranges_in_flight); + DEBUG_COUNTER_INC(dirty_ranges_total); + } + + dbuf_dirty_record_check_ranges(dr); + +out: + if (dr->dr_dbuf->db_state & (DB_READ|DB_PARTIAL)) + if (list_is_empty(&dr->dt.dl.write_ranges)) + DBUF_STATE_CHANGE(db, =, DB_FILL, "complete filler"); +} + +static void +dbuf_dirty_set_data(dbuf_dirty_state_t *dds) +{ + arc_buf_t *buf = dds->fill_buf; + if (buf == NULL) + buf = dbuf_alloc_arcbuf(dds->db); + dbuf_set_data(dds->db, buf); +} + +static void +dbuf_dirty_leaf_with_existing_frontend(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + int size = db->db.db_size; + dbuf_dirty_record_t *newest = list_head(&db->db_dirty_records); + boolean_t old_txg_is_frontend = !dds->txg_already_dirty && + newest != NULL && newest->dt.dl.dr_data == db->db_buf; + arc_buf_t *fill_buf = dds->fill_buf; + + ASSERT(fill_buf == NULL || fill_buf != db->db_buf); + ASSERT(refcount_count(&db->db_holds) > db->db_dirtycnt); + + /* Reset any immediate write that has occurred. */ + if (dds->txg_already_dirty) + dbuf_unoverride(dds->txg_dr); + + /* If the old txg's record owns the frontend, give it its own copy. */ + if (old_txg_is_frontend) { + if (newest == db->db_data_pending) { + /* + * The syncer or holder normally disassociate. But if + * the syncer is performing a deferred resolve, then + * it will not disassociate until the resolve + * completes. Since the syncer has already + * scheduled its write with its buffer, we must + * disassociate by replacing the frontend. + */ + ASSERT(db->db_state & (DB_READ|DB_PARTIAL)); + ASSERT(db->db_dirtycnt == 1); + dbuf_dirty_set_data(dds); + } else { + newest->dt.dl.dr_data = dbuf_alloc_arcbuf(db); + bcopy(db->db.db_data, newest->dt.dl.dr_data->b_data, + size); + arc_release(db->db_buf, db); + if (fill_buf) { + bcopy(fill_buf->b_data, db->db.db_data, size); + ASSERT(arc_released(fill_buf)); + VERIFY(arc_buf_remove_ref(fill_buf, db) == 1); + } + } + return; + } + + /* We have a filled buffer and already own the current frontend. */ + if (fill_buf) { + arc_release(db->db_buf, db); + bcopy(fill_buf->b_data, db->db.db_data, size); + ASSERT(arc_released(fill_buf)); + VERIFY(arc_buf_remove_ref(fill_buf, db) == 1); + return; + } + + /* Frontend not owned by anybody. Notify that it will be modified. */ + ASSERT(newest == NULL || fill_buf == NULL); + if (dds->txg_already_dirty) { + /* Already released on initial dirty, so just thaw. */ + ASSERT(arc_released(db->db_buf)); + arc_buf_thaw(db->db_buf); + } else + arc_release(db->db_buf, db); +} + +static void +dbuf_dirty_record_create_leaf(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + dbuf_dirty_record_t *dr; + + dr = dbuf_dirty_record_create(dds); + + /* + * If this block was marked to be freed in this txg, revert that + * change. Note that db_freed_in_flight may have already been + * processed, so it can't be checked here. 
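+	 * (The revert is performed by the dnode_clear_range() call below.)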
+ */ + if (db->db_blkid != DMU_SPILL_BLKID) { + mutex_enter(&dds->dn->dn_mtx); + dnode_clear_range(dds->dn, db->db_blkid, /*nblks*/1, dds->tx); + mutex_exit(&dds->dn->dn_mtx); + db->db_freed_in_flight = FALSE; + } + dbuf_dirty_record_register_as_leaf(dds); +} + +static void +dbuf_dirty_leaf_common(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + + if (db->db_buf == NULL) + dbuf_dirty_set_data(dds); + else + dbuf_dirty_leaf_with_existing_frontend(dds); + ASSERT(arc_released(db->db_buf) && !arc_buf_frozen(db->db_buf)); + + if (!dds->txg_already_dirty) + dbuf_dirty_record_create_leaf(dds); + else + dbuf_dirty_record_update_leaf(dds); + + if (db->db_state != DB_CACHED) + dbuf_dirty_record_add_range(dds->txg_dr, dds->offset, + dds->size); +} + +dbuf_dirty_record_t * +dbuf_dirty_record_create_bonus(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + dbuf_dirty_record_t *newest = list_head(&db->db_dirty_records); + boolean_t last_txg_is_frontend = newest != NULL && + newest->dt.dl.dr_data == db->db.db_data; + dbuf_dirty_record_t *dr; + + if (last_txg_is_frontend) { + newest->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + bcopy(db->db.db_data, newest->dt.dl.dr_data, DN_MAX_BONUSLEN); + } + dr = dbuf_dirty_record_create(dds); + dbuf_dirty_record_register_as_leaf(dds); + return (dr); +} + +/** + * \brief Dirty a dbuf belonging to a meta-dnode. See dbuf_dirty. + * + * Dbufs belonging to the meta-dnode object are allowed to dirty in older + * transaction groups. Additionally, they will always be overwritten in + * each transaction group, which means no complex frontend manipulation. + * simplifies the logic considerably compared to normal leaf objects. + */ +dbuf_dirty_record_t * +dbuf_dirty_mdn_object(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_state_t dds; + + ASSERT(db->db_level == 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + dbuf_dirty_enter(&dds, db, tx); + dbuf_dirty_compute_state(&dds); + + if (db->db_buf == NULL) + dbuf_set_data(db, dbuf_alloc_arcbuf(db)); + + if (dds.txg_already_dirty) + dbuf_unoverride(dds.txg_dr); + else + (void) dbuf_dirty_record_create_leaf(&dds); + + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} + +/** + * \brief Dirty a bonus dbuf. See dbuf_dirty. + * + * Bonus buffers are special in the sense that they do not use ARC buffers, + * but instead occupy space inside the dnode physical block. The dbuf + * layer's primary role is to provide a transactional mechanism for updating + * this special dnode section. Underlying bonus blocks therefore always use + * special zio buffers, and never share information between transactions. + */ +dbuf_dirty_record_t * +dbuf_dirty_bonus(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_state_t dds; + + ASSERT(db->db_blkid == DMU_BONUS_BLKID); + /* Can't dirty a bonus buffer without first reading it. */ + ASSERT(db->db_state == DB_CACHED); + dbuf_dirty_enter(&dds, db, tx); + dbuf_dirty_compute_state(&dds); + + if (!dds.txg_already_dirty) + (void) dbuf_dirty_record_create_bonus(&dds); + + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} + +/** + * \brief Handle potential Copy-On-Write (COW) faults. + * + * This function's primary purpose is to optimize dirtying behavior that are + * likely to involve COW faults. 
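+ *
+ * In outline (see the body below): a PARTIAL dbuf whose newest dirty
+ * record belongs to a txg that has already closed starts its resolving
+ * read immediately; an UNCACHED dbuf written strictly in its interior
+ * does the same, since later writes are unlikely to complete the fill;
+ * and any other partial write to an UNCACHED dbuf first checks the ARC
+ * for a cached copy so the write-range bookkeeping can be skipped.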
+ */ +static void +dbuf_dirty_handle_fault(dbuf_dirty_state_t *dds) +{ + dmu_buf_impl_t *db = dds->db; + + ASSERT(db->db_level == 0); + if (db->db_state & DB_PARTIAL) { + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr->dr_txg != dds->tx->tx_txg) { + /* + * The newest dirty record's transaction group has + * closed. Since COW fault resolution can't be + * avoided, there is no benefit to waiting until the + * dirty record reaches the syncer. Start + * asynchronous fault resolution now. + */ + dbuf_transition_to_read(db); + } + } else if (db->db_state == DB_UNCACHED) { + int write_end = dds->offset + dds->size; + + if (dds->offset != 0 && write_end != db->db.db_size) { + /* + * Immediately start resolving a COW fault if we start + * writing inside the block rather than either at the + * beginning (forward) or end (backward). Future + * writes are unlikely to fill this dbuf. + */ + dbuf_transition_to_read(db); + } else if (dds->size != db->db.db_size) { + /* + * If this dirty won't fill the buffer, see if a + * previous version is in the ARC. This skips the + * partial buffer bookkeeping that would otherwise + * be necessary. + */ + dbuf_read_cached(db, dds->dn); + } + } +} + +/** + * \brief Common dbuf_dirty_enter() replacement for leaf blocks. + */ +void +dbuf_dirty_leaf_enter(dbuf_dirty_state_t *dds, + dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size) +{ + + dbuf_dirty_enter(dds, db, tx); + dds->offset = offset; + dds->size = size; + /* + * Handle COW faults prior to computing the dirty state, since + * transitioning to read drops the lock. + */ + dbuf_dirty_handle_fault(dds); + dbuf_dirty_compute_state(dds); +} + +/** + * \brief Dirty a regular leaf block. See dbuf_dirty. + * + * This function handles dirtying all user data blocks. + */ +dbuf_dirty_record_t * +dbuf_dirty_leaf(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size) +{ + dbuf_dirty_state_t dds; + + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_level == 0); + + dbuf_dirty_leaf_enter(&dds, db, tx, offset, size); + + if (db->db_state == DB_UNCACHED) + DBUF_STATE_CHANGE(db, =, (DB_PARTIAL|DB_FILL), + "notifying of initial partial fill"); + else if (db->db_state & (DB_READ|DB_PARTIAL)) + DBUF_STATE_CHANGE(db, |=, DB_FILL, + "notifying of followup partial fill"); + dbuf_dirty_leaf_common(&dds); + + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} + +/** + * \brief Dirty a regular leaf block with a filled ARC buffer. See dbuf_dirty. + * + * This function is identical to dbuf_dirty_leaf, except that it doesn't + * have to handle partial fills, since it is always provided an already + * filled buffer that is the write data for the transaction. + */ +dbuf_dirty_record_t * +dbuf_dirty_with_arcbuf(dmu_buf_impl_t *db, dmu_tx_t *tx, arc_buf_t *fill_buf) +{ + dbuf_dirty_state_t dds; + + ASSERT(db->db_level == 0); + + dbuf_dirty_leaf_enter(&dds, db, tx, 0, db->db.db_size); + dds.fill_buf = fill_buf; + + if (db->db_state != DB_CACHED) + DBUF_STATE_CHANGE(db, =, DB_FILL, "assigning filled buffer"); + dbuf_dirty_leaf_common(&dds); + + dbuf_dirty_exit(&dds); + return (dds.txg_dr); +} + +/** + * \brief Dirty a DMU buffer. + * + * \param db Dbuf to dirty. + * \param tx Transaction to dirty the dbuf in. + * + * This function is merely a dispatcher. Different types of dbufs require + * different actions in different scenarios. However, each dbuf_dirty + * implementing function should follow the same basic order: + * + * 1. 
dbuf_dirty_enter (grab the dbuf mutex) + 2. Do any pre-dirty optimizations or fixups needed. + * *** Beyond this point, the dbuf mutex must always be held. *** + 3. dbuf_dirty_compute_state (compute the basic dbuf_dirty state) + 4. Change the dbuf state as applicable + 5. Make the frontend (db->db_buf) usable by the dirty record for this txg. + 6. Create or update this txg's dirty record, if needed. + 7. dbuf_dirty_exit, which triggers dirtying parent dbufs if this dbuf was + not already dirty in this txg. + + \note The point of having separate functions is to reduce the difficulty + of understanding what happens to each type of dbuf in a dirty. + */ +dbuf_dirty_record_t * +dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + if (db->db_blkid == DMU_BONUS_BLKID) { + return (dbuf_dirty_bonus(db, tx)); + } else if (db->db_level == 0) { + if (db->db.db_object == DMU_META_DNODE_OBJECT) + return (dbuf_dirty_mdn_object(db, tx)); + else + return (dbuf_dirty_leaf(db, tx, 0, db->db.db_size)); + } else { + return (dbuf_dirty_indirect(db, tx)); + } +} + +/** + * \brief Cleanup a dirty record's write ranges as necessary. + * + * XXX + * This should be replaced with a larger dbuf_dirty_record_destroy() that + * cleans up an entire dirty record. + */ +void +dbuf_dirty_record_cleanup_ranges(dbuf_dirty_record_t *dr) +{ + dbuf_dirty_leaf_record_t *dl; + dbuf_dirty_range_t *range; + + /* Write ranges do not apply to indirect blocks */ + if (dr->dr_dbuf->db_level != 0) + return; + + /* Remove any write range entries left behind. */ + dl = &dr->dt.dl; + while ((range = list_remove_head(&dl->write_ranges)) != NULL) { + kmem_free(range, sizeof(dbuf_dirty_range_t)); + DEBUG_REFCOUNT_DEC(dirty_ranges_in_flight); + } +} + +/* XXX refactor dbuf_undirty_*() into dbuf_undirty(). */ +static void +dbuf_undirty_bonus(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + + if (dr->dt.dl.dr_data != db->db.db_data) { + zio_buf_free(dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + } + db->db_data_pending = NULL; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); + list_remove(&db->db_dirty_records, dr); + kmem_free(dr, sizeof(dbuf_dirty_record_t)); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; +} + +static void +dbuf_undirty_leaf(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + if (db->db_state == DB_NOFILL) + return; + + if (dr->dt.dl.dr_data != db->db_buf) { + /* + * What we wrote is already out of date, so + * just free the ARC buffer. + */ + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); + } else if (!arc_released(db->db_buf)) { + /* + * Our dbuf hasn't already been evicted, so + * register a callback to clean it up once + * its ARC buffer is released. + */ + arc_set_callback(db->db_buf, dbuf_do_evict, db); + } +} + +static void +dbuf_undirty_indirect(dbuf_dirty_record_t *dr) +{ + dnode_t *dn; + dmu_buf_impl_t *db = dr->dr_dbuf; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + /* + * The size of an indirect block must match what its + * associated dnode thinks it should be. + */ + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + /* + * If the dbuf's block pointer is not a hole, evict it when + * its last ARC buffer hold has been released.
+ */ + if (!BP_IS_HOLE(db->db_blkptr)) { + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); + ASSERT3U(dn->dn_phys->dn_maxblkid >> (db->db_level * epbs), >=, + db->db_blkid); + arc_set_callback(db->db_buf, dbuf_do_evict, db); + } DB_DNODE_EXIT(db); - return (dr); + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); +} + +static void +dbuf_undirty_write(dbuf_dirty_record_t *dr, uint64_t txg) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + + ASSERT(!list_link_active(&dr->dr_dirty_node)); + ASSERT(dr->dr_txg == txg); + /* There should be no older dirty records. */ + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); + list_remove(&db->db_dirty_records, dr); + +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + DB_DNODE_EXIT(db); + } +#endif + + /* Clean up the dirty record. */ + if (db->db_level == 0) { + dbuf_undirty_leaf(dr); + dbuf_dirty_record_cleanup_ranges(dr); + list_destroy(&dr->dt.dl.write_ranges); + } else { + dbuf_undirty_indirect(dr); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + cv_broadcast(&db->db_changed); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + db->db_data_pending = NULL; } /** - * \brief Undirty a buffer, clearing dirty records. + * \brief Undirty a buffer in the transaction group referenced by + * the given transaction. + * + * XXX The extra refcount of doing a resolving read confuses some + * of the hold accounting. Do we do the wrong thing in this + * case? + * + * XXX Need to update comments to reflect the dbuf_dirty() refactoring. */ static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr, **drp; + dbuf_dirty_record_t *dr; + list_t evict_list; ASSERT(txg != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); + dmu_buf_create_user_evict_list(&evict_list); + mutex_enter(&db->db_mtx); /* - * If this buffer is not dirty, we're done. + * If this buffer is not dirty in this transaction + * group, we're done. */ - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + for (dr = list_head(&db->db_dirty_records); dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { if (dr->dr_txg <= txg) break; + } if (dr == NULL || dr->dr_txg < txg) { + dmu_buf_destroy_user_evict_list(&evict_list); mutex_exit(&db->db_mtx); return (0); } ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); + /* + * XXX Wait for the buffer to be resolved. With additional accounting + * we should be able to undirty immediately and disassociate the + * read from this dbuf before it completes. + * + * XXX This wait should not be necessary, but ZFS deadlocks without it. 
+ */ + while (db->db_state & (DB_READ|DB_FILL)) + cv_wait(&db->db_changed, &db->db_mtx); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -1394,9 +2879,9 @@ ASSERT(db->db.db_size != 0); - /* XXX would be nice to fix up dn_towrite_space[] */ + /* XXX would be nice to fix up *_space_towrite[] */ - *drp = dr->dr_next; + list_remove(&db->db_dirty_records, dr); /* * Note that there are three places in dbuf_dirty() @@ -1433,6 +2918,9 @@ mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } + dbuf_dirty_record_cleanup_ranges(dr); + if (db->db_level == 0) + list_destroy(&dr->dt.dl.write_ranges); kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); @@ -1441,14 +2929,20 @@ if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { arc_buf_t *buf = db->db_buf; + tmpprintf("%s db %p clearing\n", __func__, db); ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_set_data(db, NULL); + dbuf_clear_data(db, &evict_list); VERIFY(arc_buf_remove_ref(buf, db) == 1); - dbuf_evict(db); + dbuf_evict(db, &evict_list); + dmu_buf_destroy_user_evict_list(&evict_list); return (1); } + tmpprintf("%s db %p undirtied\n", __func__, db); mutex_exit(&db->db_mtx); + + dmu_buf_destroy_user_evict_list(&evict_list); + return (0); } @@ -1473,17 +2967,91 @@ rf |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); + /* Already CACHED or UNCACHED at this point */ (void) dbuf_dirty(db, tx); } +/** + * \brief Issue an async read that will eventually transition a dbuf + * into the CACHED state. + * + * \param db Dbuf to transition + * + * \invariant The dbuf's mutex must be held. + * + * Upon return, the dbuf will either be in the READ (async READ + * pending), or CACHED (read satisfied by a cache hit or zero fill for + * an object hole) state. + * + * \note The dbuf's mutex is dropped temporarilly while the read is + * scheduled. Caller's must reverify if necessary any state + * protected by the dbuf mutex. + */ +void +dbuf_transition_to_read(dmu_buf_impl_t *db) +{ + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NEVERWAIT; + dnode_t *dn; + zio_t *zio = NULL; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_state & (DB_PARTIAL|DB_UNCACHED)); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED); + DB_DNODE_EXIT(db); + + mutex_exit(&db->db_mtx); + (void) dbuf_read(db, zio, rf); + (void) zio_nowait(zio); + mutex_enter(&db->db_mtx); +} + +#pragma weak dmu_buf_will_dirty_range = dbuf_will_dirty_range +/** + * \brief Signal intent to dirty a subset of the buffer. + * + * \param db The dbuf that will be dirtied + * \param tx The transaction the dirty will occur in + * \param offset The starting offset of the intended dirty + * \param size The length of the intended dirty + * + * XXX This needs to be merged into dbuf_will_dirty(). 
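+ *
+ * A minimal caller sketch (editor's illustration only; it mirrors the
+ * pattern used by dmu_buf_transfer_write() in dmu.c, assumes the range was
+ * reserved with dmu_tx_hold_write(), and "db", "tx", "off", "sz" and "src"
+ * are placeholder names):
+ *
+ *	if (sz == db->db_size)
+ *		dmu_buf_will_fill(db, tx);
+ *	else
+ *		dmu_buf_will_dirty_range(db, tx, off, sz);
+ *	bcopy(src, (char *)db->db_data + off, sz);
+ *	dmu_buf_fill_done(db, tx);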
+ */ void +dbuf_will_dirty_range(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, int size) +{ + dbuf_dirty_record_t *dr; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_level == 0); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); +#ifdef ZFS_DEBUG + { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); + DB_DNODE_EXIT(db); + } +#endif + + dbuf_dirty_leaf(db, tx, offset, size); +} + +void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - db->db_state = DB_NOFILL; - - dmu_buf_will_fill(db_fake, tx); + dbuf_dirty_nofill(db, tx); } void @@ -1499,8 +3067,11 @@ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - dbuf_noread(db); - (void) dbuf_dirty(db, tx); + /* Wait for another filler to finish. */ + while (db->db_state & DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + + dbuf_dirty_leaf(db, tx, 0, db->db.db_size); } #pragma weak dmu_buf_fill_done = dbuf_fill_done @@ -1510,16 +3081,40 @@ { mutex_enter(&db->db_mtx); DBUF_VERIFY(db); + if (db->db_state & DB_FILL) { + dbuf_dirty_record_t *dr; + + dr = list_head(&db->db_dirty_records); + ASSERT(dr->dr_txg == tx->tx_txg); + ASSERT(dr != db->db_data_pending); - if (db->db_state == DB_FILL) { - if (db->db_level == 0 && db->db_freed_in_flight) { + if (db->db_freed_in_flight) { + ASSERT(db->db_level == 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; + dbuf_dirty_record_cleanup_ranges(dr); + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "fill done handling freed in flight"); + } else { + /* + * This function can be called with another state bit + * set, but if FILL is the only bit set, then the + * buffer has been fully filled. Otherwise, clear the + * FILL bit, so it goes back to the steady state. + */ + if (db->db_state == DB_FILL) { + DBUF_STATE_CHANGE(db, =, DB_CACHED, + "filler finished, complete buffer"); + } else { + DBUF_STATE_CHANGE(db, &=, ~DB_FILL, + "filler finished, incomplete buffer"); + ASSERT(db->db_state & (DB_PARTIAL|DB_READ)); + } } - db->db_state = DB_CACHED; + cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); @@ -1534,6 +3129,8 @@ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { + dbuf_dirty_record_t *dr; + ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); @@ -1544,56 +3141,7 @@ arc_return_buf(buf, db); ASSERT(arc_released(buf)); - - mutex_enter(&db->db_mtx); - - while (db->db_state == DB_READ || db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); - - /* - * If the dbuf is cached and the number of holds exceeds the number - * of dirty calls on it, then dirty it again and remove the buffer - * reference, before copying the ARC buffer to the dbuf. 
- */ - if (db->db_state == DB_CACHED && - refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - xuio_stat_wbuf_copied(); - return; - } - - xuio_stat_wbuf_nocopy(); - if (db->db_state == DB_CACHED) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(db->db_buf != NULL); - if (dr != NULL && dr->dr_txg == tx->tx_txg) { - ASSERT(dr->dt.dl.dr_data == db->db_buf); - if (!arc_released(db->db_buf)) { - ASSERT(dr->dt.dl.dr_override_state == - DR_OVERRIDDEN); - arc_release(db->db_buf, db); - } - dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); - } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { - arc_release(db->db_buf, db); - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); - } - db->db_buf = NULL; - } - ASSERT(db->db_buf == NULL); - /* Set db->db_buf = buf */ - dbuf_set_data(db, buf); - db->db_state = DB_FILL; - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - /* clear db->db.db_data and tell waiters it's changed ?? */ + (void) dbuf_dirty_with_arcbuf(db, tx, buf); dbuf_fill_done(db, tx); } @@ -1611,7 +3159,7 @@ * ARC: dbuf_do_evict()->dbuf_destroy() */ void -dbuf_clear(dmu_buf_impl_t *db) +dbuf_clear(dmu_buf_impl_t *db, list_t *evict_list_p) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; @@ -1620,8 +3168,9 @@ ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); + ASSERT(list_is_empty(&db->db_dirty_records)); - dbuf_evict_user(db); + dbuf_queue_user_evict(db, evict_list_p); if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); @@ -1630,13 +3179,14 @@ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; - db->db_state = DB_UNCACHED; + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "buffer cleared"); } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); + ASSERT(list_is_empty(&db->db_dirty_records)); - db->db_state = DB_EVICTING; + DBUF_STATE_CHANGE(db, =, DB_EVICTING, "buffer eviction started"); db->db_blkptr = NULL; DB_DNODE_ENTER(db); @@ -1715,7 +3265,7 @@ } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err = dbuf_hold_impl(dn, level+1, - blkid >> epbs, fail_sparse, NULL, parentp); + blkid >> epbs, fail_sparse, NULL, parentp, NULL); if (err) return (err); err = dbuf_read(*parentp, NULL, @@ -1754,19 +3304,22 @@ db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + list_create(&db->db_dirty_records, sizeof(dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, db_dirty_record_link)); + + list_create(&db->db_dmu_buf_sets, sizeof(dmu_context_node_t), + offsetof(dmu_context_node_t, dcn_link)); + db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; - db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; - db->db_user_ptr = NULL; - db->db_user_data_ptr_ptr = NULL; - db->db_evict_func = NULL; + db->db_user = NULL; db->db_immediate_evict = 0; db->db_freed_in_flight = 0; @@ -1776,7 +3329,7 @@ (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; - db->db_state = DB_UNCACHED; + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "bonus buffer created"); /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); @@ -1799,7 
+3352,7 @@ * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); - db->db_state = DB_EVICTING; + db->db_state = DB_EVICTING; /* not worth logging this state change */ if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_cache, db); @@ -1807,7 +3360,7 @@ return (odb); } list_insert_head(&dn->dn_dbufs, db); - db->db_state = DB_UNCACHED; + DBUF_STATE_CHANGE(db, =, DB_UNCACHED, "regular buffer created"); mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); @@ -1829,21 +3382,26 @@ { arc_buf_t *buf = private; dmu_buf_impl_t *db = buf->b_private; + list_t evict_list; + dmu_buf_create_user_evict_list(&evict_list); + if (!MUTEX_HELD(&db->db_mtx)) mutex_enter(&db->db_mtx); ASSERT(refcount_is_zero(&db->db_holds)); + ASSERT(list_is_empty(&db->db_dirty_records)); if (db->db_state != DB_EVICTING) { ASSERT(db->db_state == DB_CACHED); DBUF_VERIFY(db); db->db_buf = NULL; - dbuf_evict(db); + dbuf_evict(db, &evict_list); } else { mutex_exit(&db->db_mtx); dbuf_destroy(db); } + dmu_buf_destroy_user_evict_list(&evict_list); return (0); } @@ -1880,6 +3438,8 @@ } db->db_parent = NULL; db->db_buf = NULL; + list_destroy(&db->db_dirty_records); + list_destroy(&db->db_dmu_buf_sets); ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); @@ -1944,18 +3504,22 @@ /** * \brief Returns with db_holds incremented, and db_mtx not held. * + * \note buf_set may be NULL. * \note dn_struct_rwlock must be held. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, - void *tag, dmu_buf_impl_t **dbp) + void *tag, dmu_buf_impl_t **dbp, dmu_buf_set_t *buf_set) { dmu_buf_impl_t *db, *parent = NULL; + list_t evict_list; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); + dmu_buf_create_user_evict_list(&evict_list); + *dbp = NULL; top: /* dbuf_find() returns with db_mtx held */ @@ -1984,7 +3548,7 @@ if (db->db_buf && refcount_is_zero(&db->db_holds)) { arc_buf_add_ref(db->db_buf, db); if (db->db_buf->b_data == NULL) { - dbuf_clear(db); + dbuf_clear(db, &evict_list); if (parent) { dbuf_rele(parent, NULL); parent = NULL; @@ -2001,27 +3565,36 @@ * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + if (db->db_data_pending && db->db_level == 0 && dn->dn_object != DMU_META_DNODE_OBJECT && - db->db_state == DB_CACHED && db->db_data_pending) { + db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = db->db_data_pending; + /* dbuf_sync_bonus does not set db_data_pending. */ + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + if (dr->dt.dl.dr_data == db->db_buf) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - dbuf_set_data(db, - arc_buf_alloc(dn->dn_objset->os_spa, - db->db.db_size, db, type)); + dbuf_set_data(db, dbuf_alloc_arcbuf(db)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } (void) refcount_add(&db->db_holds, tag); - dbuf_update_data(db); + dbuf_update_user_data(db); DBUF_VERIFY(db); + /* If a reading buffer set is associated, add the callback now. */ + if (buf_set != NULL && (buf_set->dmu_ctx->flags & DMU_CTX_FLAG_READ)) { + if (db->db_state == DB_CACHED) { + /* Dbuf is already at the desired state. 
*/ + dmu_buf_set_rele(buf_set, B_FALSE); + } else + dmu_context_node_add(&db->db_dmu_buf_sets, buf_set); + } mutex_exit(&db->db_mtx); + dmu_buf_destroy_user_evict_list(&evict_list); + /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (parent) dbuf_rele(parent, NULL); @@ -2037,16 +3610,15 @@ dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { - dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); - return (err ? NULL : db); + return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); + int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db, + /*buf_set*/NULL); return (err ? NULL : db); } @@ -2123,10 +3695,13 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; + list_t evict_list; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); + dmu_buf_create_user_evict_list(&evict_list); + /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus @@ -2137,14 +3712,16 @@ /* * We can't freeze indirects if there is a possibility that they - * may be modified in the current syncing context. + * may be modified in the current syncing context, or if there could + * be data in flight. */ - if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + if (db->db_buf && db->db_state == DB_CACHED && + holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) arc_buf_freeze(db->db_buf); if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_immediate_evict) - dbuf_evict_user(db); + dbuf_queue_user_evict(db, &evict_list); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { @@ -2168,27 +3745,37 @@ * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ +#ifdef ZFS_DEBUG + if ((db->db_state & (DB_UNCACHED|DB_NOFILL)) == 0) { + __dprintf(__FILE__, __func__, __LINE__, + "%s: dbuf invalid without ARC buffer: " + "state %d lvl=%d blkid=%d obj=%d\n", + __func__, db->db_state, db->db_level, + db->db_blkid, db->db.db_object); + } +#endif ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - dbuf_evict(db); + dbuf_evict(db, &evict_list); } else if (arc_released(db->db_buf)) { arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ - dbuf_set_data(db, NULL); + dbuf_clear_data(db, &evict_list); VERIFY(arc_buf_remove_ref(buf, db) == 1); - dbuf_evict(db); + dbuf_evict(db, &evict_list); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); if (!DBUF_IS_CACHEABLE(db)) - dbuf_clear(db); + dbuf_clear(db, &evict_list); else mutex_exit(&db->db_mtx); } } else { mutex_exit(&db->db_mtx); } + dmu_buf_destroy_user_evict_list(&evict_list); } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -2199,90 +3786,6 @@ } /** - * \param user_ptr [in] For use by the user and can be obtained - * via dmu_buf_get_user() - * - * \param user_data_ptr_ptr [in, out] Should be NULL, or a pointer to a - * pointer which will be set to db->db_data when - * the caller is allowed to access it. Note that - * db->db_data can change when dmu_buf_read, - * dmu_buf_tryupgrade, dmu_buf_will_dirty, or - * dmu_buf_will_fill are called. - * *user_data_ptr_ptr will be set to the new - * value when it changes. 
- * - * \param evict_func [in] If not NULL, evict_func will be called - * when this buffer is being excised from the - * cache, so that the data structure pointed to - * by user_data_ptr_ptr can be cleaned up. - * - * \returns NULL on success, or the existing user ptr if it's already - * been set. - * - * dmu_evict_user() will call the evict_func for all buffers in a - * objset with a given pageout func. - */ -void * -dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *evict_func) -{ - return (dmu_buf_update_user(db_fake, NULL, user_ptr, - user_data_ptr_ptr, evict_func)); -} - -/** - * The same as set_user, but request immediate eviction when hold count goes - * to zero. - */ -void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *evict_func) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_immediate_evict = TRUE; - return (dmu_buf_update_user(db_fake, NULL, user_ptr, - user_data_ptr_ptr, evict_func)); -} - -void * -dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, - void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_level == 0); - - ASSERT((user_ptr == NULL) == (evict_func == NULL)); - - mutex_enter(&db->db_mtx); - - if (db->db_user_ptr == old_user_ptr) { - db->db_user_ptr = user_ptr; - db->db_user_data_ptr_ptr = user_data_ptr_ptr; - db->db_evict_func = evict_func; - - dbuf_update_data(db); - } else { - old_user_ptr = db->db_user_ptr; - } - - mutex_exit(&db->db_mtx); - return (old_user_ptr); -} - -/** - * \return the user_ptr set with dmu_buf_set_user(), or NULL if not set. - */ -void * -dmu_buf_get_user(dmu_buf_t *db_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(!refcount_is_zero(&db->db_holds)); - - return (db->db_user_ptr); -} - -/** * \brief Tells if the given dbuf is freeable. 
*/ boolean_t @@ -2331,8 +3834,8 @@ if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); - (void) dbuf_hold_impl(dn, db->db_level+1, - db->db_blkid >> epbs, FALSE, db, &parent); + parent = dbuf_hold_level(dn, db->db_level + 1, + db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; @@ -2381,12 +3884,12 @@ /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); mutex_exit(&db->db_mtx); - dbuf_write(dr, db->db_buf, tx); + zio = dr->dr_zio = dbuf_write(dr, db->db_buf, tx); + mutex_enter(&dr->dt.di.dr_mtx); - zio = dr->dr_zio; - mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); @@ -2394,19 +3897,53 @@ } static void +dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + void *data = dr->dt.dl.dr_data; + dnode_t *dn; + + ASSERT3U(db->db_level, ==, 0); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(DB_DNODE_HELD(db)); + ASSERT(db->db_blkid == DMU_BONUS_BLKID); + ASSERT(data != NULL); + + dn = DB_DNODE(db); + ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); + + bcopy(data, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + DB_DNODE_EXIT(db); + + dbuf_undirty_bonus(dr); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); +} + +static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; + zio_t *zio; uint64_t txg = tx->tx_txg; + boolean_t resolve_pending; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); + if (db->db_state & DB_PARTIAL) { + /* + * Time has run out for waiting on any writer to fill + * this buffer. + */ + ASSERT(arc_released(*datap)); + dbuf_transition_to_read(db); + } + /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. @@ -2414,11 +3951,15 @@ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); - } else if (db->db_state == DB_FILL) { - /* This buffer was freed and is now being re-filled */ + } else if (db->db_state & DB_FILL) { + /* + * This buffer is being modified. Those modifications + * should be in a newer transaction group and not + * reference the data we are about to write. + */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); + ASSERT(db->db_state & (DB_CACHED|DB_READ|DB_NOFILL)); } DBUF_VERIFY(db); @@ -2438,33 +3979,7 @@ * be called). 
*/ if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_dirty_record_t **drp; - - ASSERT(*datap != NULL); - ASSERT3U(db->db_level, ==, 0); - ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); - bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); - DB_DNODE_EXIT(db); - - if (*datap != db->db.db_data) { - zio_buf_free(*datap, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - } - db->db_data_pending = NULL; - drp = &db->db_last_dirty; - while (*drp != dr) - drp = &(*drp)->dr_next; - ASSERT(dr->dr_next == NULL); - ASSERT(dr->dr_dbuf == db); - *drp = dr->dr_next; - if (dr->dr_dbuf->db_level != 0) { - list_destroy(&dr->dt.di.dr_children); - mutex_destroy(&dr->dt.di.dr_mtx); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_sync_bonus(dr, tx); return; } @@ -2488,33 +4003,43 @@ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } - if (db->db_state != DB_NOFILL && - dn->dn_object != DMU_META_DNODE_OBJECT && - refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { - /* - * If this buffer is currently "in use" (i.e., there - * are active holds and db_data still references it), - * then make a copy before we start the write so that - * any modifications from the open txg will not leak - * into this write. - * - * NOTE: this copy does not need to be made for - * objects only modified in the syncing context (e.g. - * DNONE_DNODE blocks). - */ - int blksz = arc_buf_size(*datap); - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_buf_alloc(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); - } - /* notify that the dirty record is about to write */ + /* Remember if we need to defer write execution to dbuf_read_done(). */ + resolve_pending = !list_is_empty(&dr->dt.dl.write_ranges); + + /* + * Syncer splits must be deferred until the buffer contents + * are fully valid. + */ + if (resolve_pending == B_FALSE && + dn->dn_object != DMU_META_DNODE_OBJECT) + dbuf_syncer_split(db, dr, /*deferred_split*/B_FALSE); + + /* Notify the world that this dirty record is about to write. */ db->db_data_pending = dr; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); mutex_exit(&db->db_mtx); - dbuf_write(dr, *datap, tx); + zio = dbuf_write(dr, *datap, tx); + + if (resolve_pending) { + + /* Resolve race with dbuf_read_done(). */ + mutex_enter(&db->db_mtx); + dr->dr_zio = zio; + resolve_pending = !list_is_empty(&dr->dt.dl.write_ranges); + mutex_exit(&db->db_mtx); + + if (resolve_pending) { + /* + * Resolve still pending. Let dbuf_read_done() + * fire the write. + */ + DB_DNODE_EXIT(db); + return; + } + } else + dr->dr_zio = zio; ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { @@ -2643,7 +4168,7 @@ blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint64_t txg = zio->io_txg; - dbuf_dirty_record_t **drp, *dr; + dbuf_dirty_record_t *dr; ASSERT3U(zio->io_error, ==, 0); ASSERT(db->db_blkptr == bp); @@ -2671,83 +4196,8 @@ * Now that the write is completed, the dirty record it resolves is * no longer needed, so remove it. 
*/ - drp = &db->db_last_dirty; - while ((dr = *drp) != db->db_data_pending) - drp = &dr->dr_next; - ASSERT(!list_link_active(&dr->dr_dirty_node)); - ASSERT(dr->dr_txg == txg); - ASSERT(dr->dr_dbuf == db); - ASSERT(dr->dr_next == NULL); - *drp = dr->dr_next; - -#ifdef ZFS_DEBUG - if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); - ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && - db->db_blkptr == &dn->dn_phys->dn_spill); - DB_DNODE_EXIT(db); - } -#endif - - /* Clean up the dirty record. */ - if (db->db_level == 0) { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != db->db_buf) { - /* - * What we wrote is already out of date, so - * just free the ARC buffer. - */ - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db) == 1); - } else if (!arc_released(db->db_buf)) { - /* - * Our dbuf hasn't already been evicted, so - * register a callback to clean it up once - * its ARC buffer is released. - */ - arc_set_callback(db->db_buf, dbuf_do_evict, db); - } - } - } else { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - /* - * The size of an indirect block must match what its - * associated dnode thinks it should be. - */ - ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); - /* - * If the dbuf's block pointer is not a hole, evict it when - * its last ARC buffer hold has been released. - */ - if (!BP_IS_HOLE(db->db_blkptr)) { - int epbs = - dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, - db->db.db_size); - ASSERT3U(dn->dn_phys->dn_maxblkid - >> (db->db_level * epbs), >=, db->db_blkid); - arc_set_callback(db->db_buf, dbuf_do_evict, db); - } - DB_DNODE_EXIT(db); - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - - cv_broadcast(&db->db_changed); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - db->db_data_pending = NULL; + ASSERT(db->db_data_pending->dr_dbuf == db); + dbuf_undirty_write(db->db_data_pending, txg); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); } @@ -2793,7 +4243,7 @@ /** * \brief Commit a dirty buffer to disk. */ -static void +static zio_t * dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; @@ -2804,6 +4254,7 @@ zbookmark_t zb; zio_prop_t zp; zio_t *pio; /* parent I/O */ + zio_t *dr_zio; int wp_flag = 0; DB_DNODE_ENTER(db); @@ -2862,28 +4313,34 @@ DB_DNODE_EXIT(db); if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * An immediate write has occurred via dmu_sync, which means + * its block pointer override needs to be handled here. 
+ */ ASSERT(db->db_state != DB_NOFILL); - dr->dr_zio = zio_write(pio, os->os_spa, txg, + dr_zio = zio_write(pio, os->os_spa, txg, db->db_blkptr, data->b_data, arc_buf_size(data), &zp, dbuf_write_override_ready, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, + zio_write_override(dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); - dr->dr_zio = zio_write(pio, os->os_spa, txg, + dr_zio = zio_write(pio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, dbuf_write_nofill_ready, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); - dr->dr_zio = arc_write(pio, os->os_spa, txg, + dr_zio = arc_write(pio, os->os_spa, txg, db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, dbuf_write_ready, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } + + return (dr_zio); } --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ #include @@ -101,6 +102,25 @@ { byteswap_uint64_array, TRUE, "bpobj subobj" }, }; +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dmu, CTLFLAG_RW, 0, "ZFS DMU"); +#define SYSCTL_COUNTER_U(name, desc) \ + uint64_t name; \ + SYSCTL_QUAD(_vfs_zfs_dmu, OID_AUTO, name, CTLFLAG_RD, \ + &name, 0, desc) +#define SYSCTL_REFCOUNT(name, desc) \ + uint_t name; \ + SYSCTL_INT(_vfs_zfs_dmu, OID_AUTO, name, CTLFLAG_RD, \ + &name, 0, desc) + +#ifdef ZFS_DEBUG +SYSCTL_REFCOUNT(dcn_in_flight, "DMU context nodes in flight"); +SYSCTL_COUNTER_U(dmu_ctx_total, "total number of DMU contexts"); +SYSCTL_COUNTER_U(buf_set_total, "total number of buffer sets"); +SYSCTL_REFCOUNT(dmu_ctx_in_flight, "number of DMU contexts in flight"); +SYSCTL_REFCOUNT(buf_set_in_flight, "number of buffer sets in flight"); +#endif + /** * \brief Obtain the DMU buffer from the specified object which contains the * specified offset. @@ -364,170 +384,6 @@ } /** - * \note longer-term, we should modify all of the dmu_buf_*() interfaces - * to take a held dnode rather than -- the lookup is wasteful, - * and can induce severe lock contention when writing to several files - * whose dnodes are in the same block. 
- */ -static int -dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) -{ - dsl_pool_t *dp = NULL; - dmu_buf_t **dbp; - uint64_t blkid, nblks, i; - uint32_t dbuf_flags; - int err; - zio_t *zio; - hrtime_t start; - - ASSERT(length <= DMU_MAX_ACCESS); - - dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; - if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) - dbuf_flags |= DB_RF_NOPREFETCH; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_datablkshift) { - int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset+length, 1ULL<> blkshift; - } else { - if (offset + length > dn->dn_datablksz) { - zfs_panic_recover("zfs: accessing past end of object " - "%llx/%llx (size=%u access=%llu+%llu)", - (longlong_t)dn->dn_objset-> - os_dsl_dataset->ds_object, - (longlong_t)dn->dn_object, dn->dn_datablksz, - (longlong_t)offset, (longlong_t)length); - rw_exit(&dn->dn_struct_rwlock); - return (EIO); - } - nblks = 1; - } - dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - - if (dn->dn_objset->os_dsl_dataset) - dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; - if (dp && dsl_pool_sync_context(dp)) - start = gethrtime(); - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - blkid = dbuf_whichblock(dn, offset); - for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); - if (db == NULL) { - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_rele_array(dbp, nblks, tag); - zio_nowait(zio); - return (EIO); - } - /* initiate async i/o */ - if (read) - (void) dbuf_read(db, zio, dbuf_flags); -#ifdef _KERNEL - else - curthread->td_ru.ru_oublock++; -#endif - dbp[i] = &db->db; - } - rw_exit(&dn->dn_struct_rwlock); - - /* wait for async i/o */ - err = zio_wait(zio); - /* track read overhead when we are in sync context */ - if (dp && dsl_pool_sync_context(dp)) - dp->dp_read_overhead += gethrtime() - start; - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - - /* wait for other io to complete */ - if (read) { - for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_UNCACHED) - err = EIO; - mutex_exit(&db->db_mtx); - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - } - } - - *numbufsp = nblks; - *dbpp = dbp; - return (0); -} - -/** - * Holds the DMU buffers which contain all bytes in a range of an object. A - * pointer to an array of dmu_buf_t*'s is returned (in *dbpp). 
- */ -static int -dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); - - dnode_rele(dn, FTAG); - - return (err); -} - -int -dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - int err; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); - DB_DNODE_EXIT(db); - - return (err); -} - -/** - * Releases the hold on an array of dmu_buf_t*'s, and frees the array. The - * hold on the array of buffers MUST be released with dmu_buf_rele_array. You - * can NOT release the hold on each buffer individually with dmu_buf_rele. - */ -void -dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) -{ - int i; - dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; - - if (numbufs == 0) - return; - - for (i = 0; i < numbufs; i++) { - if (dbp[i]) - dbuf_rele(dbp[i], tag); - } - - kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); -} - -/** * \brief Asynchronously try to read in the data. */ void @@ -750,126 +606,882 @@ return (0); } +/* + * DMU Context based functions. + */ + +/* Used for TSD for processing completed asynchronous I/Os. */ +uint_t zfs_async_io_key; + +void +dmu_context_node_add(list_t *list, dmu_buf_set_t *buf_set) +{ + dmu_context_node_t *dcn = kmem_zalloc(sizeof(dmu_context_node_t), + KM_SLEEP); + dcn->buf_set = buf_set; + list_insert_tail(list, dcn); +#ifdef ZFS_DEBUG + refcount_acquire(&dcn_in_flight); +#endif +} + +void +dmu_context_node_remove(list_t *list, dmu_context_node_t *dcn) +{ + list_remove(list, dcn); + kmem_free(dcn, sizeof(dmu_context_node_t)); +#ifdef ZFS_DEBUG + ASSERT(dcn_in_flight > 0); + refcount_release(&dcn_in_flight); +#endif +} + +static void +dmu_buf_read_xuio(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ +#ifdef _KERNEL + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + uio_t *uio = (uio_t *)dmu_ctx->data_buf; + xuio_t *xuio = (xuio_t *)uio; + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); + + if (dmu_xuio_add(xuio, abuf, off, sz) == 0) { + uio->uio_resid -= sz; + uio->uio_loffset += sz; + } + + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); +#endif +} + +static void +dmu_buf_read_uio(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ +#ifdef _KERNEL + uio_t *uio = (uio_t *)buf_set->dmu_ctx->data_buf; + struct iovec *iov = uio->uio_iov; + dprintf("%s: uio iov=%p iovcnt=%d base %p len %lu\n", + __func__, iov, uio->uio_iovcnt, iov->iov_base, + iov->iov_len); + if (uiomove((char *)db->db_data + off, sz, UIO_READ, uio)) + buf_set->err += 1; +#endif +} +static void +dmu_buf_write_uio(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ +#ifdef _KERNEL + uio_t *uio = (uio_t *)buf_set->dmu_ctx->data_buf; + struct iovec *iov = uio->uio_iov; + dprintf("%s: uio iov=%p iovcnt=%d base %p len %lu\n", + __func__, iov, uio->uio_iovcnt, iov->iov_base, + iov->iov_len); + if (uiomove((char *)db->db_data + off, sz, UIO_WRITE, uio)) + buf_set->err 
+= 1; +#endif +} + +static void +dmu_buf_read_char(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ + char *data = (char *)buf_set->dmu_ctx->data_buf + db->db_offset - + buf_set->dmu_ctx->dn_start + off; + dprintf("%s(set=%p, db=%p, off=%lu, sz=%lu) db_data=%p data=%p\n", + __func__, buf_set, db, off, sz, db->db_data + off, data); + bcopy((char *)db->db_data + off, data, sz); +} +static void +dmu_buf_write_char(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ + char *data = (char *)buf_set->dmu_ctx->data_buf + db->db_offset - + buf_set->dmu_ctx->dn_start + off; + dprintf("%s(set=%p, db=%p, off=%lu, sz=%lu) data=%p db_data=%p\n", + __func__, buf_set, db, off, sz, data, db->db_data + off); + bcopy(data, (char *)db->db_data + off, sz); +} + +static void +dmu_buf_write_pages(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ +#ifdef sun + int copied; + page_t *pp = (page_t *)dmu_context->data_buf; + + for (copied = 0; copied < sz; copied += PAGESIZE) { + caddr_t va; + int thiscpy; + + ASSERT3U(pp->p_offset, ==, db->db_offset + off); + thiscpy = MIN(PAGESIZE, sz - copied); + va = zfs_map_page(pp, S_READ); + bcopy(va, (char *)db->db_data + off, thiscpy); + zfs_unmap_page(pp, va); + pp = pp->p_next; + off += PAGESIZE; + } +#endif +} + +static void +dmu_buf_transfer_nofill(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ + dmu_tx_t *tx = DMU_BUF_SET_TX(buf_set); + dmu_buf_will_not_fill(db, tx); + /* No need to do any more here. */ +} + +static void +dmu_buf_transfer_write(dmu_buf_set_t *buf_set, dmu_buf_t *db, uint64_t off, + uint64_t sz) +{ + dmu_tx_t *tx = DMU_BUF_SET_TX(buf_set); + + if (sz == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty_range(db, tx, off, sz); + buf_set->dmu_ctx->move_cb(buf_set, db, off, sz); + dmu_buf_fill_done(db, tx); +} + +void +dmu_buf_set_transfer(dmu_buf_set_t *buf_set) +{ + uint64_t offset, size; + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + dmu_tx_t *tx = dmu_ctx->tx; + int i; + + /* Initialize the current state. */ + size = buf_set->size; + offset = buf_set->dn_start; + + /* Perform the I/O copy, one buffer at a time. */ + for (i = 0; i < buf_set->count; i++) { + dmu_buf_t *db = buf_set->dbp[i]; + uint64_t off = offset - db->db_offset; + uint64_t sz = MIN(db->db_size - off, size); + + ASSERT(size > 0); + dmu_ctx->buf_transfer_cb(buf_set, db, off, sz); + offset += sz; + size -= sz; + } +} + +void +dmu_buf_set_transfer_write(dmu_buf_set_t *buf_set) +{ + + dmu_buf_set_transfer(buf_set); + ASSERT(buf_set->dmu_ctx->dn != NULL); + /* Release the dnode immediately before committing the tx. */ + dnode_rele(buf_set->dmu_ctx->dn, buf_set->dmu_ctx->tag); + buf_set->dmu_ctx->dn = NULL; +} + +static void +dmu_buf_set_transfer_write_tx(dmu_buf_set_t *buf_set) +{ + + dmu_buf_set_transfer_write(buf_set); + dmu_tx_commit(buf_set->tx); +} + +/** + * \brief Release a DMU context hold, cleaning up if no holds remain. + * + * \param dmu_ctx DMU context to release. + */ +void +dmu_context_rele(dmu_context_t *dmu_ctx) +{ + dmu_buf_set_t *buf_set; + + if (!refcount_release(&dmu_ctx->holds)) + return; + +#ifdef ZFS_DEBUG + ASSERT(dmu_ctx_in_flight > 0); + refcount_release(&dmu_ctx_in_flight); +#endif + + if ((dmu_ctx->flags & DMU_CTX_FLAG_NO_HOLD) == 0 && dmu_ctx->dn != NULL) + dnode_rele(dmu_ctx->dn, dmu_ctx->tag); + + /* At this point, there are no buffer sets left. Call back. 
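+ * For illustration only (editor's sketch of a hypothetical consumer; the
+ * my_io_t type and my_read_done() are not part of this change): an
+ * asynchronous caller that embeds the dmu_context_t as the first member
+ * of its own tracking structure could be completed from this callback:
+ *
+ *	static void
+ *	my_read_done(dmu_context_t *ctx)
+ *	{
+ *		my_io_t *mio = (my_io_t *)ctx;
+ *
+ *		mutex_enter(&mio->mio_lock);
+ *		mio->mio_error = ctx->err;
+ *		mio->mio_done = B_TRUE;
+ *		cv_broadcast(&mio->mio_cv);
+ *		mutex_exit(&mio->mio_lock);
+ *	}
+ *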
*/ + if (dmu_ctx->context_cb != NULL) + dmu_ctx->context_cb(dmu_ctx); +} + +/** + * \brief Handle a completed buffer set, and its DMU context if necessary. + * + * \param buf_set Buffer set to handle. + */ +static void +dmu_buf_set_complete(dmu_buf_set_t *buf_set) +{ + int i; + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + + /* Only perform I/O if no errors occurred for the buffer set. */ + if (buf_set->err == 0) { + dmu_ctx->buf_set_transfer_cb(buf_set); + if (buf_set->err == 0) + atomic_add_64(&dmu_ctx->completed_size, buf_set->size); + } + /* Check again in case transfer causes errors. */ + if (buf_set->err) + atomic_add_int(&dmu_ctx->err, buf_set->err); + + for (i = 0; i < buf_set->count; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)buf_set->dbp[i]; + ASSERT(db != NULL); + dbuf_rele(db, dmu_ctx->tag); + } + +#ifdef ZFS_DEBUG + ASSERT(buf_set_in_flight > 0); + refcount_release(&buf_set_in_flight); +#endif + + kmem_free(buf_set, sizeof(dmu_buf_set_t) + + buf_set->dbp_length * sizeof(dmu_buf_t *)); + dmu_context_rele(dmu_ctx); +} + int -dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags) +dmu_thread_context_create(void) +{ + int ret = 0; +#ifdef _KERNEL /* XXX TSD only works in the kernel. FIXME! */ + dmu_cb_state_t *dcs; + + /* This function should never be called more than once in a thread. */ +#ifdef ZFS_DEBUG + dcs = tsd_get(zfs_async_io_key); + ASSERT(dcs == NULL); +#endif + + /* Called with taskqueue mutex held. */ + dcs = kmem_zalloc(sizeof(dmu_cb_state_t), KM_SLEEP); + list_create(&dcs->io_list, sizeof(dmu_context_node_t), + offsetof(dmu_context_node_t, dcn_link)); + + ret = tsd_set(zfs_async_io_key, dcs); +#ifdef ZFS_DEBUG + { + dmu_cb_state_t *check = tsd_get(zfs_async_io_key); + ASSERT(check == dcs); + } +#endif +#endif /* _KERNEL */ + return (ret); +} + +void +dmu_thread_context_destroy(void *context __unused) +{ + dmu_cb_state_t *dcs; + + dcs = tsd_get(zfs_async_io_key); + /* This function may be called on a thread that didn't call create. */ + if (dcs == NULL) + return; + + /* + * This function should only get called after a thread has finished + * processing its queue. + */ + ASSERT(list_is_empty(&dcs->io_list)); + + kmem_free(dcs, sizeof(dmu_cb_state_t)); + VERIFY(tsd_set(zfs_async_io_key, NULL) == 0); +} + +void +dmu_thread_context_process(void) +{ + dmu_cb_state_t *dcs = tsd_get(zfs_async_io_key); + dmu_context_node_t *dcn, *next; + + /* + * If the current thread didn't register, it doesn't handle queued + * async I/O's. It is probably not a zio thread. This is needed + * because zio_execute() can be called from non-zio threads. + */ + if (dcs == NULL) + return; + + for (dcn = list_head(&dcs->io_list); dcn != NULL; dcn = next) { + next = list_next(&dcs->io_list, dcn); + dmu_buf_set_complete(dcn->buf_set); + dmu_context_node_remove(&dcs->io_list, dcn); + } +} + +/** + * \brief Release a buffer set for a given dbuf. + * + * \param buf_set Buffer set to release. + * \param err Whether an error occurred. + * + * \invariant If specified, the dbuf's mutex must be held. + */ +void +dmu_buf_set_rele(dmu_buf_set_t *buf_set, boolean_t err) +{ + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + + /* Report an error, if any. */ + if (err) + atomic_add_int(&buf_set->err, 1); + + /* If we are finished, schedule this buffer set for delivery. 
*/ + ASSERT(buf_set->holds > 0); + if (refcount_release(&buf_set->holds)) { + dmu_cb_state_t *dcs = tsd_get(zfs_async_io_key); + + if (dcs != NULL && (dmu_ctx->flags & DMU_CTX_FLAG_ASYNC)) { + dmu_context_node_add(&dcs->io_list, buf_set); + } else { + /* + * The current thread doesn't have anything + * registered in its TSD, so it must not handle + * queued delivery. Dispatch this set now. + */ + dmu_buf_set_complete(buf_set); + } + } +} + +/** + * \brief Set up the buffers for a given set. + * + * \param buf_set Buffer set to set up buffers for. + * + * \retval errno If any buffer could not be held for this buffer set. + * \retval 0 Success. + */ +static int +dmu_buf_set_setup_buffers(dmu_buf_set_t *buf_set) +{ + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + dnode_t *dn = dmu_ctx->dn; + uint64_t blkid; + int dbuf_flags; + int i; + + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; + if ((dmu_ctx->flags & DMU_CTX_FLAG_PREFETCH) == 0 || + buf_set->size > zfetch_array_rd_sz) + dbuf_flags |= DB_RF_NOPREFETCH; + + blkid = dbuf_whichblock(dn, dmu_ctx->dn_offset); + /* + * Note that while this loop is running, any zio's set up for async + * reads are not executing, therefore access to this buf_set is + * serialized within this function; i.e. atomics are not needed here. + */ + for (i = 0; i < buf_set->count; i++) { + dmu_buf_impl_t *db = NULL; + int err = dbuf_hold_impl(dn, /*level*/0, blkid + i, + /*fail_sparse*/FALSE, dmu_ctx->tag, &db, buf_set); + uint64_t bufoff, bufsiz; + + if (db == NULL) { + /* Only include counts for the processed buffers. */ + buf_set->count = i; + buf_set->holds = i + 1 /*initiator*/; + zio_nowait(buf_set->zio); + return (err); + } + /* initiate async i/o */ + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) + (void) dbuf_read(db, buf_set->zio, dbuf_flags); +#ifdef _KERNEL + else + curthread->td_ru.ru_oublock++; +#endif + + /* Calculate the amount of data this buffer contributes. */ + ASSERT(dmu_ctx->dn_offset >= db->db.db_offset); + bufoff = dmu_ctx->dn_offset - db->db.db_offset; + bufsiz = (int)MIN(db->db.db_size - bufoff, buf_set->resid); + buf_set->resid -= bufsiz; + /* Update the caller's data to let them know what's next. */ + dmu_ctx->dn_offset += bufsiz; + dmu_ctx->resid -= bufsiz; + /* Put this dbuf in the buffer set's list. */ + buf_set->dbp[i] = &db->db; + } + return (0); +} + +/** + * \brief Set up a new transaction for the DMU context. + * + * \param dmu_ctx DMU context to set up new transaction for. + * \param txp Address to store dmu_tx_t pointer. + * \param dnp Address to store dnode_t pointer for new dnode. + */ +static int +dmu_context_setup_tx(dmu_context_t *dmu_ctx, dmu_tx_t **txp, dnode_t **dnp, + uint64_t size) +{ + int err; + + /* Readers and writers with a context transaction do not apply. */ + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) || dmu_ctx->tx != NULL) + return (0); + + *txp = dmu_tx_create(dmu_ctx->os); + dmu_tx_hold_write(*txp, dmu_ctx->object, dmu_ctx->dn_offset, size); + err = dmu_tx_assign(*txp, TXG_WAIT); + if (err) + goto out; + + /* + * Writer without caller TX: dnode hold is done here rather + * than in dmu_context_init(). + */ + err = dnode_hold(dmu_ctx->os, dmu_ctx->object, dmu_ctx->tag, dnp); + if (err) + goto out; + dmu_ctx->dn = *dnp; + +out: + if (err && *txp != NULL) { + dmu_tx_abort(*txp); + *txp = NULL; + } + return (err); +} + +/** + * \brief Initialize a buffer set of a certain size. + * + * \param dmu_ctx DMU context to associate the buffer set with. 
+ * \param buf_set_p Pointer to set to the new buffer set's address. + * \param size Requested size of the buffer set. + * + * \retval 0 Success. + * \retval EIO I/O error: tried to access past the end of the dnode, + * or dmu_buf_set_setup_buffers() failed. + */ +static int +dmu_buf_set_init(dmu_context_t *dmu_ctx, dmu_buf_set_t **buf_set_p, + uint64_t size) { - dnode_t *dn; - dmu_buf_t **dbp; - int numbufs, err; + dmu_buf_set_t *buf_set; + dmu_tx_t *tx = NULL; + size_t set_size; + int err, nblks; + dnode_t *dn = dmu_ctx->dn; + + ASSERT(dmu_ctx != NULL); + ASSERT(dmu_ctx->holds > 0); - err = dnode_hold(os, object, FTAG, &dn); + /* + * Create a transaction for writes, if needed. This must be done + * first in order to hold the correct struct_rwlock, use the + * correct values for dn_datablksz, etc. + */ + err = dmu_context_setup_tx(dmu_ctx, &tx, &dn, size); if (err) return (err); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + /* Figure out how many blocks are needed for the requested size. */ + if (dn->dn_datablkshift) { + nblks = P2ROUNDUP(dmu_ctx->dn_offset + size, dn->dn_datablksz); + nblks -= P2ALIGN(dmu_ctx->dn_offset, dn->dn_datablksz); + nblks >>= dn->dn_datablkshift; + } else { + if ((dmu_ctx->dn_offset + size) > dn->dn_datablksz) { + zfs_panic_recover("zfs: accessing past end of object " + "%llx/%llx (size=%u access=%llu+%llu)", + (longlong_t)dn->dn_objset-> + os_dsl_dataset->ds_object, + (longlong_t)dn->dn_object, dn->dn_datablksz, + (longlong_t)dmu_ctx->dn_offset, + (longlong_t)size); + err = EIO; + goto out; + } + nblks = 1; + } + + /* Create the new buffer set. */ + set_size = sizeof(dmu_buf_set_t) + nblks * sizeof(dmu_buf_t *); + buf_set = kmem_zalloc(set_size, KM_SLEEP); + + /* Initialize a new buffer set. */ +#ifdef ZFS_DEBUG + refcount_acquire(&buf_set_in_flight); + atomic_add_64(&buf_set_total, 1); +#endif + buf_set->size = size; + buf_set->resid = size; + buf_set->dn_start = dmu_ctx->dn_offset; + buf_set->count = nblks; + buf_set->dbp_length = nblks; + buf_set->tx = tx; + + /* Include a refcount for the initiator. */ + if (dmu_ctx->flags & DMU_CTX_FLAG_READ) + refcount_init(&buf_set->holds, nblks + 1); + else + /* For writes, dbufs never need to call us back. */ + refcount_init(&buf_set->holds, 1); + buf_set->dmu_ctx = dmu_ctx; + refcount_acquire(&dmu_ctx->holds); + /* Either we're a reader or we have a transaction somewhere. */ + ASSERT((dmu_ctx->flags & DMU_CTX_FLAG_READ) || DMU_BUF_SET_TX(buf_set)); + buf_set->zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + *buf_set_p = buf_set; + + err = dmu_buf_set_setup_buffers(buf_set); + +out: + if (err && tx != NULL) + dmu_tx_abort(tx); + if (dn != NULL) + rw_exit(&dn->dn_struct_rwlock); + return (err); +} + +/** + * \brief Process the I/Os queued for a given buffer set. + * + * \param buf_set Buffer set to process I/Os for. + * + * \retval errno Errors from zio_wait or a buffer went UNCACHED. + * \retval 0 Success. + */ +static int +dmu_buf_set_process_io(dmu_buf_set_t *buf_set) +{ + int err, i, syncing; + dsl_pool_t *dp = NULL; + hrtime_t start = 0; + dmu_context_t *dmu_ctx = buf_set->dmu_ctx; + dnode_t *dn = dmu_ctx->dn; + /* - * Deal with odd block sizes, where there can't be data past the first - * block. If we ever do the tail block optimization, we will need to - * handle that here as well. + * If the I/O is asynchronous, issue the I/O's without waiting. + * Writes do not need to wait for any ZIOs. */ - if (dn->dn_maxblkid == 0) { - int newsz = offset > dn->dn_datablksz ? 
0 : - MIN(size, dn->dn_datablksz - offset); - bzero((char *)buf + newsz, size - newsz); - size = newsz; + if ((dmu_ctx->flags & DMU_CTX_FLAG_ASYNC) || + (dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0) { + zio_nowait(buf_set->zio); + return (0); } - while (size > 0) { - uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); - int i; + /* Time accounting for sync context. */ + if (dn->dn_objset->os_dsl_dataset) + dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; + if (dp && dsl_pool_sync_context(dp)) + start = gethrtime(); + + /* Wait for async i/o. */ + err = zio_wait(buf_set->zio); + + /* Track read overhead when we are in sync context. */ + if (start) + dp->dp_read_overhead += gethrtime() - start; + if (err) + return (err); - /* - * NB: we could do this block-at-a-time, but it's nice - * to be reading in parallel. - */ - err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp, flags); + /* wait for other io to complete */ + for (i = 0; i < buf_set->count; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)buf_set->dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state & (DB_READ|DB_FILL)) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = EIO; + mutex_exit(&db->db_mtx); if (err) - break; + return (err); + } + return (0); +} + +/** + * \brief Issue the I/O specified in the given DMU context. + * + * \param dmu_ctx The DMU context. + * + * \return errno Errors executing I/O chunks. + * \return 0 If a DMU callback is specified; the callback + * receives any errors. + * \return 0 If no DMU callback is specified: Success. + */ +int +dmu_issue(dmu_context_t *dmu_ctx) +{ + int err = 0; + uint64_t io_size; + dmu_buf_set_t *buf_set; + + /* If this context is async, it must have a context callback. */ + ASSERT((dmu_ctx->flags & DMU_CTX_FLAG_ASYNC) == 0 || + dmu_ctx->context_cb != NULL); - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; + /* + * For writers, if a tx was specified but a dnode wasn't, hold here. + * This could be done in dmu_context_set_dmu_tx(), but that would + * require dmu.h to include a dnode_hold() prototype. + */ + if (dmu_ctx->tx != NULL && dmu_ctx->dn == NULL) { + err = dnode_hold(dmu_ctx->os, dmu_ctx->object, dmu_ctx->tag, + &dmu_ctx->dn); + if (err) + return (err); + } - ASSERT(size > 0); + /* While there is work left to do, execute the next chunk. */ + dprintf("%s(%p) -> buf %p off %lu sz %lu\n", __func__, dmu_ctx, + dmu_ctx->data_buf, dmu_ctx->dn_offset, dmu_ctx->resid); + while (dmu_ctx->resid > 0 && err == 0) { + io_size = MIN(dmu_ctx->resid, DMU_MAX_ACCESS/2); - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); + dprintf("%s(%p@%lu+%lu) chunk %lu\n", __func__, dmu_ctx, + dmu_ctx->dn_offset, dmu_ctx->resid, io_size); + err = dmu_buf_set_init(dmu_ctx, &buf_set, io_size); - bcopy((char *)db->db_data + bufoff, buf, tocpy); + /* Process the I/O requests, if the initialization passed. */ + if (err == 0) + err = dmu_buf_set_process_io(buf_set); - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); + dmu_buf_set_rele(buf_set, err ? B_TRUE : B_FALSE); } - dnode_rele(dn, FTAG); + /* + * At this point, either this I/O is async, or all buffer sets + * have finished processing. 
+ */ + ASSERT((dmu_ctx->flags & DMU_CTX_FLAG_ASYNC) || dmu_ctx->holds == 1); + return (err); } -void -dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) +/** + * \brief Set up a DMU context. + * + * \param dmu_ctx The DMU context. + * \param dn A held dnode to associate with the context, or NULL. + * \param os The object set associated with the context. + * \param object The object ID associated with the context. + * \param size Size of the I/O to be performed. + * \param offset Offset into the dnode to perform the I/O. + * \param data_buf Data buffer to perform I/O transfers with. + * \param tag Hold tag to use. + * \param flags DMU context flags. + * + * \note The dnode must not be NULL, unless this is a writer. + * \note The dnode, if specified, must be held, unless the + * DMU_CTX_FLAG_NO_HOLD flag is specified. + */ +int +dmu_context_init(dmu_context_t *dmu_ctx, struct dnode *dn, objset_t *os, + uint64_t object, uint64_t offset, uint64_t size, void *data_buf, void *tag, + uint32_t flags) { - dmu_buf_t **dbp; - int numbufs, i; + boolean_t reader = (flags & DMU_CTX_FLAG_READ) != 0; + int err; + +#ifdef ZFS_DEBUG + refcount_acquire(&dmu_ctx_in_flight); + atomic_add_64(&dmu_ctx_total, 1); + /* Make sure the dnode is passed in appropriately. */ + if (dn == NULL) + ASSERT(os != NULL); + else + ASSERT(!refcount_is_zero(&dn->dn_holds) || + (flags & DMU_CTX_FLAG_NO_HOLD)); +#endif +#ifndef sun + ASSERT((flags & DMU_CTX_FLAG_SUN_PAGES) == 0); +#endif + + /* Make sure the flags are compatible with the I/O type. */ + ASSERT(reader || ((flags & DMU_CTX_READER_FLAGS) == 0)); + ASSERT(!reader || ((flags & DMU_CTX_WRITER_FLAGS) == 0)); + /* The NOFILL flag and a NULL data_buf go hand in hand. */ + ASSERT(((flags & DMU_CTX_FLAG_NOFILL) != 0) ^ (data_buf != NULL)); + + /* + * If the caller is a reader and didn't pass in a dnode, hold it. + * Writers (re-)hold a dnode in dmu_context_setup_tx(), or if a tx + * is specified, in dmu_issue(). + */ + if (dn == NULL && (flags & DMU_CTX_FLAG_READ)) { + err = dnode_hold(os, object, tag, &dn); + if (err) + return (err); + } - if (size == 0) - return; + /* All set, actually initialize the context! */ + bzero(dmu_ctx, sizeof(dmu_context_t)); + dmu_ctx->dn = dn; + dmu_ctx->os = os; + dmu_ctx->object = object; + dmu_ctx->size = size; + dmu_context_seek(dmu_ctx, offset, size, data_buf); + dmu_ctx->tag = tag; + dmu_ctx->flags = flags; - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + /* Initialize default I/O callbacks. */ + if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { +#ifdef UIO_XUIO + uio_t *uio = (uio_t *)dmu_ctx->data_buf; + if (uio->uio_extflg == UIO_XUIO) { + ASSERT(reader); + dmu_ctx->move_cb = dmu_buf_read_xuio; + } else +#endif + { + dmu_ctx->move_cb = reader ? dmu_buf_read_uio : + dmu_buf_write_uio; + } + } else if (dmu_ctx->flags & DMU_CTX_FLAG_SUN_PAGES) { + /* implies writer */ + dmu_ctx->move_cb = dmu_buf_write_pages; + } else { + dmu_ctx->move_cb = reader ? dmu_buf_read_char : + dmu_buf_write_char; + } + dmu_ctx->buf_set_transfer_cb = reader ? dmu_buf_set_transfer : + dmu_buf_set_transfer_write_tx; + if ((dmu_ctx->flags & DMU_CTX_FLAG_NOFILL) == 0) { + dmu_ctx->buf_transfer_cb = reader ? dmu_ctx->move_cb : + dmu_buf_transfer_write; + } else + dmu_ctx->buf_transfer_cb = dmu_buf_transfer_nofill; - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; + /* Initialize including a refcount for the initiator. 
*/ + refcount_init(&dmu_ctx->holds, 1); + return (0); +} - ASSERT(size > 0); +/** + * \brief Update a DMU context for the next call. + * + * \param dmu_ctx The DMU context. + * \param data_buf The updated destination data buffer. + * \param offset The offset into the dnode. + * \param size The size of the next call. + */ +void +dmu_context_seek(dmu_context_t *dmu_ctx, uint64_t offset, uint64_t size, + void *data_buf) +{ + dnode_t *dn = dmu_ctx->dn; - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); +#ifdef ZFS_DEBUG +#ifdef _KERNEL + if (dmu_ctx->flags & DMU_CTX_FLAG_UIO) { + uio_t *uio = (uio_t *)data_buf; + /* Make sure UIO callers pass in the correct offset. */ + ASSERT(uio->uio_loffset == offset); + } +#endif + /* Make sure non-char * pointers stay the same. */ + if (!DMU_CTX_BUF_IS_CHAR(dmu_ctx)) + ASSERT(dmu_ctx->data_buf == NULL || + dmu_ctx->data_buf == data_buf); +#endif /* ZFS_DEBUG */ - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + /* + * Deal with odd block sizes, where there can't be data past + * the first block. If we ever do the tail block optimization, + * we will need to handle that here as well. + */ + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) && dn->dn_maxblkid == 0 && + DMU_CTX_BUF_IS_CHAR(dmu_ctx)) { + int newsz = offset > dn->dn_datablksz ? 0 : + MIN(size, dn->dn_datablksz - offset); + bzero((char *)data_buf + newsz, size - newsz); + size = newsz; + } + dmu_ctx->dn_offset = offset; + dmu_ctx->dn_start = offset; + dmu_ctx->resid = size; + dmu_ctx->data_buf = data_buf; +} - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *data_buf, uint32_t flags) +{ + int err; + dmu_context_t dmu_ctx; - bcopy(buf, (char *)db->db_data + bufoff, tocpy); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, offset, + size, data_buf, FTAG, flags|DMU_CTX_FLAG_READ); + if (err) + return (err); - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); } void +dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *data_buf, dmu_tx_t *tx) +{ + void *data_bufp = (void *)(uintptr_t)data_buf; + dmu_context_t dmu_ctx; + int err; + + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, offset, + size, data_bufp, FTAG, /*flags*/0); + VERIFY(err == 0); + dmu_context_set_dmu_tx(&dmu_ctx, tx); + + (void) dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); +} + +int dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { - dmu_buf_t **dbp; - int numbufs, i; + uint32_t flags = DMU_CTX_FLAG_NOFILL; + dmu_context_t dmu_ctx; + int err; if (size == 0) - return; + return (0); - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, offset, + size, /*data_buf*/NULL, FTAG, flags); + if (err) + return (err); - for (i = 0; i < numbufs; i++) { - dmu_buf_t *db = dbp[i]; + dmu_context_set_dmu_tx(&dmu_ctx, tx); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); - dmu_buf_will_not_fill(db, tx); - } - dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); } /** @@ -999,111 +1611,17 @@ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { - dmu_buf_t **dbp; - 
int numbufs, i, err; - xuio_t *xuio = NULL; + dmu_context_t dmu_ctx; + uint32_t dmu_flags = DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO; + int err; - /* - * NB: we could do this block-at-a-time, but it's nice - * to be reading in parallel. - */ - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, - &numbufs, &dbp); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, + uio->uio_loffset, size, uio, FTAG, dmu_flags); if (err) return (err); -#ifdef UIO_XUIO - if (uio->uio_extflg == UIO_XUIO) - xuio = (xuio_t *)uio; -#endif - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - if (xuio) { - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - arc_buf_t *dbuf_abuf = dbi->db_buf; - arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); - if (!err) { - uio->uio_resid -= tocpy; - uio->uio_loffset += tocpy; - } - - if (abuf == dbuf_abuf) - XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); - else - XUIOSTAT_BUMP(xuiostat_rbuf_copied); - } else { - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); - } - if (err) - break; - - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - - return (err); -} - -static int -dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs; - int err = 0; - int i; - - err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - /* - * XXX uiomove could block forever (eg. nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that uiomove won't - * block. 
- */ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - if (err) - break; - - size -= tocpy; - } - - dmu_buf_rele_array(dbp, numbufs, FTAG); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); return (err); } @@ -1113,14 +1631,22 @@ { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; + dmu_context_t dmu_ctx; int err; + uint32_t flags = DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_NO_HOLD; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); - err = dmu_write_uio_dnode(dn, uio, size, tx); + err = dmu_context_init(&dmu_ctx, dn, dn->dn_objset, dn->dn_object, + uio->uio_loffset, size, uio, FTAG, flags); + if (err == 0) { + dmu_context_set_dmu_tx(&dmu_ctx, tx); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); + } DB_DNODE_EXIT(db); return (err); @@ -1130,20 +1656,21 @@ dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, dmu_tx_t *tx) { - dnode_t *dn; + dmu_context_t dmu_ctx; + uint32_t dmu_flags = DMU_CTX_FLAG_UIO; int err; if (size == 0) return (0); - err = dnode_hold(os, object, FTAG, &dn); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, + uio->uio_loffset, size, uio, FTAG, dmu_flags); if (err) return (err); - err = dmu_write_uio_dnode(dn, uio, size, tx); - - dnode_rele(dn, FTAG); - + dmu_context_set_dmu_tx(&dmu_ctx, tx); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); return (err); } @@ -1152,54 +1679,21 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) { - dmu_buf_t **dbp; - int numbufs, i; + dmu_context_t dmu_ctx; + uint32_t dmu_flags = DMU_CTX_FLAG_SUN_PAGES; int err; if (size == 0) return (0); - err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp); + err = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, object, offset, + size, pp, FTAG, dmu_flags); if (err) return (err); - for (i = 0; i < numbufs; i++) { - int tocpy, copied, thiscpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - caddr_t va; - - ASSERT(size > 0); - ASSERT3U(db->db_size, >=, PAGESIZE); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - for (copied = 0; copied < tocpy; copied += PAGESIZE) { - ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); - thiscpy = MIN(PAGESIZE, tocpy - copied); - va = zfs_map_page(pp, S_READ); - bcopy(va, (char *)db->db_data + bufoff, thiscpy); - zfs_unmap_page(pp, va); - pp = pp->p_next; - bufoff += PAGESIZE; - } - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - offset += tocpy; - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); + dmu_context_set_dmu_tx(&dmu_ctx, tx); + err = dmu_issue(&dmu_ctx); + dmu_context_rele(&dmu_ctx); return (err); } #endif /* sun */ @@ -1465,9 +1959,11 @@ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } - dr = db->db_last_dirty; - while (dr && dr->dr_txg != txg) - dr = dr->dr_next; + for (dr = list_head(&db->db_dirty_records); dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { + if (dr->dr_txg == txg) + break; + } if (dr == NULL) { /* @@ -1478,6 +1974,30 @@ return (ENOENT); } + /* + * XXX TEMP + * If the dirty record is not CACHED, then the dmu_sync call needs + * to wait for it to reach that state. Thus, it needs to issue the + * READ if necessary. 
Then, it needs to wait for that read to + * complete, which should cause the dbuf to become CACHED after + * resolving the applicable write ranges. At that point, the sync + * can be completed. + */ + if (db->db_state != DB_CACHED) { + if (db->db_state & (DB_UNCACHED|DB_PARTIAL)) + dbuf_transition_to_read(db); + + while (db->db_state & (DB_READ|DB_FILL)) + cv_wait(&db->db_changed, &db->db_mtx); + + /* The dbuf had an I/O error or was freed in flight */ + if (db->db_state == DB_UNCACHED) { + mutex_exit(&db->db_mtx); + return (ENOENT); + } + } + ASSERT(db->db_state == DB_CACHED); + ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { @@ -1860,8 +2380,8 @@ void dmu_fini(void) { + arc_fini(); /* arc depends on l2arc */ l2arc_fini(); - arc_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -1009,7 +1009,10 @@ dmu_objset_name(os, name); strlcat(name, "@", sizeof(name)); strlcat(name, snapname, sizeof(name)); - zvol_create_minors(name); + err = zvol_create_minors(name); + if (err) + printf("ZFS WARNING: Unable to create minors" + " for snapshot %s\n", name); } #endif #endif @@ -1305,15 +1308,17 @@ static void * dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dbuf_dirty_record_t *dr, **drp; + dbuf_dirty_record_t *dr; void *data; if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + for (dr = list_head(&db->db_dirty_records); dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { if (dr->dr_txg == tx->tx_txg) break; + } if (dr == NULL) { data = NULL; --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -23,6 +23,7 @@ */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. 
*/ #include @@ -269,23 +270,8 @@ zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - /* first level-0 block */ start = off >> dn->dn_datablkshift; - if (P2PHASE(off, dn->dn_datablksz) || - len < dn->dn_datablksz) { - err = dmu_tx_check_ioerr(zio, dn, 0, start); - if (err) - goto out; - } - - /* last level-0 block */ end = (off+len-1) >> dn->dn_datablkshift; - if (end != start && end <= dn->dn_maxblkid && - P2PHASE(off+len, dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(zio, dn, 0, end); - if (err) - goto out; - } /* level-1 blocks */ if (nlvls > 1) { @@ -331,7 +317,8 @@ dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db, + /*buf_set*/NULL); rw_exit(&dn->dn_struct_rwlock); if (err) { @@ -544,7 +531,8 @@ blkoff = P2PHASE(blkid, epb); tochk = MIN(epb - blkoff, nblks); - err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); + err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf, + /*buf_set*/NULL); if (err) { txh->txh_tx->tx_err = err; break; @@ -805,7 +793,7 @@ #ifdef ZFS_DEBUG void -dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) +dmu_tx_verify_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { dmu_tx_hold_t *txh; int match_object = FALSE, match_offset = FALSE; --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ #include @@ -449,13 +450,13 @@ dn->dn_assigned_txg = 0; dn->dn_dirtyctx = 0; - if (dn->dn_dirtyctx_firstset != NULL) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } + dn->dn_dirtyctx_firstset = NULL; if (dn->dn_bonus != NULL) { + list_t evict_list; + dmu_buf_create_user_evict_list(&evict_list); mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_evict(dn->dn_bonus, &evict_list); + dmu_buf_destroy_user_evict_list(&evict_list); dn->dn_bonus = NULL; } dn->dn_zio = NULL; @@ -542,10 +543,7 @@ dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; - if (dn->dn_dirtyctx_firstset) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } + dn->dn_dirtyctx_firstset = NULL; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; @@ -955,15 +953,12 @@ } static void -dnode_buf_pageout(dmu_buf_t *db, void *arg) +dnode_buf_pageout(dmu_buf_user_t *dbu) { - dnode_children_t *children_dnodes = arg; + dnode_children_t *children_dnodes = (dnode_children_t *)dbu; int i; - int epb = db->db_size >> DNODE_SHIFT; - ASSERT(epb == children_dnodes->dnc_count); - - for (i = 0; i < epb; i++) { + for (i = 0; i < children_dnodes->dnc_count; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; @@ -993,7 +988,7 @@ dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + - (epb - 1) * sizeof (dnode_handle_t)); + (children_dnodes->dnc_count - 1) * sizeof (dnode_handle_t)); } /** @@ -1075,7 +1070,7 @@ idx = object & (epb-1); ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); - children_dnodes = dmu_buf_get_user(&db->db); + children_dnodes = (dnode_children_t *)dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { int i; dnode_children_t *winner; @@ -1087,8 +1082,11 @@ zrl_init(&dnh[i].dnh_zrlock); dnh[i].dnh_dnode = NULL; } - if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, - 
dnode_buf_pageout)) { + dmu_buf_init_user(&children_dnodes->db_evict, + dnode_buf_pageout, NULL); + winner = (dnode_children_t *) + dmu_buf_set_user(&db->db, &children_dnodes->db_evict); + if (winner) { kmem_free(children_dnodes, sizeof (dnode_children_t) + (epb - 1) * sizeof (dnode_handle_t)); children_dnodes = winner; @@ -1206,6 +1204,7 @@ void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { + dmu_buf_impl_t *db; objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; @@ -1266,7 +1265,8 @@ */ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); - (void) dbuf_dirty(dn->dn_dbuf, tx); + db = dn->dn_dbuf; + (void) dbuf_dirty(db, tx); dsl_dataset_dirty(os->os_dsl_dataset, tx); } @@ -1351,7 +1351,7 @@ goto fail; /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db, /*buf_set*/NULL); if (err == 0) dbuf_new_size(db, size, tx); else if (err != ENOENT) @@ -1459,6 +1459,34 @@ rw_downgrade(&dn->dn_struct_rwlock); } +/** + * \brief Mark a dnode as dirty if it is not already. + * + * \param dn Dnode to mark dirty. + * \param tx Transaction the dnode is being dirtied in. + * \param tag Tag to track the first dirty of this dnode. + */ +void +dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag) +{ + + mutex_enter(&dn->dn_mtx); + /* + * Don't set dirtyctx to SYNC if we're just modifying this as we + * initialize the objset. + */ + if (dn->dn_dirtyctx == DN_UNDIRTIED) { + if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { + if (dmu_tx_is_syncing(tx)) + dn->dn_dirtyctx = DN_DIRTY_SYNC; + else + dn->dn_dirtyctx = DN_DIRTY_OPEN; + } + dn->dn_dirtyctx_firstset = tag; + } + mutex_exit(&dn->dn_mtx); +} + void dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) { @@ -1562,11 +1590,11 @@ if (len < head) head = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, - FTAG, &db) == 0) { + FTAG, &db, /*buf_set*/NULL) == 0) { caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ - if (db->db_last_dirty || + if (!list_is_empty(&db->db_dirty_records) || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dbuf_will_dirty(db, tx); @@ -1600,9 +1628,9 @@ if (len < tail) tail = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { + TRUE, FTAG, &db, /*buf_set*/NULL) == 0) { /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || + if (!list_is_empty(&db->db_dirty_records) || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dbuf_will_dirty(db, tx); @@ -1851,7 +1879,8 @@ data = dn->dn_phys->dn_blkptr; } else { uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); - error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db, + /*buf_set*/NULL); if (error) { if (error != ENOENT) return (error); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -164,15 +164,19 @@ rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + (db->db_blkid << epbs) + i, TRUE, FTAG, &child, + /*buf_set*/NULL); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; ASSERT(err == 0); ASSERT(child->db_level == 0); - dr = child->db_last_dirty; - while (dr && dr->dr_txg > txg) - dr = dr->dr_next; + + for (dr = list_head(&child->db_dirty_records); + dr != NULL && dr->dr_txg > txg; + dr = 
list_next(&child->db_dirty_records, dr)) + ; + ASSERT(dr == NULL || dr->dr_txg == txg); /* data_old better be zeroed */ @@ -194,7 +198,7 @@ mutex_enter(&child->db_mtx); buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && - child->db_last_dirty == NULL) { + list_is_empty(&child->db_dirty_records)) { for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " @@ -262,7 +266,8 @@ FREE_VERIFY(db, start, end, tx); blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); - ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + ASSERT(all || blocks_freed == 0 || + !list_is_empty(&db->db_dirty_records)); DB_DNODE_EXIT(db); return (all ? ALL : blocks_freed); } @@ -271,7 +276,8 @@ if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); + err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb, + /*buf_set*/NULL); ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); @@ -295,7 +301,8 @@ ASSERT3U(bp->blk_birth, ==, 0); } #endif - ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + ASSERT(all || blocks_freed == 0 || + !list_is_empty(&db->db_dirty_records)); return (all ? ALL : blocks_freed); } @@ -347,7 +354,8 @@ if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db); + err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db, + /*buf_set*/NULL); ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); @@ -375,7 +383,10 @@ { int progress; int pass = 0; + list_t evict_list; + dmu_buf_create_user_evict_list(&evict_list); + do { dmu_buf_impl_t *db, marker; int evicting = FALSE; @@ -400,10 +411,13 @@ mutex_exit(&db->db_mtx); } else if (refcount_is_zero(&db->db_holds)) { progress = TRUE; - dbuf_clear(db); /* exits db_mtx for us */ + dbuf_clear(db, &evict_list); } else { mutex_exit(&db->db_mtx); } + /* Make sure dbuf_clear exits db_mtx for us. */ + ASSERT(MUTEX_NOT_HELD(&db->db_mtx)); + dmu_buf_process_user_evicts(&evict_list); } list_remove(&dn->dn_dbufs, &marker); @@ -424,10 +438,11 @@ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_evict(dn->dn_bonus, &evict_list); dn->dn_bonus = NULL; } rw_exit(&dn->dn_struct_rwlock); + dmu_buf_destroy_user_evict_list(&evict_list); } static void @@ -445,8 +460,9 @@ mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? 
*/ list_remove(list, dr); - ASSERT(db->db_last_dirty == dr); - db->db_last_dirty = NULL; + ASSERT(list_head(&db->db_dirty_records) == dr); + list_remove_head(&db->db_dirty_records); + dbuf_dirty_record_cleanup_ranges(dr); db->db_dirtycnt -= 1; if (db->db_level == 0) { ASSERT(db->db_blkid == DMU_BONUS_BLKID || --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -165,7 +165,7 @@ if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { int64_t delta; - dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); + dprintf_bp(bp, "freeing ds=%llu\n", ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_dir->dd_lock); @@ -258,11 +258,9 @@ return (B_TRUE); } -/* ARGSUSED */ static void -dsl_dataset_evict(dmu_buf_t *db, void *dsv) +dsl_dataset_evict_impl(dsl_dataset_t *ds, boolean_t evict_deadlist) { - dsl_dataset_t *ds = dsv; ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); @@ -277,7 +275,7 @@ } bplist_destroy(&ds->ds_pending_deadlist); - if (db != NULL) { + if (evict_deadlist) { dsl_deadlist_close(&ds->ds_deadlist); } else { ASSERT(ds->ds_deadlist.dl_dbuf == NULL); @@ -301,6 +299,13 @@ kmem_free(ds, sizeof (dsl_dataset_t)); } +/* ARGSUSED */ +static void +dsl_dataset_evict(dmu_buf_user_t *dbu) +{ + dsl_dataset_evict_impl((dsl_dataset_t *)dbu, B_TRUE); +} + static int dsl_dataset_get_snapname(dsl_dataset_t *ds) { @@ -389,7 +394,7 @@ if (doi.doi_type != DMU_OT_DSL_DATASET) return (EINVAL); - ds = dmu_buf_get_user(dbuf); + ds = (dsl_dataset_t *)dmu_buf_get_user(dbuf); if (ds == NULL) { dsl_dataset_t *winner; @@ -471,10 +476,12 @@ ds->ds_reserved = ds->ds_quota = 0; } - if (err == 0) { - winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, - dsl_dataset_evict); - } + dmu_buf_init_user(&ds->db_evict, dsl_dataset_evict, + (void **)&ds->ds_phys); + if (err == 0) + winner = (dsl_dataset_t *) + dmu_buf_set_user_ie(dbuf, &ds->db_evict); + if (err || winner) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); @@ -747,7 +754,7 @@ if (ds->ds_dbuf) dsl_dataset_drop_ref(ds, tag); else - dsl_dataset_evict(NULL, ds); + dsl_dataset_evict_impl(ds, B_FALSE); } boolean_t @@ -1485,6 +1492,7 @@ } struct refsarg { + dmu_buf_user_t db_evict; kmutex_t lock; boolean_t gone; kcondvar_t cv; @@ -1492,9 +1500,9 @@ /* ARGSUSED */ static void -dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) +dsl_dataset_refs_gone(dmu_buf_user_t *dbu) { - struct refsarg *arg = argv; + struct refsarg *arg = (struct refsarg *)dbu; mutex_enter(&arg->lock); arg->gone = TRUE; @@ -1506,13 +1514,18 @@ dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) { struct refsarg arg; + dmu_buf_user_t *old_user; bzero(&arg, sizeof(arg)); mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); arg.gone = FALSE; - (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, - dsl_dataset_refs_gone); + dmu_buf_init_user(&arg.db_evict, dsl_dataset_refs_gone, + (void **)&ds->ds_phys); + old_user = dmu_buf_update_user(ds->ds_dbuf, &ds->db_evict, + &arg.db_evict); + ASSERT(old_user == &ds->db_evict); + dmu_buf_rele(ds->ds_dbuf, tag); mutex_enter(&arg.lock); while (!arg.gone) --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -50,9 +50,9 @@ /* ARGSUSED */ static void -dsl_dir_evict(dmu_buf_t *db, void *arg) +dsl_dir_evict(dmu_buf_user_t *dbu) { - dsl_dir_t *dd = arg; + dsl_dir_t *dd = (dsl_dir_t *)dbu; dsl_pool_t 
*dp = dd->dd_pool; int t; @@ -90,7 +90,7 @@ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); if (err) return (err); - dd = dmu_buf_get_user(dbuf); + dd = (dsl_dir_t *)dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG { dmu_object_info_t doi; @@ -159,8 +159,9 @@ dmu_buf_rele(origin_bonus, FTAG); } - winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, - dsl_dir_evict); + dmu_buf_init_user(&dd->db_evict, dsl_dir_evict, + (void **)&dd->dd_phys); + winner = (dsl_dir_t *)dmu_buf_set_user_ie(dbuf, &dd->db_evict); if (winner) { if (dd->dd_parent) dsl_dir_close(dd->dd_parent, dd); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -55,6 +55,10 @@ kmutex_t zfs_write_limit_lock; +#ifdef ZFS_DEBUG +zio_t *syncer_zio = NULL; +#endif + static pgcnt_t old_physmem = 0; SYSCTL_DECL(_vfs_zfs); @@ -365,6 +369,9 @@ dsl_dataset_sync(ds, zio, tx); } DTRACE_PROBE(pool_sync__1setup); +#ifdef ZFS_DEBUG + syncer_zio = zio; +#endif err = zio_wait(zio); write_time = gethrtime() - start; @@ -387,6 +394,9 @@ dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, zio, tx); } +#ifdef ZFS_DEBUG + syncer_zio = zio; +#endif err = zio_wait(zio); /* @@ -419,6 +429,9 @@ list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(mos, zio, tx); +#ifdef ZFS_DEBUG + syncer_zio = zio; +#endif err = zio_wait(zio); ASSERT(err == 0); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c @@ -1280,9 +1280,9 @@ /*ARGSUSED*/ void -sa_evict(dmu_buf_t *db, void *sap) +sa_evict(dmu_buf_user_t *dbu) { - panic("evicting sa dbuf %p\n", (void *)db); + panic("evicting sa dbuf\n"); } static void @@ -1321,9 +1321,10 @@ void sa_handle_destroy(sa_handle_t *hdl) { + dmu_buf_t *db = hdl->sa_bonus; + mutex_enter(&hdl->sa_lock); - (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl, - NULL, NULL, NULL); + (void) dmu_buf_update_user(db, &hdl->db_evict, NULL); if (hdl->sa_bonus_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); @@ -1349,7 +1350,7 @@ { int error = 0; dmu_object_info_t doi; - sa_handle_t *handle; + sa_handle_t *handle = NULL, *winner = NULL; #ifdef ZFS_DEBUG dmu_object_info_from_db(db, &doi); @@ -1359,23 +1360,27 @@ /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ - handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL; + if (hdl_type == SA_HDL_SHARED) + handle = (sa_handle_t *)dmu_buf_get_user(db); + if (handle == NULL) { - sa_handle_t *newhandle; handle = kmem_cache_alloc(sa_cache, KM_SLEEP); + bzero(&handle->db_evict, sizeof(dmu_buf_user_t)); handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; handle->sa_spill = NULL; error = sa_build_index(handle, SA_BONUS); - newhandle = (hdl_type == SA_HDL_SHARED) ? 
- dmu_buf_set_user_ie(db, handle, - NULL, sa_evict) : NULL; + if (hdl_type == SA_HDL_SHARED) { + dmu_buf_init_user(&handle->db_evict, sa_evict, NULL); + winner = (sa_handle_t *) + dmu_buf_set_user_ie(db, &handle->db_evict); + } - if (newhandle != NULL) { + if (winner != NULL) { kmem_cache_free(sa_cache, handle); - handle = newhandle; + handle = winner; } } *handlepp = handle; @@ -1888,8 +1893,10 @@ void sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl) { - (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus, - oldhdl, newhdl, NULL, sa_evict); + dmu_buf_t *db = newhdl->sa_bonus; + + dmu_buf_init_user(&newhdl->db_evict, sa_evict, NULL); + (void) dmu_buf_update_user(db, &oldhdl->db_evict, &newhdl->db_evict); oldhdl->sa_bonus = NULL; } --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ /** @@ -696,6 +697,20 @@ offsetof(spa_error_entry_t, se_avl)); } +static void +spa_zio_thread_init(void *context __unused) +{ + + VERIFY(0 == dmu_thread_context_create()); +} + +static void +spa_zio_thread_destroy(void *context) +{ + + dmu_thread_context_destroy(context/*NOTUSED*/); +} + static taskq_t * spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, uint_t value) @@ -739,7 +754,7 @@ } #endif return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, - spa->spa_proc, flags)); + spa->spa_proc, flags, spa_zio_thread_init, spa_zio_thread_destroy)); } static void @@ -1383,7 +1398,7 @@ packed = kmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); @@ -2468,7 +2483,7 @@ { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; - int error; + int error = 0; int locked = B_FALSE; int firstopen = B_FALSE; @@ -2570,15 +2585,23 @@ mutex_exit(&spa_namespace_lock); #ifdef __FreeBSD__ #ifdef _KERNEL - if (firstopen) - zvol_create_minors(pool); + if (firstopen) { + /* + * Don't pass up errors from here. The SPA was + * still created and we can't reasonably unwind it + * at this point. + */ + if (zvol_create_minors(pool)) + printf("ZFS WARNING: ZVOL device nodes for " + "pool %s could not be created\n", pool); + } #endif #endif } *spapp = spa; - return (0); + return (error); } int @@ -3577,13 +3600,23 @@ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); mutex_exit(&spa_namespace_lock); - spa_history_log_version(spa, LOG_POOL_IMPORT); #ifdef __FreeBSD__ #ifdef _KERNEL - zvol_create_minors(pool); + if (zvol_create_minors(pool)) { + /* + * Don't pass up errors from here. The SPA was + * still created and we can't reasonably unwind it + * at this point. + */ + printf("ZFS WARNING: Unable to create ZVOL block devices " + "for pool %s\n", pool); + } #endif #endif + + spa_history_log_version(spa, LOG_POOL_IMPORT); + return (0); } @@ -5141,6 +5174,8 @@ vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); + /* Tell userspace that the vdev is gone. 
*/ + zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c @@ -128,12 +128,12 @@ firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf, DMU_READ_PREFETCH)) != 0) + buf, DMU_CTX_FLAG_PREFETCH)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread, DMU_READ_PREFETCH)) != 0) + buf + firstread, DMU_CTX_FLAG_PREFETCH)) != 0) return (err); } @@ -415,10 +415,10 @@ } err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); if (leftover && err == 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len, DMU_READ_PREFETCH); + leftover, buf + read_len, DMU_CTX_FLAG_PREFETCH); } mutex_exit(&spa->spa_history_lock); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -271,6 +271,8 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0, "Try to recover from otherwise-fatal errors."); +SYSCTL_INT(_vfs_zfs, OID_AUTO, debug_flags, CTLFLAG_RW, &zfs_flags, 0, + "Debug flags for ZFS testing."); /* * ========================================================================== --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -338,7 +338,7 @@ mutex_exit(sm->sm_lock); error = dmu_read(os, smo->smo_object, offset, size, entry_map, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); mutex_enter(sm->sm_lock); if (error != 0) break; --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ #ifndef _SYS_ARC_H @@ -52,6 +53,7 @@ void *b_data; arc_evict_func_t *b_efunc; void *b_private; + void *b_last_dbuf; }; typedef enum arc_buf_contents { @@ -67,6 +69,7 @@ #define ARC_PREFETCH (1 << 3) /**< I/O is a prefetch */ #define ARC_CACHED (1 << 4) /**< I/O was already in cache */ #define ARC_L2CACHE (1 << 5) /**< cache in L2ARC */ +#define ARC_CACHED_ONLY (1 << 6) /**< cache lookup only */ /** * The following breakdows of arc_size exist for kstat only. 
@@ -92,16 +95,25 @@ int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); +arc_buf_t *arc_buf_find_bp(spa_t *spa, blkptr_t *bp, void *priv); int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, zbookmark_t *zb); int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); +boolean_t arc_buf_frozen(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif +static inline void +arc_discard_buf(arc_buf_t *buf, void *tag) +{ + arc_release(buf, tag); + VERIFY(arc_buf_remove_ref(buf, tag) == 1); +} + int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *priv, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. */ #ifndef _SYS_DBUF_H @@ -51,30 +52,190 @@ #define DB_RF_NOPREFETCH (1 << 3) #define DB_RF_NEVERWAIT (1 << 4) #define DB_RF_CACHED (1 << 5) +#define DB_RF_CACHED_ONLY (1 << 6) /** \} */ /** * The simplified state transition diagram for dbufs looks like: + * *\verbatim - +----> READ ----+ - | | - | V - (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - | ^ ^ - | | | - +----> FILL ----+ | - | | - | | - +--------> NOFILL -------+ + +-> PARTIAL_FILL <---> PARTIAL-+ + | | | + +---------->READ_FILL<----[----+ + | ^ | + | | | + | V | + +-----------> READ ------+[-------+ + | || | + | VV V + (alloc)-->UNCACHED----------------->FILL--->CACHED----> EVICTING-->(free) + | ^ + | | + +--------------------> NOFILL ------------------+ \endverbatim + * + * Reader State Transitions: + * UNCACHED -> READ: Access to a block that does not have an + * active dbuf. A read is issued to media + * upon an ARC or L2ARC miss. + * + * READ -> CACHED: Data satisfied from the ARC, L2ARC, or + * a read of the media. No writes occurred. + * + * PARTIAL -> READ: Access to a block that has been partially + * written but has yet to have the read + * needed to resolve the COW fault issued. + * The read is issued to media. The ARC and + * L2ARC are not involved since they were + * checked for a hit at the time of the first + * write to this buffer. + * + * Writer State Transitions: + * UNCACHED -> FILL: Access to a block that does not have an + * active dbuf. Writer is filling the entire + * block. + * + * UNCACHED -> PARTIAL_FILL: Access to a block that does not have an + * active dbuf. Writer is filling a portion + * of the block starting at the beginning or + * end. The read needed to resolve the COW + * fault is deferred until we see that the + * writer will not fill this whole buffer. + * + * UNCACHED -> READ_FILL: Access to a block that does not have an + * active dbuf. Writer is filling a portion + * of the block and we have enough information + * to expect that the buffer will not be fully + * written. The read needed to resolve the COW + * fault is issued asynchronously. + * + * READ -> READ_FILL: Access to a block that has an active dbuf + * and a read has already been issued for the + * original buffer contents. A COW fault may + * not have occurred, if the buffer was not + * already dirty. Writer is filling a portion + * of the buffer. 
+ *
+ * PARTIAL -> PARTIAL_FILL:     Access to a block that has an active dbuf
+ *                              with an outstanding COW fault. Writer is
+ *                              filling a portion of the block and we have
+ *                              enough information to expect that the buffer
+ *                              will eventually be fully written.
+ *
+ * PARTIAL -> READ_FILL:        Access to a block that has an active dbuf
+ *                              with an outstanding COW fault. Writer is
+ *                              filling a portion of the block and we have
+ *                              enough information to expect that the buffer
+ *                              will not be fully written, causing a read
+ *                              to be issued.
+ *
+ * PARTIAL -> FILL:             Access to a block that has an active dbuf
+ *                              with an outstanding COW fault. Writer is
+ *                              filling enough of the buffer to avoid the
+ *                              read for this fault entirely.
+ *
+ * READ -> FILL:                Access to a block that has an active dbuf
+ *                              with an outstanding COW fault, and a read
+ *                              has been issued. Writer is filling enough of
+ *                              the buffer to obsolete the read.
+ *
+ * I/O Complete Transitions:
+ * FILL -> CACHED:              The thread modifying the buffer has completed
+ *                              its work. The buffer can now be accessed by
+ *                              other threads.
+ *
+ * PARTIAL_FILL -> PARTIAL:     The write thread modifying the buffer has
+ *                              completed its work. The buffer can now be
+ *                              accessed by other threads. No read has been
+ *                              issued to resolve the COW fault.
+ *
+ * READ_FILL -> READ:           The write thread modifying the buffer has
+ *                              completed its work. The buffer can now be
+ *                              accessed by other threads. A read is
+ *                              outstanding to resolve the COW fault.
+ *
+ * The READ, PARTIAL_FILL, and READ_FILL states indicate the data associated
+ * with a dbuf is volatile and a new client must wait for the current consumer
+ * to exit the dbuf from that state prior to accessing the data.
+ *
+ * The PARTIAL_FILL, PARTIAL, READ_FILL, and READ states are used for
+ * deferring any reads required for resolution of Copy-On-Write faults.
+ * A PARTIAL dbuf has accumulated write data in its dirty records
+ * that must be merged into the existing data for the record once the
+ * record is read. A READ dbuf is a dbuf for which a synchronous or
+ * async read has been issued. If the dbuf has dirty records, this read
+ * is required to resolve the COW fault before those dirty records can be
+ * committed to disk. The FILL variants of these two states indicate that
+ * either new write data is being added to the dirty records for this dbuf,
+ * or the read has completed and the write and read data are being merged.
+ *
+ * Writers must block on dbufs in any of the FILL states.
+ *
+ * Synchronous readers must block on dbufs in the READ state and any
+ * of the FILL states. Further, a reader must transition a dbuf from the
+ * UNCACHED or PARTIAL state to the READ state by issuing a read, before
+ * blocking.
+ *
+ * The transition from PARTIAL to READ is also triggered by writers that
+ * perform a discontiguous write to the buffer, meaning that there is
+ * little chance for a later writer to completely fill the buffer.
+ * Since the read cannot be avoided, it is issued immediately.
  */
 typedef enum dbuf_states {
-	DB_UNCACHED,
-	DB_FILL,
-	DB_NOFILL,
-	DB_READ,
-	DB_CACHED,
-	DB_EVICTING
+	/**
+	 * Dbuf has no valid data.
+	 */
+	DB_UNCACHED = 0x01,
+
+	/**
+	 * The Dbuf's contents are being modified by an active thread.
+	 * This state can be combined with PARTIAL or READ. When
+	 * just in the DB_FILL state, the entire buffer's contents are
+	 * being supplied by the writer. When combined with the other
+	 * states, the buffer is only being partially dirtied.
+ */ + DB_FILL = 0x02, + + /** + * Dbuf has been partially dirtied by writers. No read has been + * issued to resolve the COW fault. + */ + DB_PARTIAL = 0x04, + + /** + * A NULL DBuf associated with swap backing store. + */ + DB_NOFILL = 0x08, + + /** + * A read has been issued for an uncached buffer with no + * outstanding dirty data (i.e. Not PARTIAL). + */ + DB_READ = 0x10, + + /** + * The entire contents of this dbuf are valid. The buffer + * may still be dirty. + */ + DB_CACHED = 0x20, + + /** + * The Dbuf is in the process of being freed. + */ + DB_EVICTING = 0x40, + + /** + * Dbuf has been partially dirtied by writers and a + * thread is actively modifying the dbuf. + */ + DB_PARTIAL_FILL = DB_PARTIAL|DB_FILL, + + /** + * Dbuf has been partially dirtied by writers, a read + * has been issued to resolve the COW fault, and a + * thread is actively modifying the dbuf. + */ + DB_READ_FILL = DB_READ|DB_FILL } dbuf_states_t; struct dnode; @@ -102,6 +263,34 @@ * have child DRs, each associated with its child dbufs. Finally, the leaf * DRs contain the ARC buffer containing the data to be written. */ +typedef struct dbuf_dirty_indirect_record { + kmutex_t dr_mtx; /* Protects the children. */ + list_t dr_children; /* List of our dirty children. */ +} dbuf_dirty_indirect_record_t; + +typedef struct dbuf_dirty_leaf_record { + /* + * dr_data is set when we dirty the buffer so that we can retain the + * pointer even if it gets COW'd in a subsequent transaction group. + */ + arc_buf_t *dr_data; + blkptr_t dr_overridden_by; + override_states_t dr_override_state; + uint8_t dr_copies; + + /* + * List of the ranges that dr_data's contents are valid for. + * Used when not all of dr_data is valid, as it may be if writes + * only cover part of it, and no read has filled in the gaps yet. + */ + list_t write_ranges; +} dbuf_dirty_leaf_record_t; + +typedef union dbuf_dirty_record_types { + struct dbuf_dirty_indirect_record di; + struct dbuf_dirty_leaf_record dl; +} dbuf_dirty_record_types_t; + typedef struct dbuf_dirty_record { /** link on our parents dirty list */ list_node_t dr_dirty_node; @@ -115,36 +304,25 @@ /** pointer back to our dbuf */ struct dmu_buf_impl *dr_dbuf; - /** pointer to next dirty record */ - struct dbuf_dirty_record *dr_next; + /** list link for dbuf dirty records */ + list_node_t db_dirty_record_link; /** pointer to parent dirty record */ struct dbuf_dirty_record *dr_parent; - union dirty_types { - struct dirty_indirect { + union dbuf_dirty_record_types dt; +} dbuf_dirty_record_t; - /** protect access to list */ - kmutex_t dr_mtx; +typedef struct dbuf_dirty_range { + list_node_t write_range_link; + int start; + int end; + int size; +} dbuf_dirty_range_t; - /** Our list of dirty children */ - list_t dr_children; - } di; - struct dirty_leaf { +struct dbuf_array; +struct dmu_buf_impl; - /** - * dr_data is set when we dirty the buffer - * so that we can retain the pointer even if it - * gets COW'd in a subsequent transaction group. - */ - arc_buf_t *dr_data; - blkptr_t dr_overridden_by; - override_states_t dr_override_state; - uint8_t dr_copies; - } dl; - } dt; -} dbuf_dirty_record_t; - typedef struct dmu_buf_impl { /* * The following members are immutable, with the exception of @@ -215,8 +393,14 @@ kcondvar_t db_changed; dbuf_dirty_record_t *db_data_pending; - /** pointer to most recent dirty record for this buffer */ - dbuf_dirty_record_t *db_last_dirty; + /** List of dirty records for the buffer sorted newest to oldest. 
*/ + list_t db_dirty_records; + + /** + * List of DMU buffer sets dependent on this dbuf. + * See dmu_context_node_t, the indirect list entry structure used. + */ + list_t db_dmu_buf_sets; /** * Our link on the owner dnodes's dn_dbufs list. @@ -226,15 +410,14 @@ /** Data which is unique to data (leaf) blocks: */ - /** stuff we store for the user (see dmu_buf_set_user) */ - void *db_user_ptr; - void **db_user_data_ptr_ptr; - dmu_buf_evict_func_t *db_evict_func; + /** User callback information. See dmu_buf_set_user(). */ + dmu_buf_user_t *db_user; uint8_t db_immediate_evict; uint8_t db_freed_in_flight; uint8_t db_dirtycnt; + } dmu_buf_impl_t; #define DBUF_MUTEXES 256 @@ -246,6 +429,28 @@ kmutex_t hash_mutexes[DBUF_MUTEXES]; } dbuf_hash_table_t; +typedef struct dmu_context_node { + + /** This entry's link in the list. */ + list_node_t dcn_link; + + /** This entry's buffer set pointer. */ + dmu_buf_set_t *buf_set; + +} dmu_context_node_t; + +void dmu_context_node_add(list_t *list, dmu_buf_set_t *buf_set); +void dmu_context_node_remove(list_t *list, dmu_context_node_t *dcn); + +/** + * \brief Thread-specific DMU callback state for processing async I/O's. + */ +typedef struct dmu_cb_state { + + /** The list of IOs that are ready to be processed. */ + list_t io_list; + +} dmu_cb_state_t; uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); @@ -260,7 +465,7 @@ dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, void *tag); int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, - void *tag, dmu_buf_impl_t **dbp); + void *tag, dmu_buf_impl_t **dbp, dmu_buf_set_t *buf_set); void dbuf_prefetch(struct dnode *dn, uint64_t blkid); @@ -274,6 +479,9 @@ int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_will_dirty_range(dmu_buf_impl_t *db, dmu_tx_t *tx, int offset, + int size); +void dbuf_transition_to_read(dmu_buf_impl_t *db); void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); @@ -282,8 +490,8 @@ dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); -void dbuf_clear(dmu_buf_impl_t *db); -void dbuf_evict(dmu_buf_impl_t *db); +void dbuf_clear(dmu_buf_impl_t *db, list_t *evict_list); +void dbuf_evict(dmu_buf_impl_t *db, list_t *evict_list); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); @@ -317,6 +525,7 @@ void dbuf_init(void); void dbuf_fini(void); +void dbuf_dirty_record_cleanup_ranges(dbuf_dirty_record_t *dr); boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); @@ -353,8 +562,8 @@ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ (u_longlong_t)__db_obj); \ dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ - "obj=%s lvl=%u blkid=%lld " fmt, \ - __db_buf, (dbuf)->db_level, \ + "ptr=%p arc=%p obj=%s lvl=%u blkid=%lld " fmt, \ + dbuf, (dbuf)->db_buf, __db_buf, (dbuf)->db_level, \ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ } \ _NOTE(CONSTCOND) } while (0) --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2011-2012 Spectra Logic Corporation. All rights reserved. 
*/ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. @@ -40,6 +41,7 @@ * dmu_spa.h. */ +#include #include #include #include @@ -178,11 +180,11 @@ /** * \brief Artificial blkid for bonus blocks */ -#define DMU_BONUS_BLKID (-1ULL) +#define DMU_BONUS_BLKID (ULLONG_MAX) /** * \brief Artificial blkid for spill blocks */ -#define DMU_SPILL_BLKID (-2ULL) +#define DMU_SPILL_BLKID (ULLONG_MAX - 1) /* * Public routines to create, destroy, open, and close objsets. */ @@ -217,7 +219,223 @@ void *db_data; /**< data in buffer */ } dmu_buf_t; -typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); +/** + * \brief These structures are for DMU consumers that want async + * callbacks. + */ +struct dmu_context; +struct dmu_buf_set; +struct zio; +typedef void (*dmu_context_callback_t)(struct dmu_context *); +typedef void (*dmu_buf_set_callback_t)(struct dmu_buf_set *); +typedef void (*dmu_buf_transfer_callback_t)(struct dmu_buf_set *, dmu_buf_t *, + uint64_t, uint64_t); + +typedef struct dmu_context { + + /** The primary data associated with this context. */ + uint64_t size; /**< Requested total I/O size. */ + uint64_t resid; /**< Remaining bytes to process. */ + uint64_t dn_start; /**< Starting block offset into the dnode. */ + uint64_t dn_offset; /**< Current block offset. */ + dmu_tx_t *tx; /**< Caller's transaction, if specified. */ + void *data_buf; /**< UIO or char pointer */ + + /** The dnode held in association with this context. */ + struct dnode *dn; + objset_t *os; /**< Object set associated with the dnode. */ + uint64_t object; /**< Object ID associated with the dnode. */ + + /** Number of buffer sets left to complete. */ + int holds; + + /** The tag used for this context. */ + void *tag; + + /** The callback to call once an I/O completes entirely. */ + dmu_context_callback_t context_cb; + + /** The callback to call to transfer a buffer set. */ + dmu_buf_set_callback_t buf_set_transfer_cb; + + /** The callback to call to transfer a buffer. */ + dmu_buf_transfer_callback_t buf_transfer_cb; + + /** + * The callback to call to move a specific block's contents. This + * is normally only set by dmu_context_init(). + */ + dmu_buf_transfer_callback_t move_cb; + + /** Total number of bytes transferred. */ + uint64_t completed_size; + + /** Flags for this DMU context. */ + uint32_t flags; +#define DMU_CTX_FLAG_READ (1 << 1) +#define DMU_CTX_FLAG_UIO (1 << 2) +#define DMU_CTX_FLAG_PREFETCH (1 << 3) +#define DMU_CTX_FLAG_NO_HOLD (1 << 4) +#define DMU_CTX_FLAG_SUN_PAGES (1 << 5) +#define DMU_CTX_FLAG_NOFILL (1 << 6) +#define DMU_CTX_FLAG_ASYNC (1 << 7) + +#define DMU_CTX_WRITER_FLAGS (DMU_CTX_FLAG_SUN_PAGES|DMU_CTX_FLAG_NOFILL) +#define DMU_CTX_READER_FLAGS (DMU_CTX_FLAG_PREFETCH) + +#define DMU_CTX_BUF_IS_CHAR(dmu_ctx) \ + (((dmu_ctx)->flags & (DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_SUN_PAGES)) == 0) + + /** The number of errors that occurred. */ + int err; + +} dmu_context_t; + +typedef struct dmu_buf_set { + + /** The DMU context that this buffer set is associated with. */ + dmu_context_t *dmu_ctx; + + /** Number of dmu_bufs associated with this context. */ + int count; + + /** Length of dbp; only used to free the correct size. */ + int dbp_length; + + /** Number of dmu_bufs left to complete. */ + int holds; + + /** The starting offset, relative to the associated dnode. */ + uint64_t dn_start; + /** The size of the I/O. */ + uint64_t size; + /** The amount of data remaining to process for this buffer set. 
*/ + uint64_t resid; + + /** For writes only, if the context doesn't have a transaction. */ + dmu_tx_t *tx; +#define DMU_BUF_SET_TX(buf_set) \ + ((buf_set)->dmu_ctx->tx ? (buf_set)->dmu_ctx->tx : (buf_set)->tx) + + /** The number of errors that occurred. */ + int err; + + /** The ZIO associated with this context. */ + struct zio *zio; + + /** The set of buffers themselves. */ + struct dmu_buf *dbp[0]; + +} dmu_buf_set_t; + +void dmu_buf_set_rele(dmu_buf_set_t *buf_set, boolean_t err); +int dmu_context_init(dmu_context_t *dmu_ctx, struct dnode *dn, objset_t *os, + uint64_t object, uint64_t offset, uint64_t size, void *data_buf, void *tag, + uint32_t flags); +void dmu_context_seek(dmu_context_t *dmu_ctx, uint64_t offset, uint64_t size, + void *data_buf); +void dmu_context_rele(dmu_context_t *dmu_ctx); +void dmu_buf_set_transfer(dmu_buf_set_t *buf_set); +void dmu_buf_set_transfer_write(dmu_buf_set_t *buf_set); + +/* Optional context setters; use after calling dmu_context_init*(). */ +static inline void +dmu_context_set_context_cb(dmu_context_t *ctx, dmu_context_callback_t cb) +{ + ctx->context_cb = cb; +} +static inline void +dmu_context_set_buf_set_transfer_cb(dmu_context_t *ctx, + dmu_buf_set_callback_t cb) +{ + ctx->buf_set_transfer_cb = cb; +} +static inline void +dmu_context_set_buf_transfer_cb(dmu_context_t *ctx, + dmu_buf_transfer_callback_t cb) +{ + ctx->buf_transfer_cb = cb; +} +static inline void +dmu_context_set_dmu_tx(dmu_context_t *ctx, dmu_tx_t *tx) +{ + ASSERT(tx != NULL && ((ctx->flags & DMU_CTX_FLAG_READ) == 0)); + dmu_context_set_buf_set_transfer_cb(ctx, dmu_buf_set_transfer); + ctx->tx = tx; +} + +/* DMU thread context handlers. */ +int dmu_thread_context_create(void); +void dmu_thread_context_process(void); +void dmu_thread_context_destroy(void *); + +struct dmu_buf_user; + +typedef void dmu_buf_evict_func_t(struct dmu_buf_user *); + +/** + * The DMU buffer user eviction data container. This is used to allow users + * of a dbuf to register with the dbuf layer that they wish to be notified + * when the backing store for a dbuf they're using has been evicted. The + * backing store is an ARC buffer that corresponds to the transaction group + * that the user is currently operating in. + * + * Whenever a dirty record's ARC buffer is removed, the context in which the + * removal occurs must queue an user eviction. This queue must then be + * processed while not holding any dbuf locks. In this way, the user can + * perform any work needed in their eviction function. + * + * Implementation Note: Users are expected to allocate and free space for + * this structure. Consequently, if any additional context is needed, another + * struct that includes this one at the start should be passed in. + */ +typedef struct dmu_buf_user { + /** + * This instance's link in the eviction queue. Set when the buffer + * has evicted and the callback needs to be called. + */ + list_node_t evict_queue_link; + /** This instance's eviction function pointer. */ + dmu_buf_evict_func_t *evict_func; + /** Location that db_data, when updated, should be copied to. */ + void **user_data_ptr_ptr; +} dmu_buf_user_t; + +/** Initialization routine for dmu_buf_user_t instances. */ +static inline void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, + void **user_data_ptr_ptr) +{ + list_link_init(&dbu->evict_queue_link); + dbu->evict_func = evict_func; + dbu->user_data_ptr_ptr = user_data_ptr_ptr; +} + +/** DMU buffer user eviction routines. 
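+ *
+ * An illustrative calling pattern (a sketch, not taken from the original
+ * documentation): a path that may release dbufs builds a local eviction
+ * list, hands it to dbuf_clear()/dbuf_evict(), and only tears it down
+ * after all dbuf locks have been dropped:
+ *
+ *	list_t evict_list;
+ *
+ *	dmu_buf_create_user_evict_list(&evict_list);
+ *	... operate on dbufs; evictions queue onto evict_list ...
+ *	dmu_buf_destroy_user_evict_list(&evict_list);
+ *
+ * dmu_buf_destroy_user_evict_list() invokes any queued evict_func
+ * callbacks via dmu_buf_process_user_evicts() before destroying the list.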
*/ +static inline void +dmu_buf_create_user_evict_list(list_t *evict_list_p) +{ + list_create(evict_list_p, sizeof(dmu_buf_user_t), + offsetof(dmu_buf_user_t, evict_queue_link)); +} +static inline void +dmu_buf_process_user_evicts(list_t *evict_list_p) +{ + dmu_buf_user_t *dbu, *next; + + for (dbu = (dmu_buf_user_t *)list_head(evict_list_p); dbu != NULL; + dbu = next) { + next = (dmu_buf_user_t *)list_next(evict_list_p, dbu); + list_remove(evict_list_p, dbu); + dbu->evict_func(dbu); + } +} +static inline void +dmu_buf_destroy_user_evict_list(list_t *evict_list_p) +{ + dmu_buf_process_user_evicts(evict_list_p); + list_destroy(evict_list_p); +} /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. @@ -293,18 +511,17 @@ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); -void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *pageout_func); -void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, - void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); -void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, - void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *pageout_func); +dmu_buf_user_t *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); +dmu_buf_user_t *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); +dmu_buf_user_t *dmu_buf_update_user(dmu_buf_t *db, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user); void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); -void *dmu_buf_get_user(dmu_buf_t *db); +dmu_buf_user_t *dmu_buf_get_user(dmu_buf_t *db); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_dirty_range(dmu_buf_t *db, dmu_tx_t *tx, int offset, + int size); boolean_t dmu_buf_freeable(dmu_buf_t *); @@ -336,19 +553,23 @@ uint64_t size); int dmu_free_object(objset_t *os, uint64_t object); +void dmu_buf_cb_process(void); + /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. 
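+ *
+ * For larger or asynchronous transfers the dmu_context_t interface can be
+ * used instead of dmu_read()/dmu_write().  A minimal synchronous sketch
+ * (illustrative only; zfs_read() and the zvol code are real callers):
+ *
+ *	dmu_context_t dc;
+ *	int err;
+ *
+ *	err = dmu_context_init(&dc, /*dnode*/NULL, os, object, offset,
+ *	    size, buf, FTAG, DMU_CTX_FLAG_READ|DMU_CTX_FLAG_PREFETCH);
+ *	if (err == 0) {
+ *		err = dmu_issue(&dc);
+ *		dmu_context_rele(&dc);
+ *	}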
*/ +// XXX REMOVE THESE IN FAVOR OF DMU_CTX_FLAG_PREFETCH #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); +int dmu_issue(dmu_context_t *dmu_ctx); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); -void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, +int dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h @@ -130,14 +130,14 @@ int dmu_tx_private_ok(dmu_tx_t *tx); void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); -void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); +void dmu_tx_verify_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); #ifdef ZFS_DEBUG -#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) +#define DMU_TX_VERIFY_DIRTY_BUF(tx, db) dmu_tx_verify_dirty_buf(tx, db) #else -#define DMU_TX_DIRTY_BUF(tx, db) +#define DMU_TX_VERIFY_DIRTY_BUF(tx, db) #endif #ifdef __cplusplus --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012, Spectra Logic Corporation. All rights reserved. */ #ifndef _SYS_DNODE_H @@ -99,6 +100,10 @@ #define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) /** \} */ +/* Next level for a given dnode and txg */ +#define DN_NEXT_LEVEL(dn, txg) \ + (dn)->dn_next_nlevels[(txg) & TXG_MASK] + /* The +2 here is a cheesy way to round up */ #define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) @@ -278,6 +283,8 @@ } dnode_handle_t; typedef struct dnode_children { + /** Dbuf user eviction data for this instance. */ + dmu_buf_user_t db_evict; size_t dnc_count; /**< number of children */ dnode_handle_t dnc_children[1]; /**< sized dynamically */ } dnode_children_t; @@ -303,6 +310,7 @@ boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); +void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); @@ -327,6 +335,12 @@ int minlvl, uint64_t blkfill, uint64_t txg); void dnode_evict_dbufs(dnode_t *dn); +#define DNODE_VERIFY_DIRTYCTX(dn, tx) \ + ASSERT((dn)->dn_object == DMU_META_DNODE_OBJECT || \ + (dn)->dn_dirtyctx == DN_UNDIRTIED || \ + (dn)->dn_dirtyctx == \ + (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)) + #ifdef ZFS_DEBUG /* --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -107,6 +107,9 @@ } dsl_dataset_phys_t; typedef struct dsl_dataset { + /** Dbuf user eviction data for this instance. */ + dmu_buf_user_t db_evict; + /** * \name Immutable * \{ */ --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h @@ -76,6 +76,9 @@ } dsl_dir_phys_t; struct dsl_dir { + /** Dbuf user eviction data for this instance. */ + dmu_buf_user_t db_evict; + /** * \name These are immutable; no lock needed * \{ */ --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h @@ -40,7 +40,7 @@ * particular object, use FTAG (which is a string) for the holder_tag. * Otherwise, use the object that holds the reference. */ -#define FTAG ((char *)__func__) +#define FTAG ((char *)(uintptr_t)__func__) #ifdef ZFS_DEBUG typedef struct reference { --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h @@ -206,6 +206,8 @@ * This needs to be kept as small as possible. */ struct sa_handle { + /** Dbuf user eviction data for this instance. */ + dmu_buf_user_t db_evict; kmutex_t sa_lock; dmu_buf_t *sa_bonus; dmu_buf_t *sa_spill; --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -343,9 +343,11 @@ extern void vdev_set_min_asize(vdev_t *vd); /* - * zdb uses this tunable, so it must be declared here to make lint happy. + * Global variables */ +/* zdb uses this tunable, so it must be declared here to make lint happy. */ extern int zfs_vdev_cache_size; +extern uint_t zfs_geom_probe_vdev; #ifdef __cplusplus } --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h @@ -138,6 +138,8 @@ typedef struct zap_table_phys zap_table_phys_t; typedef struct zap { + /** Dbuf user eviction data for this instance. */ + dmu_buf_user_t db_evict; objset_t *zap_objset; uint64_t zap_object; struct dmu_buf *zap_dbuf; @@ -182,7 +184,7 @@ int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); -void zap_evict(dmu_buf_t *db, void *vmzap); +void zap_evict(dmu_buf_user_t *dbu); zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h @@ -156,6 +156,8 @@ } zap_leaf_chunk_t; typedef struct zap_leaf { + /** Dbuf user eviction data for this instance. 
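+	 *
+	 * Keeping this member first allows the eviction callback to recover
+	 * the containing zap_leaf_t with a simple cast of the dmu_buf_user_t
+	 * pointer (see zap_leaf_pageout()); the other dbuf users converted
+	 * in this change follow the same convention.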
*/ + dmu_buf_user_t db_evict; krwlock_t l_rwlock; uint64_t l_blkid; /**< 1<io_error = (error); \ + if (error != 0 && error != ECKSUM) { \ + dprintf("zio %p error %d at %s:%d\n", \ + zio, (error), __FILE__, __LINE__); \ + (zio)->io_last_errno.err = error; \ + (zio)->io_last_errno.filename = __FILE__; \ + (zio)->io_last_errno.lineno = __LINE__; \ + } \ +} while (0) +#else +#define ZIO_SET_ERROR(zio, error) (zio)->io_error = error +#endif /* ZFS_DEBUG */ + extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *priv, enum zio_flag flags); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -863,7 +863,7 @@ uint64_t object = 0; error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); if (error) return (error); if (object != 0) { --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -66,6 +66,13 @@ SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW, &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); +/** + * Thread local storage used to indicate when a thread is probing geoms + * for their guids. If NULL, this thread is not tasting geoms. If non NULL, + * it is looking for a replacement for the vdev_t* that is its value. + */ +uint_t zfs_geom_probe_vdev; + static void vdev_geom_orphan(struct g_consumer *cp) { @@ -93,7 +100,6 @@ * async removal support to invoke a close on this * vdev once it is safe to do so. */ - zfs_post_remove(vd->vdev_spa, vd); vd->vdev_remove_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); } @@ -374,9 +380,8 @@ static void vdev_geom_taste_orphan(struct g_consumer *cp) { - - KASSERT(1 == 0, ("%s called while tasting %s.", __func__, - cp->provider->name)); + ZFS_LOG(0, "WARNING: Orphan %s while tasting its VDev GUID.", + cp->provider->name); } static struct g_consumer * @@ -392,7 +397,6 @@ g_topology_assert(); zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste"); - /* This orphan function should be never called. */ zgp->orphan = vdev_geom_taste_orphan; zcp = g_new_consumer(zgp); @@ -535,6 +539,9 @@ g_topology_lock(); error = 0; + /* Set the TLS to indicate downstack that we should not access zvols*/ + VERIFY(tsd_set(zfs_geom_probe_vdev, vd) == 0); + /* * Try using the recorded path for this device, but only * accept it if its label data contains the expected GUIDs. @@ -569,6 +576,9 @@ cp = vdev_geom_open_by_path(vd, 0); } + /* Clear the TLS now that tasting is done */ + VERIFY(tsd_set(zfs_geom_probe_vdev, NULL) == 0); + if (cp == NULL) { ZFS_LOG(1, "Provider %s not found.", vd->vdev_path); error = ENOENT; --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -384,19 +384,38 @@ uint64_t nparity) { raidz_map_t *rm; + /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> unit_shift; /* The zio's size in units of the vdev's preferred sector size */ uint64_t s = zio->io_size >> unit_shift; + /* The first column for this stripe. */ uint64_t f = b % dcols; + /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + /* + * The number of sectors for this stripe on all but the "remnant" + * child vdev. 
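+	 *
+	 * As a worked example (hypothetical numbers, for illustration
+	 * only): with dcols = 5, nparity = 1 and s = 10 sectors,
+	 * q = 10 / 4 = 2, r = 2, bc = 3 and tot = 13; r, bc and tot
+	 * are derived below.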
+ */ q = s / (dcols - nparity); + + /* + * The number of "remnant" sectors in this I/O. This will add a + * sector to some, but not all, child vdevs. + */ r = s - q * (dcols - nparity); + + /* The number of "bonus columns" - those which contain remnant data. */ bc = (r == 0 ? 0 : r + nparity); + + /* The total number of sectors associated with this I/O. */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); if (q == 0) { + /* + * Our I/O request doesn't span all child vdevs. + */ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -52,7 +52,6 @@ int fzap_default_block_shift = 14; /* 16k blocksize */ -static void zap_leaf_pageout(dmu_buf_t *db, void *vl); static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); @@ -82,8 +81,10 @@ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; - (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, - &zap->zap_f.zap_phys, zap_evict); + dmu_buf_init_user(&zap->db_evict, zap_evict, + (void **)&zap->zap_f.zap_phys); + (void) dmu_buf_update_user(zap->zap_dbuf, &zap->db_evict, + &zap->db_evict); mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1; @@ -389,10 +390,19 @@ return (newblk); } +static void +zap_leaf_pageout(dmu_buf_user_t *dbu) +{ + zap_leaf_t *l = (zap_leaf_t *)dbu; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { - void *winner; + zap_leaf_t *winner; zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -406,7 +416,8 @@ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); - winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); + dmu_buf_init_user(&l->db_evict, zap_leaf_pageout, (void **)&l->l_phys); + winner = (zap_leaf_t *)dmu_buf_set_user(l->l_dbuf, &l->db_evict); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); @@ -438,16 +449,6 @@ dmu_buf_rele(l->l_dbuf, NULL); } -_NOTE(ARGSUSED(0)) -static void -zap_leaf_pageout(dmu_buf_t *db, void *vl) -{ - zap_leaf_t *l = vl; - - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { @@ -463,12 +464,13 @@ l->l_dbuf = db; l->l_phys = NULL; - winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout); + dmu_buf_init_user(&l->db_evict, zap_leaf_pageout, (void **)&l->l_phys); + winner = (zap_leaf_t *)dmu_buf_set_user(db, &l->db_evict); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ - zap_leaf_pageout(NULL, l); + zap_leaf_pageout(&l->db_evict); l = winner; } @@ -517,7 +519,7 @@ ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); - l = dmu_buf_get_user(db); + l = (zap_leaf_t *)dmu_buf_get_user(db); if (l == NULL) l = zap_open_leaf(blkid, db); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -390,7 +390,9 @@ * it, because zap_lockdir() checks zap_ismicro without the lock * held. 
*/ - winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); + dmu_buf_init_user(&zap->db_evict, zap_evict, + (void **)&zap->zap_m.zap_phys); + winner = (zap_t *)dmu_buf_set_user(db, &zap->db_evict); if (winner != NULL) { rw_exit(&zap->zap_rwlock); @@ -476,7 +478,7 @@ } #endif - zap = dmu_buf_get_user(db); + zap = (zap_t *)dmu_buf_get_user(db); if (zap == NULL) zap = mzap_open(os, obj, db); @@ -710,11 +712,10 @@ return (dmu_object_free(os, zapobj, tx)); } -_NOTE(ARGSUSED(0)) void -zap_evict(dmu_buf_t *db, void *vzap) +zap_evict(dmu_buf_user_t *dbu) { - zap_t *zap = vzap; + zap_t *zap = (zap_t *)dbu; rw_destroy(&zap->zap_rwlock); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c @@ -1118,7 +1118,7 @@ if (znode_acl.z_acl_extern_obj) { error = dmu_read(zp->z_zfsvfs->z_os, znode_acl.z_acl_extern_obj, 0, aclnode->z_size, - aclnode->z_acldata, DMU_READ_PREFETCH); + aclnode->z_acldata, DMU_CTX_FLAG_PREFETCH); } else { bcopy(znode_acl.z_ace_data, aclnode->z_acldata, aclnode->z_size); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c @@ -134,7 +134,7 @@ packed = kmem_alloc(fuid_size, KM_SLEEP); VERIFY(dmu_read(os, fuid_obj, 0, - fuid_size, packed, DMU_READ_PREFETCH) == 0); + fuid_size, packed, DMU_CTX_FLAG_PREFETCH) == 0); VERIFY(nvlist_unpack(packed, fuid_size, &nvp, 0) == 0); VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -25,6 +25,7 @@ * Portions Copyright 2011 Martin Matuska * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2011-2012, Spectra Logic Corporation. All rights reserved. */ #include @@ -149,6 +150,7 @@ va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); + printf("%s", buf); /* * To get this data, use the zfs-dprintf probe as so: @@ -3073,17 +3075,16 @@ /* * It would be nice to do this atomically. 
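+	 * For now the sequence is: apply the properties, then (for ZVOLs
+	 * on FreeBSD) create the minor nodes, and destroy the objset if an
+	 * error remains at the end.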
*/ - if (error == 0) { + if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, nvprops, NULL); - if (error != 0) - (void) dmu_objset_destroy(zc->zc_name, B_FALSE); - } nvlist_free(nvprops); #ifdef __FreeBSD__ if (error == 0 && type == DMU_OST_ZVOL) - zvol_create_minors(zc->zc_name); + error = zvol_create_minors(zc->zc_name); #endif + if (error) + (void) dmu_objset_destroy(zc->zc_name, B_FALSE); return (error); } @@ -5311,6 +5312,8 @@ uint_t zfs_fsyncer_key; extern uint_t rrw_tsd_key; +extern uint_t zfs_async_io_key; +extern uint_t zfs_geom_probe_vdev; #ifdef sun int @@ -5331,6 +5334,7 @@ tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, NULL); + tsd_create(&zfs_async_io_key, dmu_thread_context_destroy); error = ldi_ident_from_mod(&modlinkage, &zfs_li); ASSERT(error == 0); @@ -5361,6 +5365,7 @@ (void) ddi_modclose(sharefs_mod); tsd_destroy(&zfs_fsyncer_key); + tsd_destroy(&zfs_async_io_key); ldi_ident_release(zfs_li); zfs_li = NULL; mutex_destroy(&zfs_share_lock); @@ -5392,6 +5397,8 @@ tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, NULL); + tsd_create(&zfs_async_io_key, dmu_thread_context_destroy); + tsd_create(&zfs_geom_probe_vdev, NULL); printf("ZFS storage pool version " SPA_VERSION_STRING "\n"); root_mount_rel(zfs_root_token); @@ -5412,6 +5419,7 @@ tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); + tsd_destroy(&zfs_async_io_key); mutex_destroy(&zfs_share_lock); break; --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c @@ -503,7 +503,7 @@ (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, - zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zp->z_id, off, len, lr + 1, /*flags*/0) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -413,7 +413,7 @@ va+off, tx); } else { (void) dmu_read(os, oid, start+off, nbytes, - va+off, DMU_READ_PREFETCH); + va+off, DMU_CTX_FLAG_PREFETCH); } zfs_unmap_page(sf); VM_OBJECT_LOCK(obj); @@ -465,7 +465,7 @@ VM_OBJECT_UNLOCK(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); @@ -569,10 +569,11 @@ znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; - ssize_t n, nbytes; + ssize_t n; int error; rl_t *rl; xuio_t *xuio = NULL; + dmu_context_t dmu_ctx; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -665,19 +666,27 @@ } #endif /* sun */ + error = dmu_context_init(&dmu_ctx, /*dnode*/NULL, os, zp->z_id, + uio->uio_loffset, n, uio, FTAG, + DMU_CTX_FLAG_READ|DMU_CTX_FLAG_UIO|DMU_CTX_FLAG_PREFETCH); + if (error) + goto out; + while (n > 0) { - nbytes = MIN(n, zfs_read_chunk_size - + ssize_t sz = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); #ifdef __FreeBSD__ if (uio->uio_segflg == UIO_NOCOPY) - error = mappedread_sf(vp, nbytes, uio); + error = mappedread_sf(vp, sz, uio); else #endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) - error = mappedread(vp, nbytes, uio); - else - error = dmu_read_uio(os, zp->z_id, uio, nbytes); + error = mappedread(vp, sz, uio); + else { + dmu_context_seek(&dmu_ctx, uio->uio_loffset, sz, uio); + error = 
dmu_issue(&dmu_ctx); + } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -685,8 +694,9 @@ break; } - n -= nbytes; + n -= sz; } + dmu_context_rele(&dmu_ctx); out: zfs_range_unlock(rl); @@ -1152,7 +1162,7 @@ error = ENOENT; } else { error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); + /*flags*/0); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ @@ -4624,7 +4634,7 @@ ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); + DMU_CTX_FLAG_PREFETCH); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ @@ -5528,7 +5538,7 @@ VM_OBJECT_UNLOCK(object); va = zfs_map_page(mreq, &sf); error = dmu_read(os, zp->z_id, IDX_TO_OFF(mreq->pindex), - size, va, DMU_READ_PREFETCH); + size, va, DMU_CTX_FLAG_PREFETCH); if (size != PAGE_SIZE) bzero(va + size, PAGE_SIZE - size); zfs_unmap_page(sf); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -1166,7 +1166,7 @@ return (EINVAL); } - hdl = dmu_buf_get_user(db); + hdl = (sa_handle_t *)dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -1265,17 +1265,19 @@ boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); - return; + break; } zio->io_stage = stage; rv = zio_pipeline[highbit(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) - return; + break; ASSERT(rv == ZIO_PIPELINE_CONTINUE); } + /* Process any deferred events placed on this thread's list. */ + dmu_thread_context_process(); } /** --- old/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ new/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -23,6 +23,8 @@ * * Copyright (c) 2006-2010 Pawel Jakub Dawidek * All rights reserved. + * + * Copyright (c) 2011-2012, Spectra Logic Corporation. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -51,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -88,6 +91,25 @@ .version = G_VERSION, }; +static d_open_t zvol_open; +static d_close_t zvol_close; +static d_strategy_t zvol_strategy; +static d_read_t zvol_freebsd_read; +static d_write_t zvol_freebsd_write; +static d_ioctl_t zvol_freebsd_ioctl; + +struct cdevsw zfs_zvol_cdevsw = { + .d_version = D_VERSION, + .d_flags = D_DISK|D_TRACKCLOSE, + .d_name = "zvol", + .d_open = zvol_open, + .d_close = zvol_close, + .d_strategy = zvol_strategy, + .d_read = zvol_freebsd_read, + .d_write = zvol_freebsd_write, + .d_ioctl = zvol_freebsd_ioctl, +}; + DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); /** @@ -117,6 +139,7 @@ uint64_t zv_volsize; /**< amount of space we advertise */ uint64_t zv_volblocksize; /**< volume block size */ struct g_provider *zv_provider; /**< GEOM provider */ + struct cdev *zv_dev; /**< DEVFS device */ uint8_t zv_min_bs; /**< minimum addressable block shift */ uint8_t zv_flags; /**< readonly, dumpified, etc. 
*/ objset_t *zv_objset; /**< objset handle */ @@ -151,7 +174,6 @@ static int zvol_dump_fini(zvol_state_t *zv); static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); -static zvol_state_t *zvol_geom_create(const char *name); static void zvol_geom_run(zvol_state_t *zv); static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); @@ -471,8 +493,96 @@ mutex_exit(&spa_namespace_lock); return (zv ? 0 : -1); } + +static int +zvol_create_minor_sun(zvol_state_t **zvp, const char *name) +{ + int error = 0; + boolean_t minor_created = B_FALSE; + minor_t minor; + + if ((minor = zfsdev_minor_alloc()) == 0) + return (ENXIO); + + if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) + return (EAGAIN); + (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, + (char *)name); + + (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor); + + if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, + minor, DDI_PSEUDO, 0) == DDI_FAILURE) { + error = EAGAIN; + goto out; + } + + (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor); + + if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, + minor, DDI_PSEUDO, 0) == DDI_FAILURE) { + error = EAGAIN; + goto out; + } + minor_created = B_TRUE; + + zs = ddi_get_soft_state(zfsdev_state, minor); + zs->zss_type = ZSST_ZVOL; + zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); + if (zv == NULL) + error = ENOMEM; + else + *zvp = zv; + +out: + if (error) { + if (minor_created) + ddi_remove_minor_node(zfs_dip, chrbuf); + ddi_soft_state_free(zfsdev_state, minor); + } + return (error); +} #endif /* sun */ +#ifdef __FreeBSD__ +static int +zvol_create_minor_freebsd(zvol_state_t **zvp, const char *name) +{ + struct g_provider *pp; + struct g_geom *gp; + zvol_state_t *zv; + struct cdev *zv_dev; + int error = 0; + + error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &zv_dev, + &zfs_zvol_cdevsw, NULL, UID_ROOT, GID_OPERATOR, 0600, "%s/%s", + ZVOL_DRIVER, name); + if (error) { + printf("ZFS: ZVOL '%s': Could not create device node\n", name); + return (error); + } + + zv = kmem_zalloc(sizeof(*zv), KM_SLEEP); + gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); + gp->start = zvol_geom_start; + gp->access = zvol_geom_access; + pp = g_new_providerf(gp, "g%s/%s", ZVOL_DRIVER, name); + pp->sectorsize = DEV_BSIZE; + zv->zv_provider = pp; + zv->zv_state = 0; + zv->zv_dev = zv_dev; + bioq_init(&zv->zv_queue); + mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF); + + /* Provide both GEOM and the block device with its zvol state. */ + pp->private = zv; + zv->zv_dev->si_drv1 = zv; + *zvp = zv; + + return (0); +} +#endif + /** * \brief Create a minor node (plus a whole lot more) for the specified volume. 
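+ *
+ * The platform-specific node creation now lives in zvol_create_minor_sun()
+ * and zvol_create_minor_freebsd() above; this function performs the shared
+ * setup and unwinds through its out: label on error.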
*/ @@ -481,7 +591,7 @@ { zfs_soft_state_t *zs; zvol_state_t *zv; - objset_t *os; + objset_t *os = NULL; dmu_object_info_t doi; int error; @@ -497,56 +607,19 @@ /* lie and say we're read-only */ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os); - if (error) { - mutex_exit(&spa_namespace_lock); - return (error); - } + if (error) + goto out; -#ifdef sun - if ((minor = zfsdev_minor_alloc()) == 0) { - dmu_objset_disown(os, FTAG); - mutex_exit(&spa_namespace_lock); - return (ENXIO); - } - - if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) { - dmu_objset_disown(os, FTAG); - mutex_exit(&spa_namespace_lock); - return (EAGAIN); - } - (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, - (char *)name); - - (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor); - - if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, - minor, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_soft_state_free(zfsdev_state, minor); - dmu_objset_disown(os, FTAG); - mutex_exit(&spa_namespace_lock); - return (EAGAIN); - } - - (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor); - - if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, - minor, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_remove_minor_node(zfs_dip, chrbuf); - ddi_soft_state_free(zfsdev_state, minor); - dmu_objset_disown(os, FTAG); - mutex_exit(&spa_namespace_lock); - return (EAGAIN); - } - - zs = ddi_get_soft_state(zfsdev_state, minor); - zs->zss_type = ZSST_ZVOL; - zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); -#else /* !sun */ - +#if defined(sun) + error = zvol_create_minor_sun(&zv, os, name); +#elif defined(__FreeBSD__) DROP_GIANT(); g_topology_lock(); - zv = zvol_geom_create(name); -#endif /* !sun */ + error = zvol_create_minor_freebsd(&zv, name); +#endif + + if (error) + goto out; (void) strlcpy(zv->zv_name, name, MAXPATHLEN); zv->zv_min_bs = DEV_BSHIFT; @@ -569,21 +642,30 @@ else zil_replay(os, zv, zvol_replay_vector); } - dmu_objset_disown(os, FTAG); - zv->zv_objset = NULL; + +out: + if (os != NULL) + dmu_objset_disown(os, FTAG); - zvol_minors++; + if (error == 0) { + zvol_minors++; + zv->zv_objset = NULL; + } mutex_exit(&spa_namespace_lock); - zvol_geom_run(zv); + if (error == 0) + zvol_geom_run(zv); +#ifdef __FreeBSD__ g_topology_unlock(); PICKUP_GIANT(); +#endif - ZFS_LOG(1, "ZVOL %s created.", name); + if (error == 0) + ZFS_LOG(1, "ZVOL %s created.", name); - return (0); + return (error); } /** @@ -710,12 +792,13 @@ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); + if (error == 0) + error = dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx); if (error) { dmu_tx_abort(tx); (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off); return (error); } - dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx); dmu_tx_commit(tx); off += bytes; resid -= bytes; @@ -870,28 +953,31 @@ return (error); } -/*ARGSUSED*/ +/** + * \invariant: spa_namespace_lock must be held. + */ static int -zvol_open(struct g_provider *pp, int flag, int count) +zvol_common_open(zvol_state_t *zv, int flag, int count) { - zvol_state_t *zv; - int err = 0; + int err; - if (MUTEX_HELD(&spa_namespace_lock)) { - /* - * If the spa_namespace_lock is being held, it means that ZFS - * is trying to open ZVOL as its VDEV. This is not supported. 
- */ - return (EOPNOTSUPP); - } + err = 0; - mutex_enter(&spa_namespace_lock); - - zv = pp->private; if (zv == NULL) { mutex_exit(&spa_namespace_lock); return (ENXIO); } + if (tsd_get(zfs_geom_probe_vdev) != NULL) { + /* + * if zfs_geom_probe_vdev is set, that means that zfs is + * attempting to probe geom providers while looking for a + * replacement for a missing VDEV. In this case, the + * spa_namespace_lock will not be held, but it is still illegal + * to use a zvol as a vdev. Deadlocks can result if another + * thread has spa_namespace_lock + */ + return (EOPNOTSUPP); + } if (zv->zv_total_opens == 0) err = zvol_first_open(zv); @@ -930,14 +1016,37 @@ /*ARGSUSED*/ static int -zvol_close(struct g_provider *pp, int flag, int count) +zvol_geom_open(struct g_provider *pp, int flag, int count) +{ + zvol_state_t *zv; + + if (MUTEX_HELD(&spa_namespace_lock)) { + printf("ZFS: Using ZVOL as a vdev is not supported\n"); + return (EOPNOTSUPP); + } + + mutex_enter(&spa_namespace_lock); + return (zvol_common_open(pp->private, flag, count)); +} + +static int +zvol_open(struct cdev *dev, int flags, int fmt, struct thread *td) { zvol_state_t *zv; - int error = 0; + + if (MUTEX_HELD(&spa_namespace_lock)) { + printf("ZFS: Using ZVOL as a vdev is not supported\n"); + return (EOPNOTSUPP); + } mutex_enter(&spa_namespace_lock); + return (zvol_common_open(dev->si_drv1, flags, /*count*/ 1)); +} + +static int +zvol_common_close(zvol_state_t *zv, int count) +{ - zv = pp->private; if (zv == NULL) { mutex_exit(&spa_namespace_lock); return (ENXIO); @@ -954,16 +1063,35 @@ */ ASSERT(zv->zv_total_opens != 0); + zv->zv_total_opens -= count; + /* - * You may get multiple opens, but only one close. + * We track closes in the standard and GEOM cases, so we should get + * a close (or close count) for every open. */ - zv->zv_total_opens -= count; - if (zv->zv_total_opens == 0) zvol_last_close(zv); mutex_exit(&spa_namespace_lock); - return (error); + + return (0); +} + +/*ARGSUSED*/ +static int +zvol_geom_close(struct g_provider *pp, int flag, int count) +{ + + mutex_enter(&spa_namespace_lock); + return (zvol_common_close(pp->private, count)); +} + +static int +zvol_close(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + + mutex_enter(&spa_namespace_lock); + return (zvol_common_close(dev->si_drv1, /*count*/ 1)); } static void @@ -1012,7 +1140,7 @@ */ if (buf != NULL) { /* immediate write */ error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); + /*flags*/0); } else { size = zv->zv_volblocksize; offset = P2ALIGN(offset, size); @@ -1089,7 +1217,7 @@ (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zv->zv_objset, - ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + ZVOL_OBJ, off, len, lr + 1, /*flags*/0) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; @@ -1198,90 +1326,181 @@ } #endif /* sun */ -int -zvol_strategy(struct bio *bp) +typedef struct zvol_dmu_state { + /** + * The DMU context associated with this DMU state. Note that this + * must be the first entry in order for the callback to be able to + * discover the zvol_dmu_state_t. 
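+	 * See zvol_dmu_done(), which simply casts the dmu_context_t pointer
+	 * it receives back to a zvol_dmu_state_t; zvol_strategy_state_t
+	 * below extends the same convention by keeping this structure first.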
+ */ + dmu_context_t dmu_ctx; + zvol_state_t *zv; + rl_t *rl; +} zvol_dmu_state_t; + +static void +zvol_dmu_buf_set_transfer_write(dmu_buf_set_t *buf_set) +{ + zvol_dmu_state_t *zds = (zvol_dmu_state_t *)buf_set->dmu_ctx; + zvol_state_t *zv = zds->zv; + boolean_t sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + dmu_tx_t *tx = DMU_BUF_SET_TX(buf_set); + + dmu_buf_set_transfer_write(buf_set); + + /* Log this write. */ + if ((zv->zv_flags & ZVOL_WCE) == 0 || sync) + zvol_log_write(zv, tx, buf_set->dn_start, buf_set->size, sync); + dmu_tx_commit(tx); +} + +static void +zvol_dmu_done(dmu_context_t *dmu_ctx) +{ + zvol_dmu_state_t *zds = (zvol_dmu_state_t *)dmu_ctx; + boolean_t sync_always = zds->zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + int err; + + if ((dmu_ctx->flags & DMU_CTX_FLAG_READ) == 0 && sync_always) + zil_commit(zds->zv->zv_zilog, ZVOL_OBJ); + if (dmu_ctx->completed_size < dmu_ctx->size) { + if (dmu_ctx->dn_offset > zds->zv->zv_volsize) + err = EINVAL; + } else + err = (dmu_ctx->err == 0) ? 0 : EIO; + dmu_ctx->err = err; + + zfs_range_unlock(zds->rl); +} + +static int +zvol_dmu_context_init(zvol_dmu_state_t *zds, void *data, uint64_t off, + uint64_t io_size, uint32_t dmu_flags, dmu_context_callback_t done_cb) +{ + zvol_state_t *zv = zds->zv; + boolean_t reader = (dmu_flags & DMU_CTX_FLAG_READ) != 0; + int error; + + /* Truncate I/Os to the end of the volume, if needed. */ + if (io_size > zv->zv_volsize - off) + io_size = zv->zv_volsize - off; + + if (reader) + dmu_flags |= DMU_CTX_FLAG_PREFETCH; + + error = dmu_context_init(&zds->dmu_ctx, /*dnode*/NULL, zv->zv_objset, + ZVOL_OBJ, off, io_size, data, FTAG, dmu_flags); + if (error) + return (error); + /* Override the writer case to log the writes. */ + if (!reader) + dmu_context_set_buf_set_transfer_cb(&zds->dmu_ctx, + zvol_dmu_buf_set_transfer_write); + dmu_context_set_context_cb(&zds->dmu_ctx, done_cb); + zds->rl = zfs_range_lock(&zds->zv->zv_znode, off, io_size, + reader ? RL_READER : RL_WRITER); + + return (error); +} + +static void +zvol_dmu_issue(zvol_dmu_state_t *zds) +{ + int error; + + error = dmu_issue(&zds->dmu_ctx); + if (error) + zds->dmu_ctx.err++; + dmu_context_rele(&zds->dmu_ctx); +} + +typedef void (*zvol_strategy_deliver_cb)(struct bio *bp, int err); + +static void +zvol_strategy_bio_deliver(struct bio *bp, int err) +{ + bp->bio_error = err; + bp->bio_done(bp); +} + +/** + * Use another layer on top of zvol_dmu_state_t to provide additional + * context specific to zvol_common_strategy(), namely, the bio and the done + * callback, which calls zvol_dmu_done, as is done for zvol_dmu_state_t. 
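+ * The deliver_cb indirection lets the cdev and GEOM entry points share
+ * zvol_common_strategy(): zvol_strategy() passes zvol_strategy_bio_deliver()
+ * while zvol_geom_strategy() passes g_io_deliver().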
+ */ +typedef struct zvol_strategy_state { + zvol_dmu_state_t zds; + struct bio *bp; + zvol_strategy_deliver_cb deliver_cb; +} zvol_strategy_state_t; + +static void +zvol_strategy_dmu_done(dmu_context_t *dmu_ctx) +{ + zvol_strategy_state_t *zss = (zvol_strategy_state_t *)dmu_ctx; + + zvol_dmu_done(dmu_ctx); + zss->bp->bio_completed = dmu_ctx->completed_size; + zss->deliver_cb(zss->bp, dmu_ctx->err); + kmem_free(zss, sizeof(zvol_strategy_state_t)); +} + +static void +zvol_common_strategy(struct bio *bp, zvol_state_t *zv, + zvol_strategy_deliver_cb deliver_cb) { - zvol_state_t *zv = bp->bio_to->private; - uint64_t off, volsize; - size_t resid; - char *addr; - objset_t *os; - rl_t *rl; + zvol_strategy_state_t *zss; int error = 0; - boolean_t doread = (bp->bio_cmd == BIO_READ); - boolean_t sync; + uint32_t dmu_flags = DMU_CTX_FLAG_ASYNC; if (zv == NULL) { - g_io_deliver(bp, ENXIO); - return (0); + deliver_cb(bp, ENXIO); + return; } if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) { - g_io_deliver(bp, EROFS); - return (0); + deliver_cb(bp, EROFS); + return; } - off = bp->bio_offset; - volsize = zv->zv_volsize; - - os = zv->zv_objset; - ASSERT(os != NULL); - - addr = bp->bio_data; - resid = bp->bio_length; - - if (resid > 0 && (off < 0 || off >= volsize)) { - g_io_deliver(bp, EIO); - return (0); + ASSERT(zv->zv_objset != NULL); + if (bp->bio_length > 0 && + (bp->bio_offset < 0 || bp->bio_offset >= zv->zv_volsize)) { + deliver_cb(bp, EIO); + return; } - sync = !doread && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + if (bp->bio_cmd == BIO_READ) + dmu_flags |= DMU_CTX_FLAG_READ; - /* - * There must be no buffer changes when doing a dmu_sync() because - * we can't change the data whilst calculating the checksum. - */ - rl = zfs_range_lock(&zv->zv_znode, off, resid, - doread ? RL_READER : RL_WRITER); + zss = kmem_zalloc(sizeof(zvol_strategy_state_t), KM_SLEEP); + zss->bp = bp; + zss->deliver_cb = deliver_cb; + zss->zds.zv = zv; - while (resid != 0 && off < volsize) { - size_t size = MIN(resid, zvol_maxphys); - if (doread) { - error = dmu_read(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH); - } else { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size, sync); - dmu_tx_commit(tx); - } - } - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = EIO; - break; - } - off += size; - addr += size; - resid -= size; + error = zvol_dmu_context_init(&zss->zds, bp->bio_data, bp->bio_offset, + bp->bio_length, dmu_flags, zvol_strategy_dmu_done); + if (error) { + kmem_free(zss, sizeof(zvol_strategy_state_t)); + deliver_cb(bp, error); + return; } - zfs_range_unlock(rl); - bp->bio_completed = bp->bio_length - resid; - if (bp->bio_completed < bp->bio_length) - bp->bio_error = (off > volsize ? EINVAL : error); + /* Errors are reported via the callback. 
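+	 * zvol_dmu_issue() releases this context's hold; once the I/O
+	 * completes entirely, zvol_strategy_dmu_done() fills in
+	 * bio_completed and delivers the bio with any error.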
*/ + zvol_dmu_issue(&zss->zds); +} - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - g_io_deliver(bp, 0); +static void +zvol_strategy(struct bio *bp) +{ + zvol_state_t *zv = bp->bio_dev->si_drv1; + zvol_common_strategy(bp, zv, zvol_strategy_bio_deliver); +} - return (0); +static void +zvol_geom_strategy(struct bio *bp) +{ + zvol_state_t *zv = bp->bio_to->private; + zvol_common_strategy(bp, zv, g_io_deliver); } #ifdef sun @@ -1332,110 +1551,111 @@ return (error); } +#endif /* sun */ -/*ARGSUSED*/ -int -zvol_read(dev_t dev, uio_t *uio, cred_t *cr) +static int +zvol_dmu_uio_common(zvol_dmu_state_t *zds, uio_t *uio, uint32_t dmu_flags) { - minor_t minor = getminor(dev); - zvol_state_t *zv; - uint64_t volsize; - rl_t *rl; - int error = 0; + int err; + boolean_t reader = (dmu_flags & DMU_CTX_FLAG_READ); - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) + if (zds->zv == NULL) return (ENXIO); - volsize = zv->zv_volsize; +#ifdef sun + if (zds.zv->zv_flags & ZVOL_DUMPIFIED) + return (physio(zvol_strategy, NULL, dev, + reader ? B_READ : B_WRITE, zvol_minphys, uio)); +#endif + + /* Don't allow I/Os that are not within the volume. */ if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset >= volsize)) + (uio->uio_loffset < 0 || uio->uio_loffset >= zds->zv->zv_volsize)) return (EIO); - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_READ, - zvol_minphys, uio); - return (error); - } + err = zvol_dmu_context_init(zds, uio, uio->uio_loffset, + uio->uio_resid, dmu_flags|DMU_CTX_FLAG_UIO, zvol_dmu_done); + if (err) + return (err); + zvol_dmu_issue(zds); + return (zds->dmu_ctx.err); +} - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_READER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); +#if defined(__FreeBSD__) && defined(_KERNEL) +int +zvol_freebsd_read(struct cdev *dev, struct uio *uio, int ioflag) +{ + zvol_dmu_state_t zds; - /* don't read past the end */ - if (bytes > volsize - uio->uio_loffset) - bytes = volsize - uio->uio_loffset; + zds.zv = (zvol_state_t *)dev->si_drv1; + return (zvol_dmu_uio_common(&zds, uio, DMU_CTX_FLAG_READ)); +} +int +zvol_freebsd_write(struct cdev *dev, struct uio *uio, int ioflag) +{ + zvol_dmu_state_t zds; - error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = EIO; - break; - } - } - zfs_range_unlock(rl); - return (error); + zds.zv = (zvol_state_t *)dev->si_drv1; + return (zvol_dmu_uio_common(&zds, uio, /*flags*/0)); } -/*ARGSUSED*/ int -zvol_write(dev_t dev, uio_t *uio, cred_t *cr) +zvol_freebsd_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, + struct thread *td) { - minor_t minor = getminor(dev); - zvol_state_t *zv; - uint64_t volsize; - rl_t *rl; + zvol_state_t *zv = dev->si_drv1; int error = 0; - boolean_t sync; - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) return (ENXIO); - volsize = zv->zv_volsize; - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset >= volsize)) - return (EIO); - - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_WRITE, - zvol_minphys, uio); - return (error); + switch (cmd) { + case DIOCGSECTORSIZE: + *(u_int *)data = DEV_BSIZE; + break; + case DIOCGMEDIASIZE: + *(off_t *)data = zv->zv_volsize; + if (*(off_t *)data == 0) + error = ENOENT; + break; + /* + * TODO: These probably need to be implemented, too. 
There may be + * more, see sys/geom/geom_dev.c:g_dev_ioctl(). + */ + case DIOCGFLUSH: + case DIOCGDELETE: + case DIOCGSTRIPESIZE: + case DIOCGSTRIPEOFFSET: + /* FALLTHROUGH */ + default: + error = ENOIOCTL; + break; } + return (error); +} +#endif /* __FreeBSD__ && _KERNEL */ - sync = !(zv->zv_flags & ZVOL_WCE) || - (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); +#ifdef sun +/*ARGSUSED*/ +int +zvol_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + minor_t minor = getminor(dev); + zvol_dmu_state_t zds; - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_WRITER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - uint64_t off = uio->uio_loffset; - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + zds.zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + return (zvol_dmu_uio_common(&zds, uio, DMU_CTX_FLAG_READ)); +} - if (bytes > volsize - off) /* don't write past the end */ - bytes = volsize - off; +/*ARGSUSED*/ +int +zvol_write(dev_t dev, uio_t *uio, cred_t *cr) +{ + minor_t minor = getminor(dev); + zvol_dmu_state_t zds; - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - break; - } - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); - if (error == 0) - zvol_log_write(zv, tx, off, bytes, sync); - dmu_tx_commit(tx); - - if (error) - break; - } - zfs_range_unlock(rl); - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - return (error); + zds.zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + return (zvol_dmu_uio_common(&zds, uio, /*flags*/0)); } int @@ -1960,30 +2180,6 @@ } #endif /* sun */ -static zvol_state_t * -zvol_geom_create(const char *name) -{ - struct g_provider *pp; - struct g_geom *gp; - zvol_state_t *zv; - - gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); - gp->start = zvol_geom_start; - gp->access = zvol_geom_access; - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); - pp->sectorsize = DEV_BSIZE; - - zv = kmem_zalloc(sizeof(*zv), KM_SLEEP); - zv->zv_provider = pp; - zv->zv_state = 0; - bioq_init(&zv->zv_queue); - mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF); - - pp->private = zv; - - return (zv); -} - static void zvol_geom_run(zvol_state_t *zv) { @@ -2015,6 +2211,8 @@ pp->private = NULL; g_wither_geom(pp->geom, ENXIO); + destroy_dev_sched(zv->zv_dev); + kmem_free(zv, sizeof(*zv)); } @@ -2061,9 +2259,9 @@ g_topology_unlock(); if (count > 0) - error = zvol_open(pp, flags, count); + error = zvol_geom_open(pp, flags, count); else - error = zvol_close(pp, flags, -count); + error = zvol_geom_close(pp, flags, -count); g_topology_lock(); return (error); } @@ -2128,7 +2326,7 @@ break; case BIO_READ: case BIO_WRITE: - zvol_strategy(bp); + zvol_geom_strategy(bp); break; } } @@ -2166,8 +2364,8 @@ } if ((error = zvol_create_minor(sname)) != 0) { - printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", - sname, error); + printf("ZFS WARNING: Unable to create ZVOL snapshot " + "%s (error=%d).\n", sname, error); break; } } @@ -2193,12 +2391,12 @@ return (error); } if (dmu_objset_type(os) == DMU_OST_ZVOL) { - if ((error = zvol_create_minor(name)) == 0) + error = zvol_create_minor(name); + if (error == 0) error = zvol_create_snapshots(os, name); - else { - printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", - name, error); - } + if (error) + printf("ZFS WARNING: Unable to create ZVOL %s " + "(error=%d).\n", name, error); dmu_objset_rele(os, FTAG); return (error); } @@ -2227,43 +2425,65 @@ while 
(dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL, &cookie) == 0) { dmu_objset_rele(os, FTAG); - (void)zvol_create_minors(osname); - if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { - printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n", - name, error); + error = zvol_create_minors(osname); + if (error) { + kmem_free(osname, MAXPATHLEN); + return (error); + } + error = dmu_objset_hold(name, FTAG, &os); + if (error) { + printf("ZFS WARNING: Unable to put hold on %s" + " (error=%d)\n", name, error); + kmem_free(osname, MAXPATHLEN); return (error); } } dmu_objset_rele(os, FTAG); kmem_free(osname, MAXPATHLEN); - return (0); + return (error); } static void zvol_rename_minor(struct g_geom *gp, const char *newname) { - struct g_provider *pp; + struct g_provider *new_pp, *old_pp; zvol_state_t *zv; + struct cdev *new_dev, *old_dev; + int error; ASSERT(MUTEX_HELD(&spa_namespace_lock)); g_topology_assert(); - pp = LIST_FIRST(&gp->provider); - ASSERT(pp != NULL); - zv = pp->private; + old_pp = LIST_FIRST(&gp->provider); + ASSERT(old_pp != NULL); + zv = old_pp->private; ASSERT(zv != NULL); - zv->zv_provider = NULL; - g_wither_provider(pp, ENXIO); + error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &new_dev, + &zfs_zvol_cdevsw, NULL, UID_ROOT, GID_OPERATOR, 0600, "%s/%s", + ZVOL_DRIVER, newname); + if (error) { + printf("ZFS: Could not rename ZVOL %s to %s\n", + zv->zv_name, newname); + return; + } - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); - pp->sectorsize = DEV_BSIZE; - pp->mediasize = zv->zv_volsize; - pp->private = zv; - zv->zv_provider = pp; + new_pp = g_new_providerf(gp, "g%s/%s", ZVOL_DRIVER, newname); + new_pp->sectorsize = DEV_BSIZE; + new_pp->mediasize = zv->zv_volsize; + new_pp->private = zv; strlcpy(zv->zv_name, newname, sizeof(zv->zv_name)); - g_error_provider(pp, 0); + g_error_provider(new_pp, 0); + + /* + * We're piggybacking on the GEOM code to rename standard block + * devices as well. + */ + destroy_dev_sched(zv->zv_dev); + zv->zv_dev = new_dev; + zv->zv_dev->si_drv1 = zv; + old_dev = zv->zv_dev; } void --- old/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h +++ new/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved. 
*/ #ifndef _SYS_TASKQ_H @@ -68,6 +70,8 @@ #ifdef _KERNEL +typedef void (*taskq_callback_fn)(void *); + extern taskq_t *system_taskq; extern void taskq_init(void); @@ -77,7 +81,7 @@ extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, - struct proc *, uint_t); + struct proc *, uint_t, taskq_callback_fn, taskq_callback_fn); extern taskq_t *taskq_create_sysdc(const char *, int, int, int, struct proc *, uint_t, uint_t); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); --- old/sys/kern/subr_taskqueue.c +++ new/sys/kern/subr_taskqueue.c @@ -63,6 +63,9 @@ int tq_spin; int tq_flags; int tq_callouts; + + taskqueue_callback_fn tq_callbacks[TASKQUEUE_CALLBACK_TYPE_MAX]; + void *tq_cb_contexts[TASKQUEUE_CALLBACK_TYPE_MAX]; }; #define TQ_FLAGS_ACTIVE (1 << 0) @@ -87,6 +90,13 @@ mtx_unlock(&(tq)->tq_mutex); \ } while (0) +#define TASKQUEUE_RUN_CALLBACK(tq, cb_type) \ + do { \ + if ((tq)->tq_callbacks[cb_type] != NULL) \ + (tq)->tq_callbacks[cb_type]( \ + (tq)->tq_cb_contexts[cb_type]); \ + } while (0) + void _timeout_task_init(struct taskqueue *queue, struct timeout_task *timeout_task, int priority, task_fn_t func, void *context) @@ -137,6 +147,19 @@ MTX_DEF, "taskqueue"); } +void +taskqueue_set_callback(struct taskqueue *queue, + enum taskqueue_callback_type cb_type, taskqueue_callback_fn callback, + void *context) +{ + + if (cb_type >= TASKQUEUE_CALLBACK_TYPE_MAX) + panic("Newer taskqueue consumer using old taskqueue API"); + + queue->tq_callbacks[cb_type] = callback; + queue->tq_cb_contexts[cb_type] = context; +} + /* * Signal a taskqueue thread to terminate. */ @@ -165,7 +188,7 @@ } static int -taskqueue_enqueue_locked(struct taskqueue *queue, struct task *task) +taskqueue_enqueue_locked(struct taskqueue *queue, struct task *queued_task) { struct task *ins; struct task *prev; @@ -173,9 +196,9 @@ /* * Count multiple enqueues. */ - if (task->ta_pending) { - if (task->ta_pending < USHRT_MAX) - task->ta_pending++; + if (queued_task->ta_pending) { + if (queued_task->ta_pending < USHRT_MAX) + queued_task->ta_pending++; return (0); } @@ -183,22 +206,24 @@ * Optimise the case when all tasks have the same priority. */ prev = STAILQ_LAST(&queue->tq_queue, task, ta_link); - if (!prev || prev->ta_priority >= task->ta_priority) { - STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link); + if (!prev || prev->ta_priority >= queued_task->ta_priority) { + STAILQ_INSERT_TAIL(&queue->tq_queue, queued_task, ta_link); } else { prev = NULL; for (ins = STAILQ_FIRST(&queue->tq_queue); ins; prev = ins, ins = STAILQ_NEXT(ins, ta_link)) - if (ins->ta_priority < task->ta_priority) + if (ins->ta_priority < queued_task->ta_priority) break; if (prev) - STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link); + STAILQ_INSERT_AFTER(&queue->tq_queue, prev, queued_task, + ta_link); else - STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link); + STAILQ_INSERT_HEAD(&queue->tq_queue, queued_task, + ta_link); } - task->ta_pending = 1; + queued_task->ta_pending = 1; if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0) queue->tq_enqueue(queue->tq_context); else @@ -492,7 +517,9 @@ tqp = arg; tq = *tqp; + TASKQUEUE_RUN_CALLBACK(tq, TASKQUEUE_CALLBACK_TYPE_INIT); TQ_LOCK(tq); + while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) { taskqueue_run_locked(tq); /* @@ -506,10 +533,20 @@ } taskqueue_run_locked(tq); + /* + * This thread is on its way out, so just drop the lock temporarily + * in order to call the shutdown callback. 
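+	 * The INIT callback, by contrast, runs before this thread first
+	 * takes the lock (see the top of this function).  A consumer
+	 * registers both hooks up front, for example (illustrative only,
+	 * with hypothetical callback names):
+	 *
+	 *	taskqueue_set_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT,
+	 *	    thread_ctx_create, arg);
+	 *	taskqueue_set_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN,
+	 *	    thread_ctx_destroy, arg);
+	 *
+	 * In this change taskq_create_proc() likewise gains init and
+	 * shutdown callback arguments (see sys/taskq.h).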
This allows the callback + * to look at the taskqueue, even just before it dies. + */ + TQ_UNLOCK(tq); + TASKQUEUE_RUN_CALLBACK(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); + TQ_LOCK(tq); + /* rendezvous with thread that asked us to terminate */ tq->tq_tcount--; wakeup_one(tq->tq_threads); TQ_UNLOCK(tq); + kthread_exit(); } --- old/sys/modules/zfs/Makefile +++ new/sys/modules/zfs/Makefile @@ -88,8 +88,10 @@ CFLAGS+=-mminimal-toc .endif -#CFLAGS+=-DDEBUG=1 -#DEBUG_FLAGS=-g +.ifdef WITH_ZFS_DEBUGGING +CFLAGS+=-DDEBUG=1 +DEBUG_FLAGS?=-g +.endif .include --- old/sys/sys/bio.h +++ new/sys/sys/bio.h @@ -41,19 +41,23 @@ #include /* bio_cmd */ -#define BIO_READ 0x01 -#define BIO_WRITE 0x02 -#define BIO_DELETE 0x04 -#define BIO_GETATTR 0x08 -#define BIO_FLUSH 0x10 +#define BIO_READ 0x01 /* Read I/O data */ +#define BIO_WRITE 0x02 /* Write I/O data */ +#define BIO_DELETE 0x04 /* TRIM or free blocks, i.e. mark as unused */ +#define BIO_GETATTR 0x08 /* Get GEOM attributes of object */ +#define BIO_FLUSH 0x10 /* Commit outstanding I/O now */ #define BIO_CMD0 0x20 /* Available for local hacks */ #define BIO_CMD1 0x40 /* Available for local hacks */ #define BIO_CMD2 0x80 /* Available for local hacks */ /* bio_flags */ -#define BIO_ERROR 0x01 -#define BIO_DONE 0x02 -#define BIO_ONQUEUE 0x04 +#define BIO_ERROR 0x01 /* An error occurred processing this bio. */ +#define BIO_DONE 0x02 /* This bio is finished. */ +#define BIO_ONQUEUE 0x04 /* This bio is in a queue & not yet taken. */ +/* + * This bio must be executed after all previous bios in the queue have been + * executed, and before any successive bios can be executed. + */ #define BIO_ORDERED 0x08 #ifdef _KERNEL --- old/sys/sys/taskqueue.h +++ new/sys/sys/taskqueue.h @@ -47,6 +47,14 @@ int f; }; +enum taskqueue_callback_type { + TASKQUEUE_CALLBACK_TYPE_INIT, + TASKQUEUE_CALLBACK_TYPE_SHUTDOWN, + TASKQUEUE_CALLBACK_TYPE_MAX, +}; + +typedef void (*taskqueue_callback_fn)(void *context); + /* * A notification callback function which is called from * taskqueue_enqueue(). The context argument is given in the call to @@ -76,6 +84,9 @@ void taskqueue_block(struct taskqueue *queue); void taskqueue_unblock(struct taskqueue *queue); int taskqueue_member(struct taskqueue *queue, struct thread *td); +void taskqueue_set_callback(struct taskqueue *queue, + enum taskqueue_callback_type cb_type, + taskqueue_callback_fn callback, void *context); #define TASK_INITIALIZER(priority, func, context) \ { .ta_pending = 0, \ --- old/tools/tools/nanobsd/spectra/ZLINE +++ new/tools/tools/nanobsd/spectra/ZLINE @@ -9,7 +9,8 @@ ident ZLINE makeoptions DEBUG="-g" # Build kernel with gdb(1) debug symbols -makeoptions MODULES_OVERRIDE="cyclic dtrace mps mpt nullfs opensolaris zfs" +makeoptions ".MAKEFLAGS"="-DZFS_DEBUG" +makeoptions WITH_CTF=1 options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption @@ -22,8 +23,10 @@ options UFS_DIRHASH # Improve performance on big directories options UFS_GJOURNAL # Enable gjournal-based UFS journaling options MD_ROOT # MD is a potential root device -options NFSCLIENT # Network Filesystem Client -options NFSSERVER # Network Filesystem Server +options NFSCL # New Network Filesystem Client +options NFSD # New Network Filesystem Server +#options NFSCLIENT # Network Filesystem Client +#options NFSSERVER # Network Filesystem Server options NFSLOCKD # Network Lock Manager options NFS_ROOT # NFS usable as /, requires NFSCLIENT options MSDOSFS # MSDOS Filesystem @@ -57,15 +60,10 @@ options DDB # Support DDB. 
options GDB # Support remote GDB. options DEADLKRES # Enable the deadlock resolver -#options INVARIANTS # Enable calls of extra sanity checking -#options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -#options WITNESS # Enable checks to detect deadlocks and cycles -#options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed - -# -# NOTE: The below is true UNLESS you pin your VCPUs to physical CPUS that -# are not shared with any other virtual machines. We pin the storage -# domain's CPUs, so adaptive mutexes work as expected. +options INVARIANTS # Enable calls of extra sanity checking +options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +options WITNESS # Enable checks to detect deadlocks and cycles +options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed # # Adaptive locks rely on a lock-free pointer read to determine the run state # of the thread holding a lock when under contention; under a virtualisation @@ -73,14 +71,13 @@ # (or rather its host VCPU) is actually executing. As such, disable this # optimisation. # -#options NO_ADAPTIVE_MUTEXES -#options NO_ADAPTIVE_RWLOCKS -#options NO_ADAPTIVE_SX options MCLSHIFT=12 # Use 4k mbuf clusters so that a # maximally fragmented TSO/LRO # operation can be transmitted # across Xen virtual network links. - +options NO_ADAPTIVE_MUTEXES +options NO_ADAPTIVE_RWLOCKS +options NO_ADAPTIVE_SX options BREAK_TO_DEBUGGER # Break signal on console enters kdb options ALT_BREAK_TO_DEBUGGER # "^M~" on console enters kdb