diff -Nur vanilla/sys/geom/geom_vfs.c patched/sys/geom/geom_vfs.c
--- vanilla/sys/geom/geom_vfs.c	2013-03-20 13:19:13.368990520 +0100
+++ patched/sys/geom/geom_vfs.c	2013-03-27 15:43:41.916990886 +0100
@@ -31,9 +31,14 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 
 #include
 #include
@@ -161,12 +166,312 @@
 	bufdone(bp);
 }
 
+#ifdef RACCT
+
+struct g_sched_bio {
+	struct bio *bip;
+	struct g_consumer *cp;
+	struct proc *p;
+	STAILQ_ENTRY(g_sched_bio) gsb_link;
+};
+
+struct g_sched_bio_list {
+	int dispatch;
+	RB_ENTRY(g_sched_bio_list) gsb_link;
+	STAILQ_HEAD(, g_sched_bio) bio_list;
+};
+
+struct g_sched {
+	int initialised;
+	int total;
+	int min_dispatch;
+	struct mtx lock;
+	RB_HEAD(g_sched_bio_tree, g_sched_bio_list) tree;
+} sched_bio;
+
+static struct callout g_dispatch_bio_callout;
+static uma_zone_t g_sched_bio_zone;
+static uma_zone_t g_sched_bio_list_zone;
+
+MTX_SYSINIT(sched_bio_lock, &sched_bio.lock, "geom sched bio lock", MTX_DEF);
+
+static void
+g_sched_bio_init(void)
+{
+	mtx_assert(&sched_bio.lock, MA_OWNED);
+
+	g_sched_bio_zone = uma_zcreate("g_sched_bio",
+	    sizeof(struct g_sched_bio), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+	    UMA_ZONE_NOFREE);
+	g_sched_bio_list_zone = uma_zcreate("g_sched_bio_list",
+	    sizeof(struct g_sched_bio_list), NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+	callout_init_mtx(&g_dispatch_bio_callout, &sched_bio.lock, 0);
+	RB_INIT(&sched_bio.tree);
+	sched_bio.total = 0;
+	sched_bio.min_dispatch = INT_MAX;
+	sched_bio.initialised = 1;
+}
+
+static int
+g_sched_bio_list_cmp(const struct g_sched_bio_list *a,
+    const struct g_sched_bio_list *b)
+{
+	if (a->dispatch > b->dispatch)
+		return (1);
+	else if (a->dispatch < b->dispatch)
+		return (-1);
+	return (0);
+}
+
+static
+struct g_sched_bio_list *
+g_sched_bio_list_alloc(int flags)
+{
+	struct g_sched_bio_list *sb;
+
+	sb = uma_zalloc(g_sched_bio_list_zone, flags);
+	if (sb == NULL)
+		return (NULL);
+
+	STAILQ_INIT(&sb->bio_list);
+	return (sb);
+}
+
+static
+struct g_sched_bio *
+g_sched_bio_alloc(int flags)
+{
+	struct g_sched_bio *gsb;
+
+	gsb = uma_zalloc(g_sched_bio_zone, flags);
+
+	return (gsb);
+}
+
+RB_PROTOTYPE(g_sched_bio_tree, g_sched_bio_list, gsb_link,
+    g_sched_bio_list_cmp);
+RB_GENERATE(g_sched_bio_tree, g_sched_bio_list, gsb_link,
+    g_sched_bio_list_cmp);
+
+static void
+g_racct_process_bio(struct proc *p, struct bio *bip, struct g_consumer *cp,
+    int res);
+
+static void
+g_racct_submit_bio(struct proc *p, struct bio *bip, struct g_consumer *cp,
+    int res)
+{
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	g_io_request(bip, cp);
+	racct_add_force(p, res, bip->bio_length);
+}
+
+static void
+g_process_bio_list(struct g_sched_bio_list *sb)
+{
+	struct g_sched_bio *gsb;
+	struct g_sched_bio *tmp;
+	int res;
+
+	STAILQ_FOREACH_SAFE(gsb, &sb->bio_list, gsb_link, tmp) {
+		switch (gsb->bip->bio_cmd) {
+		case BIO_READ:
+			res = RACCT_IOR;
+			break;
+		case BIO_WRITE:
+			res = RACCT_IOW;
+			break;
+		default:
+			res = 0;
+			break;
+		}
+		KASSERT((res != 0), ("Invalid command in bio list"));
+		STAILQ_REMOVE_HEAD(&sb->bio_list, gsb_link);
+		g_racct_process_bio(gsb->p, gsb->bip, gsb->cp, res);
+		sched_bio.total--;
+		uma_zfree(g_sched_bio_zone, gsb);
+	}
+}
+
+
+static void
+g_dispatch_bio(void *dummy)
+{
+	struct g_sched_bio_list *sb;
+	int rel;
+
+	mtx_assert(&sched_bio.lock, MA_OWNED);
+
+	sb = RB_MIN(g_sched_bio_tree, &sched_bio.tree);
+	while ((sb != NULL) && (sb->dispatch <= ticks)) {
+		RB_REMOVE(g_sched_bio_tree, &sched_bio.tree, sb);
+		mtx_unlock(&sched_bio.lock);
+
+		/*
+		 * Once the bio list has been removed from the RB-tree,
+		 * we do not need the tree lock to process the list.
+		 */
+		g_process_bio_list(sb);
+		uma_zfree(g_sched_bio_list_zone, sb);
+
+		mtx_lock(&sched_bio.lock);
+		sb = RB_MIN(g_sched_bio_tree, &sched_bio.tree);
+		if (sb == NULL)
+			sched_bio.min_dispatch = INT_MAX;
+		else
+			sched_bio.min_dispatch = sb->dispatch;
+	}
+
+	if (sb != NULL) {
+		rel = sb->dispatch - ticks;
+		callout_reset(&g_dispatch_bio_callout, rel, g_dispatch_bio,
+		    NULL);
+	}
+}
+
+/* Calculate the relative dispatch time from now. */
+static int
+g_sched_dispatch_relative(int64_t bextra, int64_t ballowed)
+{
+	int dispatch;
+
+	dispatch = 0;
+
+	/* Delay in proportion to the overrun, but by at most one second. */
+	if (ballowed == 0 || bextra >= ballowed)
+		dispatch += hz;
+	else
+		dispatch += (bextra * hz) / ballowed;
+
+	return (dispatch);
+}
+
+static int
+extra_bytes(int64_t free, int bio_length)
+{
+	if ((free != INT64_MAX) && (free < bio_length))
+		return (bio_length - free);
+	return (0);
+}
+
+static struct g_sched_bio_list *
+find_add_bio_list(int dispatch)
+{
+	struct g_sched_bio_list dummy;
+	struct g_sched_bio_list *sb;
+
+	mtx_assert(&sched_bio.lock, MA_OWNED);
+
+	dummy.dispatch = dispatch;
+	sb = RB_FIND(g_sched_bio_tree, &sched_bio.tree, &dummy);
+	if (sb == NULL) {
+		sb = g_sched_bio_list_alloc(M_NOWAIT);
+		if (sb == NULL) {
+			return (NULL);
+		}
+		sb->dispatch = dispatch;
+		RB_INSERT(g_sched_bio_tree, &sched_bio.tree, sb);
+		if (dispatch < sched_bio.min_dispatch)
+			sched_bio.min_dispatch = dispatch;
+	}
+
+	return (sb);
+}
+
+static int
+enqueue_bio(struct proc *p, struct bio *bip, struct g_consumer *cp,
+    int rel)
+{
+	int old_min, dispatch;
+	struct g_sched_bio_list *sb;
+	struct g_sched_bio *gsb;
+
+	dispatch = ticks + rel;
+
+	mtx_lock(&sched_bio.lock);
+	old_min = sched_bio.min_dispatch;
+	sb = find_add_bio_list(dispatch);
+	if (sb == NULL) {
+		mtx_unlock(&sched_bio.lock);
+		return (0);
+	}
+
+	gsb = g_sched_bio_alloc(M_NOWAIT);
+	if (gsb == NULL) {
+		mtx_unlock(&sched_bio.lock);
+		return (0);
+	}
+
+	gsb->p = p;
+	gsb->bip = bip;
+	gsb->cp = cp;
+
+	STAILQ_INSERT_TAIL(&sb->bio_list, gsb, gsb_link);
+	sched_bio.total++;
+
+	if (dispatch < old_min) {
+		callout_reset(&g_dispatch_bio_callout, rel,
+		    g_dispatch_bio, NULL);
+	}
+
+	mtx_unlock(&sched_bio.lock);
+	return (1);
+}
+
+/*
+ * Process a delayed or newly submitted bio read/write request for
+ * process p and charge its length to the corresponding RACCT resource.
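+ *
+ * If the process has a "deny" rctl rule for the resource and this bio
+ * would exceed the allowance accumulated so far in the current
+ * accounting slice, the request is queued on the callout-driven
+ * scheduler above and dispatched later; the delay grows with the
+ * overrun and is capped at one second (hz ticks).  Otherwise, or if
+ * queueing fails, the bio is submitted immediately and charged to the
+ * process.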
+ */
+static void
+g_racct_process_bio(struct proc *p, struct bio *bip, struct g_consumer *cp,
+    int res)
+{
+	int64_t free;
+	uint64_t allowed;
+	int rel, extra;
+
+	PROC_LOCK(p);
+	if (p->p_state != PRS_NORMAL) {
+		PROC_UNLOCK(p);
+		g_io_request(bip, cp);
+		return;
+	}
+
+	rctl_get_free_allowed(p, res, &free, &allowed);
+	if (free != INT64_MAX)
+		free = free * (ticks - racct_last_slice_ticks()) / hz;
+	extra = extra_bytes(free, bip->bio_length);
+
+	if (extra > 0) {
+		rel = g_sched_dispatch_relative(extra, allowed);
+		if (!enqueue_bio(p, bip, cp, rel))
+			g_racct_submit_bio(p, bip, cp, res);
+
+	} else
+		g_racct_submit_bio(p, bip, cp, res);
+	PROC_UNLOCK(p);
+}
+
+static void
+g_racct_init_bio(void)
+{
+	mtx_lock(&sched_bio.lock);
+	if (!sched_bio.initialised)
+		g_sched_bio_init();
+	mtx_unlock(&sched_bio.lock);
+}
+
+#endif
+
 void
 g_vfs_strategy(struct bufobj *bo, struct buf *bp)
 {
 	struct g_vfs_softc *sc;
 	struct g_consumer *cp;
 	struct bio *bip;
+#ifdef RACCT
+	int res;
+	struct proc *p;
+#endif
 
 	cp = bo->bo_private;
 	sc = cp->geom->softc;
@@ -196,7 +501,30 @@
 		bip->bio_flags |= BIO_ORDERED;
 		bp->b_flags &= ~B_BARRIER;
 	}
+#ifdef RACCT
+	if (!sched_bio.initialised)
+		g_racct_init_bio();
+	p = curthread->td_proc;
+
+	switch (bip->bio_cmd) {
+	case BIO_READ:
+		res = RACCT_IOR;
+		break;
+	case BIO_WRITE:
+		res = RACCT_IOW;
+		break;
+	default:
+		res = 0;
+		break;
+	}
+
+	if (res != 0)
+		g_racct_process_bio(p, bip, cp, res);
+	else
+		g_io_request(bip, cp);
+#else
 	g_io_request(bip, cp);
+#endif
 }
 
 static void
diff -Nur vanilla/sys/kern/kern_racct.c patched/sys/kern/kern_racct.c
--- vanilla/sys/kern/kern_racct.c	2013-03-20 13:20:19.205316992 +0100
+++ patched/sys/kern/kern_racct.c	2013-03-20 13:18:39.932824715 +0100
@@ -72,6 +72,8 @@
  */
 static int pcpu_threshold = 1;
 
+static int slice_tick;
+
 SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
 SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
     0, "Processes with higher %cpu usage than this value can be throttled.");
@@ -163,9 +165,13 @@
 	[RACCT_WALLCLOCK] =
 		RACCT_IN_MILLIONS,
 	[RACCT_PCTCPU] =
-		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
+		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
+	[RACCT_IOR] =
+		RACCT_RECLAIMABLE | RACCT_DENIABLE,
+	[RACCT_IOW] =
+		RACCT_RECLAIMABLE | RACCT_DENIABLE };
 
-static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
+static fixpt_t racct_decay_factor;
 
 #ifdef SCHED_4BSD
 /*
@@ -632,7 +638,7 @@
 		 * between the new amount and the proportional value of the
 		 * old amount that has decayed in the ucred racct containers.
 		 */
-		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
+		decayed_amount = old_amount * racct_decay_factor / FSCALE;
 		diff_cred = amount - decayed_amount;
 	} else
 		diff_cred = diff_proc;
@@ -703,7 +709,7 @@
 		 * between the new amount and the proportional value of the
 		 * old amount that has decayed in the ucred racct containers.
 		 */
-		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
+		decayed_amount = old_amount * racct_decay_factor / FSCALE;
 		diff_cred = amount - decayed_amount;
 	} else
 		diff_cred = diff_proc;
@@ -1033,6 +1039,7 @@
 
 	p->p_throttled = 1;
 	FOREACH_THREAD_IN_PROC(p, td) {
+		thread_lock(td);
 		switch (td->td_state) {
 		case TDS_RUNQ:
 			/*
@@ -1041,27 +1048,24 @@
 			 * TDF_NEEDRESCHED for the thread, so that once it is
 			 * running, it is taken off the cpu as soon as possible.
 			 */
-			thread_lock(td);
 			td->td_flags |= TDF_NEEDRESCHED;
-			thread_unlock(td);
 			break;
 		case TDS_RUNNING:
 			/*
 			 * If the thread is running, we request a context
 			 * switch for it by setting the TDF_NEEDRESCHED flag.
 			 */
-			thread_lock(td);
 			td->td_flags |= TDF_NEEDRESCHED;
 #ifdef SMP
 			cpuid = td->td_oncpu;
 			if ((cpuid != NOCPU) && (td != curthread))
 				ipi_cpu(cpuid, IPI_AST);
 #endif
-			thread_unlock(td);
 			break;
 		default:
 			break;
 		}
+		thread_unlock(td);
 	}
 }
 
@@ -1090,7 +1094,7 @@
 		return;
 
 	mtx_lock(&racct_lock);
-	r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
+	r_new = r_old * racct_decay_factor / FSCALE;
 	racct->r_resources[resource] = r_new;
 	mtx_unlock(&racct_lock);
 }
@@ -1098,11 +1102,21 @@
 static void
 racct_decay(int resource)
 {
+	fixpt_t ldavg;
+
+	ldavg = averunnable.ldavg[0];
+	racct_decay_factor = (2 * ldavg * FSCALE) / (2 * ldavg + FSCALE);
 	ui_racct_foreach(racct_decay_resource, &resource, NULL);
 	loginclass_racct_foreach(racct_decay_resource, &resource, NULL);
 	prison_racct_foreach(racct_decay_resource, &resource, NULL);
 }
 
+int
+racct_last_slice_ticks(void)
+{
+	return (slice_tick);
+}
+
 static void
 racctd(void)
 {
@@ -1111,6 +1125,7 @@
 	struct timeval wallclock;
 	uint64_t runtime;
 	uint64_t pct, pct_estimate;
+	int ticks_a, ticks_b;
 
 	for (;;) {
 		racct_decay(RACCT_PCTCPU);
@@ -1123,6 +1138,7 @@
 			PROC_UNLOCK(p);
 		}
 
+		ticks_a = ticks;
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_state != PRS_NORMAL) {
@@ -1158,9 +1174,15 @@
 			racct_set_locked(p, RACCT_WALLCLOCK,
 			    (uint64_t)wallclock.tv_sec * 1000000 +
 			    wallclock.tv_usec);
+
+			racct_set_locked(p, RACCT_IOR, 0);
+			racct_set_locked(p, RACCT_IOW, 0);
+
 			mtx_unlock(&racct_lock);
 			PROC_UNLOCK(p);
 		}
+		ticks_b = ticks;
+		slice_tick = ((uint64_t)ticks_a + ticks_b) / 2;
 
 		/*
 		 * To ensure that processes are throttled in a fair way, we need
diff -Nur vanilla/sys/kern/kern_rctl.c patched/sys/kern/kern_rctl.c
--- vanilla/sys/kern/kern_rctl.c	2013-03-20 13:20:19.633319099 +0100
+++ patched/sys/kern/kern_rctl.c	2013-03-20 13:18:39.988824997 +0100
@@ -123,6 +123,8 @@
 	{ "shmsize", RACCT_SHMSIZE },
 	{ "wallclock", RACCT_WALLCLOCK },
 	{ "pcpu", RACCT_PCTCPU },
+	{ "ior", RACCT_IOR },
+	{ "iow", RACCT_IOW },
 	{ NULL, -1 }};
 
 static struct dict actionnames[] = {
@@ -478,6 +480,42 @@
 	return (amount);
 }
 
+void
+rctl_get_free_allowed(struct proc *p, int resource, int64_t *free,
+    uint64_t *allowed)
+{
+	struct rctl_rule *rule;
+	struct rctl_rule *min_rule = NULL;
+	struct rctl_rule_link *link;
+	int64_t available, minavailable;
+
+	minavailable = INT64_MAX;
+
+	rw_rlock(&rctl_lock);
+
+	/*
+	 * There may be more than one matching "deny" rule; pick the most
+	 * restrictive one, i.e. the one with the least remaining allowance.
+	 */
+	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+		rule = link->rrl_rule;
+		if (rule->rr_resource != resource)
+			continue;
+		if (rule->rr_action != RCTL_ACTION_DENY)
+			continue;
+		available = rctl_available_resource(p, rule);
+		if (available < minavailable) {
+			minavailable = available;
+			min_rule = rule;
+		}
+	}
+
+	rw_runlock(&rctl_lock);
+
+	*free = minavailable;
+	*allowed = (min_rule != NULL) ? min_rule->rr_amount : 0;
+}
+
 uint64_t
 rctl_get_available(struct proc *p, int resource)
 {
@@ -1026,6 +1064,15 @@
 	    RACCT_IS_SLOPPY(rule->rr_resource))
 		return (EOPNOTSUPP);
 
+	/* Only the "deny" action is allowed for these resources. */
+	switch (rule->rr_resource) {
+	case RACCT_PCTCPU:
+	case RACCT_IOR:
+	case RACCT_IOW:
+		if (rule->rr_action != RCTL_ACTION_DENY)
+			return (EOPNOTSUPP);
+	}
+
 	/*
 	 * Make sure there are no duplicated rules.  Also, for the "deny"
 	 * rules, remove ones differing only by "amount".
diff -Nur vanilla/sys/sys/racct.h patched/sys/sys/racct.h
--- vanilla/sys/sys/racct.h	2013-03-20 13:20:06.877255844 +0100
+++ patched/sys/sys/racct.h	2013-03-20 13:18:34.512797831 +0100
@@ -69,7 +69,9 @@
 #define	RACCT_SHMSIZE		18
 #define	RACCT_WALLCLOCK		19
 #define	RACCT_PCTCPU		20
-#define	RACCT_MAX		RACCT_PCTCPU
+#define	RACCT_IOR		21
+#define	RACCT_IOW		22
+#define	RACCT_MAX		RACCT_IOW
 
 /*
  * Resource properties.
@@ -161,5 +163,6 @@
 void	racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
 	    struct ucred *newcred);
 void	racct_move(struct racct *dest, struct racct *src);
+int	racct_last_slice_ticks(void);
 
 #endif /* !_RACCT_H_ */
diff -Nur vanilla/sys/sys/rctl.h patched/sys/sys/rctl.h
--- vanilla/sys/sys/rctl.h	2013-03-20 13:20:06.825255584 +0100
+++ patched/sys/sys/rctl.h	2013-03-20 13:18:34.388797221 +0100
@@ -141,6 +141,8 @@
 int	rctl_rule_remove(struct rctl_rule *filter);
 int	rctl_enforce(struct proc *p, int resource, uint64_t amount);
 int64_t	rctl_pcpu_available(const struct proc *p);
+void	rctl_get_free_allowed(struct proc *p, int resource, int64_t *free,
+	    uint64_t *allowed);
 uint64_t rctl_get_limit(struct proc *p, int resource);
 uint64_t rctl_get_available(struct proc *p, int resource);
 const char *rctl_resource_name(int resource);
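For completeness, here is a minimal userland sketch of how the new "ior" and
"iow" resources could be exercised once a kernel built with options RACCT and
RCTL plus this patch is running.  It uses the existing rctl_add_rule(2)
interface; the PID (1234) and the amount (10485760 bytes per accounting
period) are purely illustrative, and with this patch a "deny" rule on ior/iow
throttles the process's I/O through the GEOM scheduler above rather than
failing the requests.

#include <sys/types.h>
#include <sys/rctl.h>

#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	/*
	 * Throttle writes of process 1234 once it has written more than
	 * 10 MB within one accounting period.  The PID and the amount
	 * are illustrative only.
	 */
	const char *rule = "process:1234:iow:deny=10485760";

	if (rctl_add_rule(rule, strlen(rule) + 1, NULL, 0) != 0)
		err(1, "rctl_add_rule");

	printf("installed: %s\n", rule);
	return (0);
}

The rule can be removed again with rctl -r process:1234:iow, and the
accumulated ior/iow usage of the process can be inspected with
rctl -u process:1234.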