diff --git a/cddl/contrib/opensolaris/cmd/dtrace/dtrace.c b/cddl/contrib/opensolaris/cmd/dtrace/dtrace.c index cc7959f..a745ceb 100644 --- a/cddl/contrib/opensolaris/cmd/dtrace/dtrace.c +++ b/cddl/contrib/opensolaris/cmd/dtrace/dtrace.c @@ -23,8 +23,9 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ #include #include @@ -1409,6 +1410,7 @@ main(int argc, char *argv[]) (void) dtrace_setopt(g_dtp, "bufsize", "4m"); (void) dtrace_setopt(g_dtp, "aggsize", "4m"); #endif + (void) dtrace_setopt(g_dtp, "temporal", "yes"); /* * If -G is specified, enable -xlink=dynamic and -xunodefs to permit diff --git a/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/buffering/tst.fill1.d b/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/buffering/tst.fill1.d index 143ed64..fffc7e3 100644 --- a/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/buffering/tst.fill1.d +++ b/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/buffering/tst.fill1.d @@ -23,26 +23,29 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ /* * ASSERTION: * Positive test for fill buffer policy. * * SECTION: Buffers and Buffering/fill Policy; - * Buffers and Buffering/Buffer Sizes; + * Buffers and Buffering/Buffer Sizes; * Options and Tunables/bufsize; * Options and Tunables/bufpolicy; * Options and Tunables/statusrate */ /* - * This is a brute-force way of testing fill buffers. We assume that each - * printf() stores 8 bytes. Because each fill buffer is per-CPU, we must - * fill up our buffer in one series of enablings on a single CPU. + * This is a brute-force way of testing fill buffers. We assume that + * each printf() stores 16 bytes (4x 32-bit words for EPID, timestamp + * lo, timestamp hi, and the variable i). Because each fill buffer is + * per-CPU, we must fill up our buffer in one series of enablings on a + * single CPU. */ #pragma D option bufpolicy=fill -#pragma D option bufsize=64 +#pragma D option bufsize=128 #pragma D option statusrate=10ms #pragma D option quiet diff --git a/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal.ksh b/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal.ksh new file mode 100644 index 0000000..f09c71f --- /dev/null +++ b/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal.ksh @@ -0,0 +1,106 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2012 by Delphix. All rights reserved. +# + +############################################################################ +# ASSERTION: +# temporal option causes output to be sorted +# +# SECTION: Pragma +# +# NOTES: The temporal option has no effect on a single-CPU system, so +# this needs to be run on a multi-CPU system to effectively test the +# temporal option. +# +############################################################################ + +if [ $# != 1 ]; then + echo expected one argument: '<'dtrace-path'>' + exit 2 +fi + +dtrace=$1 +file=/tmp/out.$$ + +rm -f $file + +$dtrace -o $file -c 'sleep 3' -s /dev/stdin < $file.2 + +sort -n $file.2 | diff $file.2 - +status=$? +if [ "$status" -ne 0 ]; then + echo $tst: output is not sorted + exit $status +fi + +head -n 1 $file.2 | grep begin >/dev/null +status=$? +if [ "$status" -ne 0 ]; then + echo $tst: begin probe did not fire + exit $status +fi + +tail -n 2 $file.2 | grep end >/dev/null +status=$? +if [ "$status" -ne 0 ]; then + echo $tst: end probe did not fire + exit $status +fi + +if [ $(tail -n 1 $file.2 | cut -f3 -d ' ') -ne \ + $(wc -l $file.2) ]; then + echo $tst: incorrect number of lines output + exit 1 +fi + +exit $status diff --git a/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal2.ksh b/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal2.ksh new file mode 100644 index 0000000..05b078a --- /dev/null +++ b/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal2.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2012 by Delphix. All rights reserved. +# + +############################################################################ +# ASSERTION: +# temporal option causes output to be sorted, even when some +# buffers are empty +# +# SECTION: Pragma +# +# NOTES: The temporal option has no effect on a single-CPU system, so +# this needs to be run on a multi-CPU system to effectively test the +# temporal option. +# +############################################################################ + +if [ $# != 1 ]; then + echo expected one argument: '<'dtrace-path'>' + exit 2 +fi + +dtrace=$1 +file=/tmp/out.$$ + +rm -f $file + +$dtrace -o $file -s /dev/stdin < $file.2 + +sort -n $file.2 | diff $file.2 - +status=$? +if [ "$status" -ne 0 ]; then + echo $tst: output is not sorted + exit $status +fi + +exit $status diff --git a/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal3.d b/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal3.d new file mode 100644 index 0000000..b4c0e55 --- /dev/null +++ b/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal3.d @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* + * This test excercises the "remnant" handling of the temporal option. + * At the end of one pass of retrieving and printing data from all CPUs, + * some unprocessed data will remain, because its timestamp is after the + * time covered by all CPUs' buffers. This unprocessed data is + * rearranged in a more space-efficient manner. If this is done + * incorrectly, an alignment error may occur. To test this, we use a + * high-frequency probe so that data will be recorded in subsequent + * CPU's buffers after the first CPU's buffer is obtained. The + * combination of data traced here (a 8-byte value and a 4-byte value) + * is effective to cause alignment problems with an incorrect + * implementation. + * + * This test needs to be run on a multi-CPU system to be effective. + */ + +#pragma D option quiet +#pragma D option temporal + +profile-4997 +{ + printf("%u %u", 1ULL, 2); +} + +tick-1 +/i++ == 10/ +{ + exit(0); +} diff --git a/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/speculation/err.BufSizeVariations1.d b/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/speculation/err.BufSizeVariations1.d index e97506e..c59ea3b 100644 --- a/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/speculation/err.BufSizeVariations1.d +++ b/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/speculation/err.BufSizeVariations1.d @@ -24,7 +24,10 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + /* * ASSERTION: @@ -35,17 +38,10 @@ * * NOTES: This test behaves differently depending on the values * assigned to bufsize. - * 1. 0 > bufsize. - * 2. 0 == bufsize. - * 3. 0 < bufsize <= 7 - * 4. 8 <= bufsize <= 31 - * 5. 32 <= bufsize <= 47 - * 6. 48 <= bufsize <= 71 - * 7. 72 <= bufsize */ #pragma D option quiet -#pragma D option bufsize=41 +#pragma D option bufsize=49 BEGIN { diff --git a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_consume.c b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_consume.c index 797e7e3..c88a92c 100644 --- a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_consume.c +++ b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_consume.c @@ -25,7 +25,7 @@ /* * Copyright (c) 2011, Joyent, Inc. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -39,6 +39,7 @@ #include #endif #include +#include #if !defined(sun) #include #endif @@ -443,17 +444,8 @@ dt_flowindent(dtrace_hdl_t *dtp, dtrace_probedata_t *data, dtrace_epid_t last, offs += epd->dtepd_size; do { - if (offs >= buf->dtbd_size) { - /* - * We're at the end -- maybe. If the oldest - * record is non-zero, we need to wrap. - */ - if (buf->dtbd_oldest != 0) { - offs = 0; - } else { - goto out; - } - } + if (offs >= buf->dtbd_size) + goto out; next = *(uint32_t *)((uintptr_t)buf->dtbd_data + offs); @@ -2014,26 +2006,27 @@ dt_setopt(dtrace_hdl_t *dtp, const dtrace_probedata_t *data, } static int -dt_consume_cpu(dtrace_hdl_t *dtp, FILE *fp, int cpu, dtrace_bufdesc_t *buf, +dt_consume_cpu(dtrace_hdl_t *dtp, FILE *fp, int cpu, + dtrace_bufdesc_t *buf, boolean_t just_one, dtrace_consume_probe_f *efunc, dtrace_consume_rec_f *rfunc, void *arg) { dtrace_epid_t id; - size_t offs, start = buf->dtbd_oldest, end = buf->dtbd_size; + size_t offs; int flow = (dtp->dt_options[DTRACEOPT_FLOWINDENT] != DTRACEOPT_UNSET); int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET); int rval, i, n; - dtrace_epid_t last = DTRACE_EPIDNONE; uint64_t tracememsize = 0; dtrace_probedata_t data; uint64_t drops; - caddr_t addr; + data.dtpda_flow = dtp->dt_flow; + data.dtpda_indent = dtp->dt_indent; + data.dtpda_prefix = dtp->dt_prefix; bzero(&data, sizeof (data)); data.dtpda_handle = dtp; data.dtpda_cpu = cpu; -again: - for (offs = start; offs < end; ) { + for (offs = buf->dtbd_oldest; offs < buf->dtbd_size; ) { dtrace_eprobedesc_t *epd; /* @@ -2068,7 +2061,8 @@ again: } if (flow) - (void) dt_flowindent(dtp, &data, last, buf, offs); + (void) dt_flowindent(dtp, &data, dtp->dt_last_epid, + buf, offs); rval = (*efunc)(&data, arg); @@ -2087,6 +2081,7 @@ again: return (dt_set_errno(dtp, EDT_BADRVAL)); for (i = 0; i < epd->dtepd_nrecs; i++) { + caddr_t addr; dtrace_recdesc_t *rec = &epd->dtepd_rec[i]; dtrace_actkind_t act = rec->dtrd_action; @@ -2458,14 +2453,16 @@ nextrec: rval = (*rfunc)(&data, NULL, arg); nextepid: offs += epd->dtepd_size; - last = id; + dtp->dt_last_epid = id; + if (just_one) { + buf->dtbd_oldest = offs; + break; + } } - if (buf->dtbd_oldest != 0 && start == buf->dtbd_oldest) { - end = buf->dtbd_oldest; - start = 0; - goto again; - } + dtp->dt_flow = data.dtpda_flow; + dtp->dt_indent = data.dtpda_indent; + dtp->dt_prefix = data.dtpda_prefix; if ((drops = buf->dtbd_drops) == 0) return (0); @@ -2478,6 +2475,130 @@ nextepid: return (dt_handle_cpudrop(dtp, cpu, DTRACEDROP_PRINCIPAL, drops)); } +/* + * Reduce memory usage by shrinking the buffer if it's no more than half full. + * Note, we need to preserve the alignment of the data at dtbd_oldest, which is + * only 4-byte aligned. + */ +static void +dt_realloc_buf(dtrace_hdl_t *dtp, dtrace_bufdesc_t *buf, int cursize) +{ + uint64_t used = buf->dtbd_size - buf->dtbd_oldest; + if (used < cursize / 2) { + int misalign = buf->dtbd_oldest & (sizeof (uint64_t) - 1); + char *newdata = dt_alloc(dtp, used + misalign); + if (newdata == NULL) + return; + bzero(newdata, misalign); + bcopy(buf->dtbd_data + buf->dtbd_oldest, + newdata + misalign, used); + dt_free(dtp, buf->dtbd_data); + buf->dtbd_oldest = misalign; + buf->dtbd_size = used + misalign; + buf->dtbd_data = newdata; + } +} + +/* + * If the ring buffer has wrapped, the data is not in order. Rearrange it + * so that it is. Note, we need to preserve the alignment of the data at + * dtbd_oldest, which is only 4-byte aligned. + */ +static int +dt_unring_buf(dtrace_hdl_t *dtp, dtrace_bufdesc_t *buf) +{ + int misalign; + char *newdata, *ndp; + + if (buf->dtbd_oldest == 0) + return (0); + + misalign = buf->dtbd_oldest & (sizeof (uint64_t) - 1); + newdata = ndp = dt_alloc(dtp, buf->dtbd_size + misalign); + + if (newdata == NULL) + return (-1); + + assert(0 == (buf->dtbd_size & (sizeof (uint64_t) - 1))); + + bzero(ndp, misalign); + ndp += misalign; + + bcopy(buf->dtbd_data + buf->dtbd_oldest, ndp, + buf->dtbd_size - buf->dtbd_oldest); + ndp += buf->dtbd_size - buf->dtbd_oldest; + + bcopy(buf->dtbd_data, ndp, buf->dtbd_oldest); + + dt_free(dtp, buf->dtbd_data); + buf->dtbd_oldest = 0; + buf->dtbd_data = newdata; + buf->dtbd_size += misalign; + + return (0); +} + +static void +dt_put_buf(dtrace_hdl_t *dtp, dtrace_bufdesc_t *buf) +{ + dt_free(dtp, buf->dtbd_data); + dt_free(dtp, buf); +} + +/* + * Returns 0 on success, in which case *cbp will be filled in if we retrieved + * data, or NULL if there is no data for this CPU. + * Returns -1 on failure and sets dt_errno. + */ +static int +dt_get_buf(dtrace_hdl_t *dtp, int cpu, dtrace_bufdesc_t **bufp) +{ + dtrace_optval_t size; + dtrace_bufdesc_t *buf = dt_zalloc(dtp, sizeof (*buf)); + int error; + + if (buf == NULL) + return (-1); + + (void) dtrace_getopt(dtp, "bufsize", &size); + buf->dtbd_data = dt_alloc(dtp, size); + if (buf->dtbd_data == NULL) { + dt_free(dtp, buf); + return (-1); + } + buf->dtbd_size = size; + buf->dtbd_cpu = cpu; + +#if defined(sun) + if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) { +#else + if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &buf) == -1) { +#endif + dt_put_buf(dtp, buf); + /* + * If we failed with ENOENT, it may be because the + * CPU was unconfigured -- this is okay. Any other + * error, however, is unexpected. + */ + if (errno == ENOENT) { + *bufp = NULL; + return (0); + } + + return (dt_set_errno(dtp, errno)); + } + + error = dt_unring_buf(dtp, buf); + if (error != 0) { + dt_put_buf(dtp, buf); + return (error); + } + dt_realloc_buf(dtp, buf, size); + + *bufp = buf; + return (0); +} + typedef struct dt_begin { dtrace_consume_probe_f *dtbgn_probefunc; dtrace_consume_rec_f *dtbgn_recfunc; @@ -2541,7 +2662,7 @@ dt_consume_begin_error(const dtrace_errdata_t *data, void *arg) } static int -dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, dtrace_bufdesc_t *buf, +dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pf, dtrace_consume_rec_f *rf, void *arg) { /* @@ -2565,33 +2686,19 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, dtrace_bufdesc_t *buf, * first pass, and that we only process ERROR enablings _not_ induced * by BEGIN enablings in the second pass. */ + dt_begin_t begin; processorid_t cpu = dtp->dt_beganon; - dtrace_bufdesc_t nbuf; -#if !defined(sun) - dtrace_bufdesc_t *pbuf; -#endif int rval, i; static int max_ncpus; - dtrace_optval_t size; + dtrace_bufdesc_t *buf; dtp->dt_beganon = -1; -#if defined(sun) - if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) { -#else - if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &buf) == -1) { -#endif - /* - * We really don't expect this to fail, but it is at least - * technically possible for this to fail with ENOENT. In this - * case, we just drive on... - */ - if (errno == ENOENT) - return (0); - - return (dt_set_errno(dtp, errno)); - } + if (dt_get_buf(dtp, cpu, &buf) != 0) + return (-1); + if (buf == NULL) + return (0); if (!dtp->dt_stopped || buf->dtbd_cpu != dtp->dt_endedon) { /* @@ -2599,7 +2706,10 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, dtrace_bufdesc_t *buf, * we are, we actually processed any END probes on another * CPU. We can simply consume this buffer and return. */ - return (dt_consume_cpu(dtp, fp, cpu, buf, pf, rf, arg)); + rval = dt_consume_cpu(dtp, fp, cpu, buf, B_FALSE, + pf, rf, arg); + dt_put_buf(dtp, buf); + return (rval); } begin.dtbgn_probefunc = pf; @@ -2616,61 +2726,41 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, dtrace_bufdesc_t *buf, dtp->dt_errhdlr = dt_consume_begin_error; dtp->dt_errarg = &begin; - rval = dt_consume_cpu(dtp, fp, cpu, buf, dt_consume_begin_probe, - dt_consume_begin_record, &begin); + rval = dt_consume_cpu(dtp, fp, cpu, buf, B_FALSE, + dt_consume_begin_probe, dt_consume_begin_record, &begin); dtp->dt_errhdlr = begin.dtbgn_errhdlr; dtp->dt_errarg = begin.dtbgn_errarg; - if (rval != 0) + if (rval != 0) { + dt_put_buf(dtp, buf); return (rval); - - /* - * Now allocate a new buffer. We'll use this to deal with every other - * CPU. - */ - bzero(&nbuf, sizeof (dtrace_bufdesc_t)); - (void) dtrace_getopt(dtp, "bufsize", &size); - if ((nbuf.dtbd_data = malloc(size)) == NULL) - return (dt_set_errno(dtp, EDT_NOMEM)); + } if (max_ncpus == 0) max_ncpus = dt_sysconf(dtp, _SC_CPUID_MAX) + 1; for (i = 0; i < max_ncpus; i++) { - nbuf.dtbd_cpu = i; - + dtrace_bufdesc_t *nbuf; if (i == cpu) continue; -#if defined(sun) - if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &nbuf) == -1) { -#else - pbuf = &nbuf; - if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &pbuf) == -1) { -#endif - /* - * If we failed with ENOENT, it may be because the - * CPU was unconfigured -- this is okay. Any other - * error, however, is unexpected. - */ - if (errno == ENOENT) - continue; - - free(nbuf.dtbd_data); - - return (dt_set_errno(dtp, errno)); + if (dt_get_buf(dtp, i, &nbuf) != 0) { + dt_put_buf(dtp, buf); + return (-1); } + if (nbuf == NULL) + continue; - if ((rval = dt_consume_cpu(dtp, fp, - i, &nbuf, pf, rf, arg)) != 0) { - free(nbuf.dtbd_data); + rval = dt_consume_cpu(dtp, fp, i, nbuf, B_FALSE, + pf, rf, arg); + dt_put_buf(dtp, nbuf); + if (rval != 0) { + dt_put_buf(dtp, buf); return (rval); } } - free(nbuf.dtbd_data); - /* * Okay -- we're done with the other buffers. Now we want to * reconsume the first buffer -- but this time we're looking for @@ -2685,8 +2775,8 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, dtrace_bufdesc_t *buf, dtp->dt_errhdlr = dt_consume_begin_error; dtp->dt_errarg = &begin; - rval = dt_consume_cpu(dtp, fp, cpu, buf, dt_consume_begin_probe, - dt_consume_begin_record, &begin); + rval = dt_consume_cpu(dtp, fp, cpu, buf, B_FALSE, + dt_consume_begin_probe, dt_consume_begin_record, &begin); dtp->dt_errhdlr = begin.dtbgn_errhdlr; dtp->dt_errarg = begin.dtbgn_errarg; @@ -2694,11 +2784,32 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, dtrace_bufdesc_t *buf, return (rval); } +/* ARGSUSED */ +static uint64_t +dt_buf_oldest(void *elem, void *arg) +{ + dtrace_bufdesc_t *buf = elem; + size_t offs = buf->dtbd_oldest; + + while (offs < buf->dtbd_size) { + dtrace_rechdr_t *dtrh = + /* LINTED - alignment */ + (dtrace_rechdr_t *)(buf->dtbd_data + offs); + if (dtrh->dtrh_epid == DTRACE_EPIDNONE) { + offs += sizeof (dtrace_epid_t); + } else { + return (DTRACE_RECORD_LOAD_TIMESTAMP(dtrh)); + } + } + + /* There are no records left; use the time the buffer was retrieved. */ + return (buf->dtbd_timestamp); +} + int dtrace_consume(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pf, dtrace_consume_rec_f *rf, void *arg) { - dtrace_bufdesc_t *buf = &dtp->dt_buf; dtrace_optval_t size; static int max_ncpus; int i, rval; @@ -2726,79 +2837,158 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE *fp, if (rf == NULL) rf = (dtrace_consume_rec_f *)dt_nullrec; - if (buf->dtbd_data == NULL) { - (void) dtrace_getopt(dtp, "bufsize", &size); - if ((buf->dtbd_data = malloc(size)) == NULL) - return (dt_set_errno(dtp, EDT_NOMEM)); - - buf->dtbd_size = size; - } - - /* - * If we have just begun, we want to first process the CPU that - * executed the BEGIN probe (if any). - */ - if (dtp->dt_active && dtp->dt_beganon != -1) { - buf->dtbd_cpu = dtp->dt_beganon; - if ((rval = dt_consume_begin(dtp, fp, buf, pf, rf, arg)) != 0) - return (rval); - } - - for (i = 0; i < max_ncpus; i++) { - buf->dtbd_cpu = i; - + if (dtp->dt_options[DTRACEOPT_TEMPORAL] == DTRACEOPT_UNSET) { /* - * If we have stopped, we want to process the CPU on which the - * END probe was processed only _after_ we have processed - * everything else. + * The output will not be in the order it was traced. Rather, + * we will consume all of the data from each CPU's buffer in + * turn. We apply special handling for the records from BEGIN + * and END probes so that they are consumed first and last, + * respectively. + * + * If we have just begun, we want to first process the CPU that + * executed the BEGIN probe (if any). */ - if (dtp->dt_stopped && (i == dtp->dt_endedon)) - continue; + if (dtp->dt_active && dtp->dt_beganon != -1 && + (rval = dt_consume_begin(dtp, fp, pf, rf, arg)) != 0) + return (rval); + + for (i = 0; i < max_ncpus; i++) { + dtrace_bufdesc_t *buf; -#if defined(sun) - if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) { -#else - if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &buf) == -1) { -#endif /* - * If we failed with ENOENT, it may be because the - * CPU was unconfigured -- this is okay. Any other - * error, however, is unexpected. + * If we have stopped, we want to process the CPU on + * which the END probe was processed only _after_ we + * have processed everything else. */ - if (errno == ENOENT) + if (dtp->dt_stopped && (i == dtp->dt_endedon)) continue; - return (dt_set_errno(dtp, errno)); + if (dt_get_buf(dtp, i, &buf) != 0) + return (-1); + if (buf == NULL) + continue; + + dtp->dt_flow = 0; + dtp->dt_indent = 0; + dtp->dt_prefix = NULL; + rval = dt_consume_cpu(dtp, fp, i, + buf, B_FALSE, pf, rf, arg); + dt_put_buf(dtp, buf); + if (rval != 0) + return (rval); } + if (dtp->dt_stopped) { + dtrace_bufdesc_t *buf; + + if (dt_get_buf(dtp, dtp->dt_endedon, &buf) != 0) + return (-1); + if (buf == NULL) + return (0); - if ((rval = dt_consume_cpu(dtp, fp, i, buf, pf, rf, arg)) != 0) + rval = dt_consume_cpu(dtp, fp, dtp->dt_endedon, + buf, B_FALSE, pf, rf, arg); + dt_put_buf(dtp, buf); return (rval); - } + } + } else { + /* + * The output will be in the order it was traced (or for + * speculations, when it was committed). We retrieve a buffer + * from each CPU and put it into a priority queue, which sorts + * based on the first entry in the buffer. This is sufficient + * because entries within a buffer are already sorted. + * + * We then consume records one at a time, always consuming the + * oldest record, as determined by the priority queue. When + * we reach the end of the time covered by these buffers, + * we need to stop and retrieve more records on the next pass. + * The kernel tells us the time covered by each buffer, in + * dtbd_timestamp. The first buffer's timestamp tells us the + * time covered by all buffers, as subsequently retrieved + * buffers will cover to a more recent time. + */ - if (!dtp->dt_stopped) - return (0); + uint64_t *drops = alloca(max_ncpus * sizeof (uint64_t)); + uint64_t first_timestamp = 0; + uint_t cookie = 0; + dtrace_bufdesc_t *buf; - buf->dtbd_cpu = dtp->dt_endedon; + bzero(drops, max_ncpus * sizeof (uint64_t)); + + if (dtp->dt_bufq == NULL) { + dtp->dt_bufq = dt_pq_init(dtp, max_ncpus * 2, + dt_buf_oldest, NULL); + if (dtp->dt_bufq == NULL) /* ENOMEM */ + return (-1); + } + + /* Retrieve data from each CPU. */ + (void) dtrace_getopt(dtp, "bufsize", &size); + for (i = 0; i < max_ncpus; i++) { + dtrace_bufdesc_t *buf; + + if (dt_get_buf(dtp, i, &buf) != 0) + return (-1); + if (buf != NULL) { + if (first_timestamp == 0) + first_timestamp = buf->dtbd_timestamp; + assert(buf->dtbd_timestamp >= first_timestamp); + + dt_pq_insert(dtp->dt_bufq, buf); + drops[i] = buf->dtbd_drops; + buf->dtbd_drops = 0; + } + } + + /* Consume records. */ + for (;;) { + dtrace_bufdesc_t *buf = dt_pq_pop(dtp->dt_bufq); + uint64_t timestamp; + + if (buf == NULL) + break; + + timestamp = dt_buf_oldest(buf, dtp); + assert(timestamp >= dtp->dt_last_timestamp); + dtp->dt_last_timestamp = timestamp; + + if (timestamp == buf->dtbd_timestamp) { + /* + * We've reached the end of the time covered + * by this buffer. If this is the oldest + * buffer, we must do another pass + * to retrieve more data. + */ + dt_put_buf(dtp, buf); + if (timestamp == first_timestamp && + !dtp->dt_stopped) + break; + continue; + } + + if ((rval = dt_consume_cpu(dtp, fp, + buf->dtbd_cpu, buf, B_TRUE, pf, rf, arg)) != 0) + return (rval); + dt_pq_insert(dtp->dt_bufq, buf); + } + + /* Consume drops. */ + for (i = 0; i < max_ncpus; i++) { + if (drops[i] != 0) { + int error = dt_handle_cpudrop(dtp, i, + DTRACEDROP_PRINCIPAL, drops[i]); + if (error != 0) + return (error); + } + } -#if defined(sun) - if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) { -#else - if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &buf) == -1) { -#endif /* - * This _really_ shouldn't fail, but it is strictly speaking - * possible for this to return ENOENT if the CPU that called - * the END enabling somehow managed to become unconfigured. - * It's unclear how the user can possibly expect anything - * rational to happen in this case -- the state has been thrown - * out along with the unconfigured CPU -- so we'll just drive - * on... + * Reduce memory usage by re-allocating smaller buffers + * for the "remnants". */ - if (errno == ENOENT) - return (0); - - return (dt_set_errno(dtp, errno)); + while (buf = dt_pq_walk(dtp->dt_bufq, &cookie)) + dt_realloc_buf(dtp, buf, buf->dtbd_size); } - return (dt_consume_cpu(dtp, fp, dtp->dt_endedon, buf, pf, rf, arg)); + return (0); } diff --git a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_impl.h b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_impl.h index 61d901b..c066b03 100644 --- a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_impl.h +++ b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_impl.h @@ -26,7 +26,7 @@ /* * Copyright (c) 2011, Joyent, Inc. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _DT_IMPL_H @@ -64,6 +64,7 @@ extern "C" { #include #include #include +#include struct dt_module; /* see below */ struct dt_pfdict; /* see */ @@ -239,6 +240,7 @@ struct dtrace_hdl { uint_t dt_provbuckets; /* number of provider hash buckets */ uint_t dt_nprovs; /* number of providers in hash and list */ dt_proc_hash_t *dt_procs; /* hash table of grabbed process handles */ + char **dt_proc_env; /* additional environment variables */ dt_intdesc_t dt_ints[6]; /* cached integer type descriptions */ ctf_id_t dt_type_func; /* cached CTF identifier for function type */ ctf_id_t dt_type_fptr; /* cached CTF identifier for function pointer */ @@ -257,7 +259,7 @@ struct dtrace_hdl { int dt_maxstrdata; /* max strdata ID */ char **dt_strdata; /* pointer to strdata array */ dt_aggregate_t dt_aggregate; /* aggregate */ - dtrace_bufdesc_t dt_buf; /* staging buffer */ + dt_pq_t *dt_bufq; /* CPU-specific data queue */ struct dt_pfdict *dt_pfdict; /* dictionary of printf conversions */ dt_version_t dt_vmax; /* optional ceiling on program API binding */ dtrace_attribute_t dt_amin; /* optional floor on program attributes */ @@ -326,6 +328,11 @@ struct dtrace_hdl { struct utsname dt_uts; /* uname(2) information for system */ dt_list_t dt_lib_dep; /* scratch linked-list of lib dependencies */ dt_list_t dt_lib_dep_sorted; /* dependency sorted library list */ + dtrace_flowkind_t dt_flow; /* flow kind */ + const char *dt_prefix; /* recommended flow prefix */ + int dt_indent; /* recommended flow indent */ + dtrace_epid_t dt_last_epid; /* most recently consumed EPID */ + uint64_t dt_last_timestamp; /* most recently consumed timestamp */ }; /* diff --git a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_open.c b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_open.c index dcc4e7c..25f9956 100644 --- a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_open.c +++ b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_open.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, Joyent, Inc. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -92,7 +92,7 @@ /* * The version number should be increased for every customer visible release - * of Solaris. The major number should be incremented when a fundamental + * of DTrace. The major number should be incremented when a fundamental * change has been made that would affect all consumers, and would reflect * sweeping changes to DTrace or the D language. The minor number should be * incremented when a change is introduced that could break scripts that had @@ -121,8 +121,9 @@ #define DT_VERS_1_8 DT_VERSION_NUMBER(1, 8, 0) #define DT_VERS_1_8_1 DT_VERSION_NUMBER(1, 8, 1) #define DT_VERS_1_9 DT_VERSION_NUMBER(1, 9, 0) -#define DT_VERS_LATEST DT_VERS_1_9 -#define DT_VERS_STRING "Sun D 1.9" +#define DT_VERS_1_9_1 DT_VERSION_NUMBER(1, 9, 1) +#define DT_VERS_LATEST DT_VERS_1_9_1 +#define DT_VERS_STRING "Sun D 1.9.1" const dt_version_t _dtrace_versions[] = { DT_VERS_1_0, /* D API 1.0.0 (PSARC 2001/466) Solaris 10 FCS */ @@ -143,6 +144,7 @@ const dt_version_t _dtrace_versions[] = { DT_VERS_1_8, /* D API 1.8 */ DT_VERS_1_8_1, /* D API 1.8.1 */ DT_VERS_1_9, /* D API 1.9 */ + DT_VERS_1_9_1, /* D API 1.9.1 */ 0 }; @@ -1630,7 +1632,6 @@ dtrace_close(dtrace_hdl_t *dtp) dt_strdata_destroy(dtp); dt_buffered_destroy(dtp); dt_aggregate_destroy(dtp); - free(dtp->dt_buf.dtbd_data); dt_pfdict_destroy(dtp); dt_provmod_destroy(&dtp->dt_provmod); dt_dof_fini(dtp); diff --git a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_options.c b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_options.c index fa1407f..020df2f 100644 --- a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_options.c +++ b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_options.c @@ -26,6 +26,10 @@ #pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include #include #include @@ -368,6 +372,61 @@ dt_opt_pgmax(dtrace_hdl_t *dtp, const char *arg, uintptr_t option) return (0); } +static int +dt_opt_setenv(dtrace_hdl_t *dtp, const char *arg, uintptr_t option) +{ + char **p; + char *var; + int i; + + /* + * We can't effectively set environment variables from #pragma lines + * since the processes have already been spawned. + */ + if (dtp->dt_pcb != NULL) + return (dt_set_errno(dtp, EDT_BADOPTCTX)); + + if (arg == NULL) + return (dt_set_errno(dtp, EDT_BADOPTVAL)); + + if (!option && strchr(arg, '=') != NULL) + return (dt_set_errno(dtp, EDT_BADOPTVAL)); + + for (i = 1, p = dtp->dt_proc_env; *p != NULL; i++, p++) + continue; + + for (p = dtp->dt_proc_env; *p != NULL; p++) { + var = strchr(*p, '='); + if (var == NULL) + var = *p + strlen(*p); + if (strncmp(*p, arg, var - *p) == 0) { + dt_free(dtp, *p); + *p = dtp->dt_proc_env[i - 1]; + dtp->dt_proc_env[i - 1] = NULL; + i--; + } + } + + if (option) { + if ((var = strdup(arg)) == NULL) + return (dt_set_errno(dtp, EDT_NOMEM)); + + if ((p = dt_alloc(dtp, sizeof (char *) * (i + 1))) == NULL) { + dt_free(dtp, var); + return (dt_set_errno(dtp, EDT_NOMEM)); + } + + bcopy(dtp->dt_proc_env, p, sizeof (char *) * i); + dt_free(dtp, dtp->dt_proc_env); + dtp->dt_proc_env = p; + + dtp->dt_proc_env[i - 1] = var; + dtp->dt_proc_env[i] = NULL; + } + + return (0); +} + /*ARGSUSED*/ static int dt_opt_stdc(dtrace_hdl_t *dtp, const char *arg, uintptr_t option) @@ -411,7 +470,6 @@ dt_opt_syslibdir(dtrace_hdl_t *dtp, const char *arg, uintptr_t option) return (0); } - /*ARGSUSED*/ static int dt_opt_tree(dtrace_hdl_t *dtp, const char *arg, uintptr_t option) @@ -912,6 +970,7 @@ static const dt_option_t _dtrace_ctoptions[] = { { "pgmax", dt_opt_pgmax }, { "preallocate", dt_opt_preallocate }, { "pspec", dt_opt_cflags, DTRACE_C_PSPEC }, + { "setenv", dt_opt_setenv, 1 }, { "stdc", dt_opt_stdc }, { "strip", dt_opt_dflags, DTRACE_D_STRIP }, { "syslibdir", dt_opt_syslibdir }, @@ -920,6 +979,7 @@ static const dt_option_t _dtrace_ctoptions[] = { { "udefs", dt_opt_invcflags, DTRACE_C_UNODEF }, { "undef", dt_opt_cpp_opts, (uintptr_t)"-U" }, { "unodefs", dt_opt_cflags, DTRACE_C_UNODEF }, + { "unsetenv", dt_opt_setenv, 0 }, { "verbose", dt_opt_cflags, DTRACE_C_DIFV }, { "version", dt_opt_version }, { "zdefs", dt_opt_cflags, DTRACE_C_ZDEFS }, @@ -947,6 +1007,7 @@ static const dt_option_t _dtrace_rtoptions[] = { { "statusrate", dt_opt_rate, DTRACEOPT_STATUSRATE }, { "strsize", dt_opt_strsize, DTRACEOPT_STRSIZE }, { "ustackframes", dt_opt_runtime, DTRACEOPT_USTACKFRAMES }, + { "temporal", dt_opt_runtime, DTRACEOPT_TEMPORAL }, { NULL, NULL, 0 } }; diff --git a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_pq.c b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_pq.c new file mode 100644 index 0000000..0cd556a --- /dev/null +++ b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_pq.c @@ -0,0 +1,157 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include + +/* + * Create a new priority queue. + * + * size is the maximum number of items that will be stored in the priority + * queue at one time. + */ +dt_pq_t * +dt_pq_init(dtrace_hdl_t *dtp, uint_t size, dt_pq_value_f value_cb, void *cb_arg) +{ + dt_pq_t *p; + assert(size > 1); + + if ((p = dt_zalloc(dtp, sizeof (dt_pq_t))) == NULL) + return (NULL); + + p->dtpq_items = dt_zalloc(dtp, size * sizeof (p->dtpq_items[0])); + if (p->dtpq_items == NULL) { + dt_free(dtp, p); + return (NULL); + } + + p->dtpq_hdl = dtp; + p->dtpq_size = size; + p->dtpq_last = 1; + p->dtpq_value = value_cb; + p->dtpq_arg = cb_arg; + + return (p); +} + +void +dt_pq_fini(dt_pq_t *p) +{ + dtrace_hdl_t *dtp = p->dtpq_hdl; + + dt_free(dtp, p->dtpq_items); + dt_free(dtp, p); +} + +static uint64_t +dt_pq_getvalue(dt_pq_t *p, uint_t index) +{ + void *item = p->dtpq_items[index]; + return (p->dtpq_value(item, p->dtpq_arg)); +} + +void +dt_pq_insert(dt_pq_t *p, void *item) +{ + uint_t i; + + assert(p->dtpq_last < p->dtpq_size); + + i = p->dtpq_last++; + p->dtpq_items[i] = item; + + while (i > 1 && dt_pq_getvalue(p, i) < dt_pq_getvalue(p, i / 2)) { + void *tmp = p->dtpq_items[i]; + p->dtpq_items[i] = p->dtpq_items[i / 2]; + p->dtpq_items[i / 2] = tmp; + i /= 2; + } +} + +/* + * Return elements from the priority queue. *cookie should be zero when first + * called. Returns NULL when there are no more elements. + */ +void * +dt_pq_walk(dt_pq_t *p, uint_t *cookie) +{ + (*cookie)++; + if (*cookie >= p->dtpq_last) + return (NULL); + + return (p->dtpq_items[*cookie]); +} + +void * +dt_pq_pop(dt_pq_t *p) +{ + uint_t i = 1; + void *ret; + + assert(p->dtpq_last > 0); + + if (p->dtpq_last == 1) + return (NULL); + + ret = p->dtpq_items[1]; + + p->dtpq_last--; + p->dtpq_items[1] = p->dtpq_items[p->dtpq_last]; + p->dtpq_items[p->dtpq_last] = NULL; + + for (;;) { + uint_t lc = i * 2; + uint_t rc = i * 2 + 1; + uint_t c; + uint64_t v; + void *tmp; + + if (lc >= p->dtpq_last) + break; + + if (rc >= p->dtpq_last) { + c = lc; + v = dt_pq_getvalue(p, lc); + } else { + uint64_t lv = dt_pq_getvalue(p, lc); + uint64_t rv = dt_pq_getvalue(p, rc); + + if (lv < rv) { + c = lc; + v = lv; + } else { + c = rc; + v = rv; + } + } + + if (v >= dt_pq_getvalue(p, i)) + break; + + tmp = p->dtpq_items[i]; + p->dtpq_items[i] = p->dtpq_items[c]; + p->dtpq_items[c] = tmp; + + i = c; + } + + return (ret); +} diff --git a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_pq.h b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_pq.h new file mode 100644 index 0000000..8184a90 --- /dev/null +++ b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_pq.h @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _DT_PQ_H +#define _DT_PQ_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint64_t (*dt_pq_value_f)(void *, void *); + +typedef struct dt_pq { + dtrace_hdl_t *dtpq_hdl; /* dtrace handle */ + void **dtpq_items; /* array of elements */ + uint_t dtpq_size; /* count of allocated elements */ + uint_t dtpq_last; /* next free slot */ + dt_pq_value_f dtpq_value; /* callback to get the value */ + void *dtpq_arg; /* callback argument */ +} dt_pq_t; + +extern dt_pq_t *dt_pq_init(dtrace_hdl_t *, uint_t size, dt_pq_value_f, void *); +extern void dt_pq_fini(dt_pq_t *); + +extern void dt_pq_insert(dt_pq_t *, void *); +extern void *dt_pq_pop(dt_pq_t *); +extern void *dt_pq_walk(dt_pq_t *, uint_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DT_PQ_H */ diff --git a/cddl/lib/libdtrace/Makefile b/cddl/lib/libdtrace/Makefile index 7dd0afd..4fd7fae 100644 --- a/cddl/lib/libdtrace/Makefile +++ b/cddl/lib/libdtrace/Makefile @@ -31,6 +31,7 @@ SRCS= dt_aggregate.c \ dt_parser.c \ dt_pcb.c \ dt_pid.c \ + dt_pq.c \ dt_pragma.c \ dt_print.c \ dt_printf.c \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c index c8bb4de..729a2cb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c @@ -23,6 +23,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved * Use is subject to license terms. */ @@ -2345,9 +2346,10 @@ dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu, { dtrace_speculation_t *spec; dtrace_buffer_t *src, *dest; - uintptr_t daddr, saddr, dlimit; + uintptr_t daddr, saddr, dlimit, slimit; dtrace_speculation_state_t current, new = 0; intptr_t offs; + uint64_t timestamp; if (which == 0) return; @@ -2423,7 +2425,37 @@ dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu, } /* - * We have the space; copy the buffer across. (Note that this is a + * We have sufficient space to copy the speculative buffer into the + * primary buffer. First, modify the speculative buffer, filling + * in the timestamp of all entries with the current time. The data + * must have the commit() time rather than the time it was traced, + * so that all entries in the primary buffer are in timestamp order. + */ + timestamp = dtrace_gethrtime(); + saddr = (uintptr_t)src->dtb_tomax; + slimit = saddr + src->dtb_offset; + while (saddr < slimit) { + size_t size; + dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr; + + if (dtrh->dtrh_epid == DTRACE_EPIDNONE) { + saddr += sizeof (dtrace_epid_t); + continue; + } + ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs); + size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size; + + ASSERT3U(saddr + size, <=, slimit); + ASSERT3U(size, >=, sizeof (dtrace_rechdr_t)); + ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX); + + DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp); + + saddr += size; + } + + /* + * Copy the buffer across. (Note that this is a * highly subobtimal bcopy(); in the unlikely event that this becomes * a serious performance issue, a high-performance DTrace-specific * bcopy() should obviously be invented.) @@ -6206,7 +6238,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, if (now - state->dts_alive > dtrace_deadman_timeout) { /* * We seem to be dead. Unless we (a) have kernel - * destructive permissions (b) have expicitly enabled + * destructive permissions (b) have explicitly enabled * destructive actions and (c) destructive actions have * not been disabled, we're going to transition into * the KILLED state, from which no further processing @@ -6234,8 +6266,18 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, tomax = buf->dtb_tomax; ASSERT(tomax != NULL); - if (ecb->dte_size != 0) - DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid); + if (ecb->dte_size != 0) { + dtrace_rechdr_t dtrh; + if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) { + mstate.dtms_timestamp = dtrace_gethrtime(); + mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP; + } + ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t)); + dtrh.dtrh_epid = ecb->dte_epid; + DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, + mstate.dtms_timestamp); + *((dtrace_rechdr_t *)(tomax + offs)) = dtrh; + } mstate.dtms_epid = ecb->dte_epid; mstate.dtms_present |= DTRACE_MSTATE_EPID; @@ -6382,7 +6424,9 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, continue; switch (act->dta_kind) { - case DTRACEACT_SPECULATE: + case DTRACEACT_SPECULATE: { + dtrace_rechdr_t *dtrh; + ASSERT(buf == &state->dts_buffer[cpuid]); buf = dtrace_speculation_buffer(state, cpuid, val); @@ -6404,10 +6448,23 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, tomax = buf->dtb_tomax; ASSERT(tomax != NULL); - if (ecb->dte_size != 0) - DTRACE_STORE(uint32_t, tomax, offs, - ecb->dte_epid); + if (ecb->dte_size == 0) + continue; + + ASSERT3U(ecb->dte_size, >=, + sizeof (dtrace_rechdr_t)); + dtrh = ((void *)(tomax + offs)); + dtrh->dtrh_epid = ecb->dte_epid; + /* + * When the speculation is committed, all of + * the records in the speculative buffer will + * have their timestamps set to the commit + * time. Until then, it is set to a sentinel + * value, for debugability. + */ + DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX); continue; + } case DTRACEACT_PRINTM: { /* The DIF returns a 'memref'. */ @@ -9754,9 +9811,9 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe) /* * The default size is the size of the default action: recording - * the epid. + * the header. */ - ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t); + ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t); ecb->dte_alignment = sizeof (dtrace_epid_t); epid = state->dts_epid++; @@ -9854,122 +9911,89 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) static void dtrace_ecb_resize(dtrace_ecb_t *ecb) { - uint32_t maxalign = sizeof (dtrace_epid_t); - uint32_t align = sizeof (uint8_t), offs, diff; dtrace_action_t *act; - int wastuple = 0; + uint32_t curneeded = UINT32_MAX; uint32_t aggbase = UINT32_MAX; - dtrace_state_t *state = ecb->dte_state; /* - * If we record anything, we always record the epid. (And we always - * record it first.) + * If we record anything, we always record the dtrace_rechdr_t. (And + * we always record it first.) */ - offs = sizeof (dtrace_epid_t); - ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t); + ecb->dte_size = sizeof (dtrace_rechdr_t); + ecb->dte_alignment = sizeof (dtrace_epid_t); for (act = ecb->dte_action; act != NULL; act = act->dta_next) { dtrace_recdesc_t *rec = &act->dta_rec; + ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1); - if ((align = rec->dtrd_alignment) > maxalign) - maxalign = align; - - if (!wastuple && act->dta_intuple) { - /* - * This is the first record in a tuple. Align the - * offset to be at offset 4 in an 8-byte aligned - * block. - */ - diff = offs + sizeof (dtrace_aggid_t); - - if ((diff = (diff & (sizeof (uint64_t) - 1)))) - offs += sizeof (uint64_t) - diff; - - aggbase = offs - sizeof (dtrace_aggid_t); - ASSERT(!(aggbase & (sizeof (uint64_t) - 1))); - } - - /*LINTED*/ - if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) { - /* - * The current offset is not properly aligned; align it. - */ - offs += align - diff; - } - - rec->dtrd_offset = offs; - - if (offs + rec->dtrd_size > ecb->dte_needed) { - ecb->dte_needed = offs + rec->dtrd_size; - - if (ecb->dte_needed > state->dts_needed) - state->dts_needed = ecb->dte_needed; - } + ecb->dte_alignment = MAX(ecb->dte_alignment, + rec->dtrd_alignment); if (DTRACEACT_ISAGG(act->dta_kind)) { dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act; - dtrace_action_t *first = agg->dtag_first, *prev; - ASSERT(rec->dtrd_size != 0 && first != NULL); - ASSERT(wastuple); + ASSERT(rec->dtrd_size != 0); + ASSERT(agg->dtag_first != NULL); + ASSERT(act->dta_prev->dta_intuple); ASSERT(aggbase != UINT32_MAX); + ASSERT(curneeded != UINT32_MAX); agg->dtag_base = aggbase; - while ((prev = first->dta_prev) != NULL && - DTRACEACT_ISAGG(prev->dta_kind)) { - agg = (dtrace_aggregation_t *)prev; - first = agg->dtag_first; - } + curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment); + rec->dtrd_offset = curneeded; + curneeded += rec->dtrd_size; + ecb->dte_needed = MAX(ecb->dte_needed, curneeded); - if (prev != NULL) { - offs = prev->dta_rec.dtrd_offset + - prev->dta_rec.dtrd_size; - } else { - offs = sizeof (dtrace_epid_t); + aggbase = UINT32_MAX; + curneeded = UINT32_MAX; + } else if (act->dta_intuple) { + if (curneeded == UINT32_MAX) { + /* + * This is the first record in a tuple. Align + * curneeded to be at offset 4 in an 8-byte + * aligned block. + */ + ASSERT(act->dta_prev == NULL || + !act->dta_prev->dta_intuple); + ASSERT3U(aggbase, ==, UINT32_MAX); + curneeded = P2PHASEUP(ecb->dte_size, + sizeof (uint64_t), sizeof (dtrace_aggid_t)); + + aggbase = curneeded - sizeof (dtrace_aggid_t); + ASSERT(IS_P2ALIGNED(aggbase, + sizeof (uint64_t))); } - wastuple = 0; + curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment); + rec->dtrd_offset = curneeded; + curneeded += rec->dtrd_size; } else { - if (!act->dta_intuple) - ecb->dte_size = offs + rec->dtrd_size; + /* tuples must be followed by an aggregation */ + ASSERT(act->dta_prev == NULL || + !act->dta_prev->dta_intuple); - offs += rec->dtrd_size; + ecb->dte_size = P2ROUNDUP(ecb->dte_size, + rec->dtrd_alignment); + rec->dtrd_offset = ecb->dte_size; + ecb->dte_size += rec->dtrd_size; + ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size); } - - wastuple = act->dta_intuple; } if ((act = ecb->dte_action) != NULL && !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) && - ecb->dte_size == sizeof (dtrace_epid_t)) { + ecb->dte_size == sizeof (dtrace_rechdr_t)) { /* - * If the size is still sizeof (dtrace_epid_t), then all + * If the size is still sizeof (dtrace_rechdr_t), then all * actions store no data; set the size to 0. */ - ecb->dte_alignment = maxalign; ecb->dte_size = 0; - - /* - * If the needed space is still sizeof (dtrace_epid_t), then - * all actions need no additional space; set the needed - * size to 0. - */ - if (ecb->dte_needed == sizeof (dtrace_epid_t)) - ecb->dte_needed = 0; - - return; } - /* - * Set our alignment, and make sure that the dte_size and dte_needed - * are aligned to the size of an EPID. - */ - ecb->dte_alignment = maxalign; - ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) & - ~(sizeof (dtrace_epid_t) - 1); - ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) & - ~(sizeof (dtrace_epid_t) - 1); - ASSERT(ecb->dte_size <= ecb->dte_needed); + ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t)); + ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t))); + ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, + ecb->dte_needed); } static dtrace_action_t * @@ -10349,7 +10373,7 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc) break; case DTRACEACT_SPECULATE: - if (ecb->dte_size > sizeof (dtrace_epid_t)) + if (ecb->dte_size > sizeof (dtrace_rechdr_t)) return (EINVAL); if (dp == NULL) @@ -10470,7 +10494,7 @@ dtrace_ecb_action_remove(dtrace_ecb_t *ecb) ecb->dte_action = NULL; ecb->dte_action_last = NULL; - ecb->dte_size = sizeof (dtrace_epid_t); + ecb->dte_size = 0; } static void @@ -10739,12 +10763,13 @@ dtrace_buffer_switch(dtrace_buffer_t *buf) caddr_t tomax = buf->dtb_tomax; caddr_t xamot = buf->dtb_xamot; dtrace_icookie_t cookie; - hrtime_t now = dtrace_gethrtime(); + hrtime_t now; ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH)); ASSERT(!(buf->dtb_flags & DTRACEBUF_RING)); cookie = dtrace_interrupt_disable(); + now = dtrace_gethrtime(); buf->dtb_tomax = xamot; buf->dtb_xamot = tomax; buf->dtb_xamot_drops = buf->dtb_drops; @@ -11110,7 +11135,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, if (epid == DTRACE_EPIDNONE) { size = sizeof (uint32_t); } else { - ASSERT(epid <= state->dts_necbs); + ASSERT3U(epid, <=, state->dts_necbs); ASSERT(state->dts_ecbs[epid - 1] != NULL); size = state->dts_ecbs[epid - 1]->dte_size; @@ -16387,6 +16412,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) desc.dtbd_drops = buf->dtb_drops; desc.dtbd_errors = buf->dtb_errors; desc.dtbd_oldest = buf->dtb_xamot_offset; + desc.dtbd_timestamp = dtrace_gethrtime(); mutex_exit(&dtrace_lock); @@ -16439,6 +16465,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) desc.dtbd_drops = buf->dtb_xamot_drops; desc.dtbd_errors = buf->dtb_xamot_errors; desc.dtbd_oldest = 0; + desc.dtbd_timestamp = buf->dtb_switched; mutex_exit(&dtrace_lock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h index 4968352..4595c2e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h @@ -26,6 +26,7 @@ /* * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DTRACE_H @@ -930,10 +931,10 @@ typedef struct dtrace_ecbdesc { * DTrace Metadata Description Structures * * DTrace separates the trace data stream from the metadata stream. The only - * metadata tokens placed in the data stream are enabled probe identifiers - * (EPIDs) or (in the case of aggregations) aggregation identifiers. In order - * to determine the structure of the data, DTrace consumers pass the token to - * the kernel, and receive in return a corresponding description of the enabled + * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID + + * timestamp) or (in the case of aggregations) aggregation identifiers. To + * determine the structure of the data, DTrace consumers pass the token to the + * kernel, and receive in return a corresponding description of the enabled * probe (via the dtrace_eprobedesc structure) or the aggregation (via the * dtrace_aggdesc structure). Both of these structures are expressed in terms * of record descriptions (via the dtrace_recdesc structure) that describe the @@ -1028,7 +1029,8 @@ typedef struct dtrace_fmtdesc { #define DTRACEOPT_AGGSORTREV 24 /* reverse-sort aggregations */ #define DTRACEOPT_AGGSORTPOS 25 /* agg. position to sort on */ #define DTRACEOPT_AGGSORTKEYPOS 26 /* agg. key position to sort on */ -#define DTRACEOPT_MAX 27 /* number of options */ +#define DTRACEOPT_TEMPORAL 27 /* temporally ordered output */ +#define DTRACEOPT_MAX 28 /* number of options */ #define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */ @@ -1048,7 +1050,9 @@ typedef struct dtrace_fmtdesc { * where user-level wishes the kernel to snapshot the buffer to (the * dtbd_data field). The kernel uses the same structure to pass back some * information regarding the buffer: the size of data actually copied out, the - * number of drops, the number of errors, and the offset of the oldest record. + * number of drops, the number of errors, the offset of the oldest record, + * and the time of the snapshot. + * * If the buffer policy is a "switch" policy, taking a snapshot of the * principal buffer has the additional effect of switching the active and * inactive buffers. Taking a snapshot of the aggregation buffer _always_ has @@ -1061,9 +1065,30 @@ typedef struct dtrace_bufdesc { uint64_t dtbd_drops; /* number of drops */ DTRACE_PTR(char, dtbd_data); /* data */ uint64_t dtbd_oldest; /* offset of oldest record */ + uint64_t dtbd_timestamp; /* hrtime of snapshot */ } dtrace_bufdesc_t; /* + * Each record in the buffer (dtbd_data) begins with a header that includes + * the epid and a timestamp. The timestamp is split into two 4-byte parts + * so that we do not require 8-byte alignment. + */ +typedef struct dtrace_rechdr { + dtrace_epid_t dtrh_epid; /* enabled probe id */ + uint32_t dtrh_timestamp_hi; /* high bits of hrtime_t */ + uint32_t dtrh_timestamp_lo; /* low bits of hrtime_t */ +} dtrace_rechdr_t; + +#define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) \ + ((dtrh)->dtrh_timestamp_lo + \ + ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32)) + +#define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) { \ + (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime; \ + (dtrh)->dtrh_timestamp_hi = hrtime >> 32; \ +} + +/* * DTrace Status * * The status of DTrace is relayed via the dtrace_status structure. This diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h index 4d9a4f8..7355aa8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h @@ -23,6 +23,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Use is subject to license terms. */ @@ -209,15 +210,18 @@ typedef struct dtrace_hash { * predicate is non-NULL, the DIF object is executed. If the result is * non-zero, the action list is processed, with each action being executed * accordingly. When the action list has been completely executed, processing - * advances to the next ECB. processing advances to the next ECB. If the - * result is non-zero; For each ECB, it first determines the The ECB - * abstraction allows disjoint consumers to multiplex on single probes. + * advances to the next ECB. The ECB abstraction allows disjoint consumers + * to multiplex on single probes. + * + * Execution of the ECB results in consuming dte_size bytes in the buffer + * to record data. During execution, dte_needed bytes must be available in + * the buffer. This space is used for both recorded data and tuple data. */ struct dtrace_ecb { dtrace_epid_t dte_epid; /* enabled probe ID */ uint32_t dte_alignment; /* required alignment */ - size_t dte_needed; /* bytes needed */ - size_t dte_size; /* total size of payload */ + size_t dte_needed; /* space needed for execution */ + size_t dte_size; /* size of recorded payload */ dtrace_predicate_t *dte_predicate; /* predicate, if any */ dtrace_action_t *dte_action; /* actions, if any */ dtrace_ecb_t *dte_next; /* next ECB on probe */ @@ -275,27 +279,30 @@ typedef struct dtrace_aggregation { * the EPID, the consumer can determine the data layout. (The data buffer * layout is shown schematically below.) By assuring that one can determine * data layout from the EPID, the metadata stream can be separated from the - * data stream -- simplifying the data stream enormously. - * - * base of data buffer ---> +------+--------------------+------+ - * | EPID | data | EPID | - * +------+--------+------+----+------+ - * | data | EPID | data | - * +---------------+------+-----------+ - * | data, cont. | - * +------+--------------------+------+ - * | EPID | data | | - * +------+--------------------+ | - * | || | - * | || | - * | \/ | - * : : - * . . - * . . - * . . - * : : - * | | - * limit of data buffer ---> +----------------------------------+ + * data stream -- simplifying the data stream enormously. The ECB always + * proceeds the recorded data as part of the dtrace_rechdr_t structure that + * includes the EPID and a high-resolution timestamp used for output ordering + * consistency. + * + * base of data buffer ---> +--------+--------------------+--------+ + * | rechdr | data | rechdr | + * +--------+------+--------+----+--------+ + * | data | rechdr | data | + * +---------------+--------+-------------+ + * | data, cont. | + * +--------+--------------------+--------+ + * | rechdr | data | | + * +--------+--------------------+ | + * | || | + * | || | + * | \/ | + * : : + * . . + * . . + * . . + * : : + * | | + * limit of data buffer ---> +--------------------------------------+ * * When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the * principal buffer (both scratch and payload) exceed the available space. If diff --git a/sys/cddl/dev/dtrace/dtrace_ioctl.c b/sys/cddl/dev/dtrace/dtrace_ioctl.c index 6a01e13..bb03ffd 100644 --- a/sys/cddl/dev/dtrace/dtrace_ioctl.c +++ b/sys/cddl/dev/dtrace/dtrace_ioctl.c @@ -274,6 +274,7 @@ dtrace_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, desc.dtbd_drops = buf->dtb_drops; desc.dtbd_errors = buf->dtb_errors; desc.dtbd_oldest = buf->dtb_xamot_offset; + desc.dtbd_timestamp = dtrace_gethrtime(); mutex_exit(&dtrace_lock); @@ -328,6 +329,7 @@ dtrace_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, desc.dtbd_drops = buf->dtb_xamot_drops; desc.dtbd_errors = buf->dtb_xamot_errors; desc.dtbd_oldest = 0; + desc.dtbd_timestamp = buf->dtb_switched; mutex_exit(&dtrace_lock);