/*
 * memrate: a small multi-threaded memory bandwidth benchmark.
 *
 * One thread is started per requested CPU and pinned to it.  Every thread
 * repeatedly applies a read, write or copy operation to randomly chosen
 * blocks of a memory buffer for a fixed number of TSC ticks, and the
 * per-CPU and aggregate byte rates are printed at the end.
 *
 * The code uses the rdtsc and 64-bit movnti instructions through GCC-style
 * inline assembly, so it only builds for x86-64.
 *
 * NOTE(review): the header list below was reconstructed from the
 * identifiers this file uses; confirm it against the project's original
 * include list.
 */

/* Needed on Linux for cpu_set_t, CPU_SET() and sched_setaffinity(). */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <sys/types.h>
#include <sys/time.h>

#ifdef __FreeBSD__
#include <sys/param.h>
#include <sys/cpuset.h>
#endif

#ifdef __linux__
#include <sched.h>
#endif

#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Read the CPU timestamp counter.  rdtsc returns the low half in %eax and
 * the high half in %edx; the two are combined into one 64-bit value.
 */
static __inline uint64_t
rdtsc(void)
{
	uint32_t low, high;

	__asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high));
	return (low | ((uint64_t)high << 32));
}

uint64_t ticks = 25;	/* Default to a few seconds of runtime. */
char *progname;		/* argv[0], used in diagnostics. */
int ncpu = 1;		/* Number of threads/CPUs to run on (-n). */
int msize = (256 * 1024 * 1024);	/* Buffer size in bytes (-m). */
int blocks;		/* Number of bsize-byte blocks in a buffer. */
int fflag;		/* -f: use the non-temporal store routines. */
int gflag;		/* -g: one buffer shared by every thread. */
int wflag;		/* -w: write test. */
int rflag;		/* -r: read test. */
int bsize;		/* Bytes touched per operation. */

#define	TICK_INTERVAL	1000000000ULL	/* One billion ticks */
#define	BLOCK_MIN	64
#define	BLOCK_SIZE	8192

/* Bytes and 64-bit words handled per unrolled non-temporal iteration. */
#define	NTI_CHUNK	64
#define	NTI_WORDS	(NTI_CHUNK / 8)

/* Highest CPU count accepted by -n; bounded by what a CPU set can hold. */
#ifdef CPU_SETSIZE
#define	MR_MAXCPU	CPU_SETSIZE
#else
#define	MR_MAXCPU	1024
#endif

/* Round x up to a multiple of y; y must be a power of two. */
#ifndef roundup2
#define	roundup2(x, y)	(((x)+((y)-1))&(~((y)-1)))
#endif

/*
 * 64-bit word that may alias any object and carries no alignment
 * requirement, so the byte buffers can be accessed a word at a time.
 */
typedef uint64_t __attribute__((__may_alias__, __aligned__(1))) mr_word;

/*
 * A benchmark operation: touches one or two blocks inside buf and returns
 * the number of bytes it accounts for.
 */
typedef long mr_op(char *buf, int iter);

/* Per-thread parameters (filled in by mr_start) and results. */
struct mr_thread {
	pthread_t	mt_thr;		/* Thread handle. */
	mr_op		*mt_op;		/* Operation to benchmark. */
	char		*mt_buf;	/* Buffer; allocated lazily if NULL. */
	int		mt_cpu;		/* CPU the thread is bound to. */
	uint64_t	mt_icount;	/* Operations inside the sample. */
	uint64_t	mt_iticks;	/* TSC ticks spanned by the sample. */
	uint64_t	mt_tticks;	/* TSC ticks for the whole run. */
	uint64_t	mt_tusec;	/* Wall-clock usec for the whole run. */
	uint64_t	mt_bytes;	/* Bytes accounted inside the sample. */
};

/*
 * Store one 64-bit value with a non-temporal (movnti) store.  The
 * destination is a real memory output operand, so the compiler knows
 * exactly what the instruction writes and no registers are clobbered
 * behind its back.
 */
static __inline void
mr_movnti(mr_word *dst, uint64_t val)
{
	__asm__ __volatile__("movnti %1, %0" : "=m" (*dst) : "r" (val));
}

/*
 * Zero len bytes at dst using non-temporal stores, 64 bytes per iteration.
 * A tail shorter than 64 bytes is cleared with memset().  No store fence
 * is issued; a caller that needs the stores ordered must fence itself.
 *
 * Returns len.
 */
int
memzeronti(void *dst, int len)
{
	mr_word *d = dst;
	long left = len;

	while (left >= NTI_CHUNK) {
		mr_movnti(&d[0], 0);
		mr_movnti(&d[1], 0);
		mr_movnti(&d[2], 0);
		mr_movnti(&d[3], 0);
		mr_movnti(&d[4], 0);
		mr_movnti(&d[5], 0);
		mr_movnti(&d[6], 0);
		mr_movnti(&d[7], 0);
		d += NTI_WORDS;
		left -= NTI_CHUNK;
	}
	if (left > 0)
		memset(d, 0, (size_t)left);
	return (len);
}

/*
 * Copy len bytes from src to dst using ordinary loads and non-temporal
 * stores, 64 bytes per iteration.  A tail shorter than 64 bytes is copied
 * with memcpy().  No store fence is issued.
 *
 * Returns len.
 */
int
memcpynti(void *dst, const void *src, int len)
{
	const mr_word *s = src;
	mr_word *d = dst;
	long left = len;

	while (left >= NTI_CHUNK) {
		mr_movnti(&d[0], s[0]);
		mr_movnti(&d[1], s[1]);
		mr_movnti(&d[2], s[2]);
		mr_movnti(&d[3], s[3]);
		mr_movnti(&d[4], s[4]);
		mr_movnti(&d[5], s[5]);
		mr_movnti(&d[6], s[6]);
		mr_movnti(&d[7], s[7]);
		d += NTI_WORDS;
		s += NTI_WORDS;
		left -= NTI_CHUNK;
	}
	if (left > 0)
		memcpy(d, s, (size_t)left);
	return (len);
}

/*
 * Pick a random block inside buf and return its address.  iter is
 * accepted for symmetry with mr_op but is not used.
 *
 * NOTE(review): random() uses process-wide state shared by all threads;
 * confirm that is acceptable for the measurement.
 */
static inline char *
mr_buf(char *buf, int iter)
{
	long block;

	(void)iter;
	block = random() % blocks;
	return (buf + ((size_t)block * (size_t)bsize));
}

/* Zero one random block.  Returns the number of bytes written. */
static long
mr_write(char *buf, int iter)
{
	char *dst;

	dst = mr_buf(buf, iter);
	if (fflag)
		memzeronti(dst, bsize);
	else
		memset(dst, 0, bsize);
	return (bsize);
}

/*
 * Copy one random block onto another.  Returns the bytes read plus the
 * bytes written.
 *
 * NOTE(review): both blocks are chosen independently, so they can be the
 * same block, in which case memcpy() is handed identical pointers.
 */
static long
mr_rw(char *buf, int iter)
{
	char *dst;
	char *src;

	/* mr_buf() ignores iter, so it is passed through unmodified. */
	src = mr_buf(buf, iter);
	dst = mr_buf(buf, iter);
	if (fflag)
		memcpynti(dst, src, bsize);
	else
		memcpy(dst, src, bsize);
	return (bsize * 2);
}

/* Read every word of one random block.  Returns the bytes read. */
static long
mr_read(char *buf, int iter)
{
	const mr_word *src;
	const mr_word *end;
	uint64_t sum;

	src = (const mr_word *)mr_buf(buf, iter);
	end = src + (bsize / sizeof(*src));
	sum = 0;
	while (src < end)
		sum += *src++;
	/*
	 * Consume the sum in an empty asm statement; otherwise it is dead
	 * and the compiler is free to delete the loads being measured.
	 */
	__asm__ __volatile__("" : : "r" (sum));
	return (bsize);
}

/*
 * Give the thread a private msize-byte buffer unless one (the global
 * buffer) was already supplied.  Exits on allocation failure.
 */
static void
mr_alloc(struct mr_thread *mtp)
{
	if (mtp->mt_buf == NULL)
		mtp->mt_buf = malloc(msize);
	if (mtp->mt_buf == NULL) {
		fprintf(stderr, "Failed to allocate %d bytes\n", msize);
		exit(EXIT_FAILURE);
	}
}

/* Bind the calling thread to mtp->mt_cpu.  Exits on failure. */
static void
mr_bind(struct mr_thread *mtp)
{
#ifdef __linux__
	cpu_set_t mask;
#else
	cpuset_t mask;
#endif
	int cpu;

	cpu = mtp->mt_cpu;
	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
#ifdef __linux__
	if (sched_setaffinity(0, sizeof(cpu_set_t), &mask)) {
		fprintf(stderr, "CPU %d: sched_setaffinity: %s\n",
		    cpu, strerror(errno));
		exit(EXIT_FAILURE);
	}
#else
	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
	    sizeof(cpuset_t), &mask)) {
		fprintf(stderr, "CPU %d: cpuset_setaffinity: %s\n",
		    cpu, strerror(errno));
		exit(EXIT_FAILURE);
	}
#endif
}

/*
 * Thread body.  arg is the thread's struct mr_thread.  Binds to the CPU,
 * makes sure a buffer exists, then runs the operation through a warm-up
 * interval, the timed sample and a cool-down interval, storing the
 * results back into the structure.
 */
static void *
mr_thread(void *arg)
{
	struct timeval tv_begin;
	struct timeval tv_end;
	struct mr_thread *mtp;
	uint64_t start;
	uint64_t begin;
	uint64_t stop;
	uint64_t end;
	uint64_t now;
	uint64_t count;

	mtp = (struct mr_thread *)arg;
	mr_bind(mtp);
	mr_alloc(mtp);
	/*
	 * We want to run for one tick interval before and after we sample
	 * so that we know all processors were spun up and running at the
	 * same time.  This overlap helps to ensure that.
	 */
	gettimeofday(&tv_begin, NULL);
	begin = rdtsc();
	start = begin + TICK_INTERVAL;
	stop = start + ticks;
	end = stop + TICK_INTERVAL;
	/* Loop until we reach the start time. */
	while ((now = rdtsc()) < start)
		mtp->mt_op(mtp->mt_buf, 0);
	/*
	 * Record the real start time and loop until we reach stop, counting
	 * iterations.  The counter is 64 bits wide so long runs cannot
	 * overflow it; the value handed to the operation is masked into the
	 * non-negative int range.
	 */
	start = now;
	for (count = 0; (now = rdtsc()) < stop; count++)
		mtp->mt_bytes += mtp->mt_op(mtp->mt_buf,
		    (int)(count & INT_MAX));
	stop = now;
	mtp->mt_icount = count;
	/*
	 * Now loop for one more interval so we are certain we overlap
	 * with other threads.
	 */
	while (rdtsc() < end)
		mtp->mt_op(mtp->mt_buf, (int)(count & INT_MAX));
	mtp->mt_iticks = stop - start;
	mtp->mt_tticks = rdtsc() - begin;
	gettimeofday(&tv_end, NULL);
	/* Now re-use begin/end to calculate microseconds of runtime */
	begin = (uint64_t)tv_begin.tv_sec * 1000000 +
	    (uint64_t)tv_begin.tv_usec;
	end = (uint64_t)tv_end.tv_sec * 1000000 + (uint64_t)tv_end.tv_usec;
	mtp->mt_tusec = end - begin;
	return (NULL);
}

/*
 * Scale bytes down by powers of 1024 until it is no larger than 1024 or
 * the largest known unit is reached.  The scaled value is stored in
 * *bytep and the matching unit string is returned.
 */
char *
mr_prefix(uint64_t bytes, float *bytep)
{
	static char *const prefixes[] = {
		"B", "KB", "MB", "GB", "TB", "PB", "EB"
	};
	const size_t last = sizeof(prefixes) / sizeof(prefixes[0]) - 1;
	size_t idx;
	float b;

	b = (float)bytes;
	/* Never step past the end of the unit table. */
	for (idx = 0; b > 1024 && idx < last; idx++)
		b /= 1024;
	*bytep = b;
	return (prefixes[idx]);
}

/*
 * Average the per-thread results in tds[0..ncpu-1] and print the run
 * time, the per-CPU byte rate and the aggregate byte rate.  If the sample
 * is too short to convert into microseconds, a diagnostic is printed
 * instead of dividing by zero.
 */
static void
mr_report(struct mr_thread *tds)
{
	uint64_t intervals;
	uint64_t iticks;
	uint64_t tticks;
	uint64_t runus;
	uint64_t bytes;
	uint64_t usec;
	uint64_t tpus;
	float hbytes;
	char *p;
	int i;

	if (ncpu < 1)
		return;

	/* Aggregate results */
	usec = tticks = iticks = intervals = bytes = 0;
	for (i = 0; i < ncpu; i++) {
		iticks += tds[i].mt_iticks;
		intervals += tds[i].mt_icount;
		tticks += tds[i].mt_tticks;
		usec += tds[i].mt_tusec;
		bytes += tds[i].mt_bytes;
#if 0
		printf("CPU %d: ticks %ju, intervals %ju, bytes %ju\n", i,
		    (uintmax_t)tds[i].mt_iticks, (uintmax_t)tds[i].mt_icount,
		    (uintmax_t)tds[i].mt_bytes);
#endif
	}
	/*
	 * Convert to averages.
	 */
	iticks /= (uint64_t)ncpu;
	intervals /= (uint64_t)ncpu;
	tticks /= (uint64_t)ncpu;
	usec /= (uint64_t)ncpu;
	bytes /= (uint64_t)ncpu;
	/*
	 * Total ticks per usec, then the number of microseconds the sample
	 * ran for.  Each divisor is checked before it is used.
	 */
	tpus = (usec != 0) ? tticks / usec : 0;
	runus = (tpus != 0) ? iticks / tpus : 0;
	if (runus == 0) {
		fprintf(stderr,
		    "Sample interval too short to compute a rate\n");
		return;
	}
	/*
	 * Output some results.
	 */
#if 0
	p = mr_prefix(bytes, &hbytes);
	printf("Average of %ju ticks and %ju intervals for %.2f%s\n",
	    (uintmax_t)iticks, (uintmax_t)intervals, hbytes, p);
	printf("Total usecs %ju for %ju ticks\n",
	    (uintmax_t)usec, (uintmax_t)tticks);
	printf("Ticks per usec %ju\n", (uintmax_t)tpus);
#endif
	bytes /= runus;
	p = mr_prefix(bytes, &hbytes);
	printf("Running time %ju usec, bytes per usec: %0.f%s\n",
	    (uintmax_t)runus, hbytes, p);
	bytes *= 1000000;
	p = mr_prefix(bytes, &hbytes);
	printf("Bytes per second per cpu: %.2f%s\n", hbytes, p);
	bytes *= (uint64_t)ncpu;
	p = mr_prefix(bytes, &hbytes);
	printf("Total bytes per second: %.2f%s\n", hbytes, p);
}

/*
 * Run op on ncpu threads, wait for all of them, print the report and
 * release the buffers.  With -g one buffer is allocated here and shared;
 * otherwise each thread allocates its own.  Exits on any failure.
 */
static void
mr_start(mr_op *op)
{
	struct mr_thread *tds;
	pthread_attr_t attr;
	char *buf;
	int error;
	int i;

	if (ncpu < 1) {
		fprintf(stderr, "Invalid CPU count %d\n", ncpu);
		exit(EXIT_FAILURE);
	}
	tds = calloc((size_t)ncpu, sizeof(*tds));
	if (tds == NULL) {
		fprintf(stderr, "Failed to allocate state for %d threads\n",
		    ncpu);
		exit(EXIT_FAILURE);
	}
	if (gflag) {
		buf = malloc(msize);
		if (buf == NULL) {
			fprintf(stderr, "Failed to allocate %d bytes\n",
			    msize);
			exit(EXIT_FAILURE);
		}
	} else
		buf = NULL;

	pthread_attr_init(&attr);
	for (i = 0; i < ncpu; i++) {
		tds[i].mt_cpu = i;
		tds[i].mt_buf = buf;
		tds[i].mt_op = op;
		/* pthread functions return the error; errno is not set. */
		error = pthread_create(&tds[i].mt_thr, &attr, mr_thread,
		    &tds[i]);
		if (error != 0) {
			fprintf(stderr, "pthread_create: %s\n",
			    strerror(error));
			exit(EXIT_FAILURE);
		}
	}
	pthread_attr_destroy(&attr);

	for (i = 0; i < ncpu; i++) {
		error = pthread_join(tds[i].mt_thr, NULL);
		if (error != 0) {
			fprintf(stderr, "pthread_join: %s\n",
			    strerror(error));
			exit(EXIT_FAILURE);
		}
	}
	mr_report(tds);

	/* The shared buffer is freed once, private buffers one by one. */
	if (gflag)
		free(buf);
	else
		for (i = 0; i < ncpu; i++)
			free(tds[i].mt_buf);
	free(tds);
}

/* Print the usage message to stderr and exit with a failure status. */
static void
usage(void)
{
	fprintf(stderr, "usage: %s [-fgrw -m size -n cpu -t ticks]\n",
	    progname);
	fprintf(stderr, "\t-f Use \'fast\' non-temporal store functions\n");
	fprintf(stderr, "\t-g Use a global memory buffer for all CPUs\n");
	fprintf(stderr, "\t-r Read memory. May be combined with -w\n");
	fprintf(stderr, "\t-w Write memory. May be combined with -r\n");
	fprintf(stderr, "\t-m Size of buffer in bytes, per-cpu or global.\n");
	fprintf(stderr, "\t-n Number of cpus to execute on concurrently.\n");
	fprintf(stderr, "\t-t Number of cycles to execute for x 1 billion\n");
	exit(EXIT_FAILURE);
}

/*
 * Parse arg as a decimal integer in [min, max].  On malformed or
 * out-of-range input a diagnostic naming the option (what) is printed and
 * usage() terminates the program.
 */
static int
mr_parse_int(const char *arg, const char *what, long min, long max)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(arg, &end, 10);
	if (end == arg || *end != '\0' || errno == ERANGE ||
	    val < min || val > max) {
		fprintf(stderr, "%s: invalid %s '%s' (expected %ld-%ld)\n",
		    progname, what, arg, min, max);
		usage();
	}
	return ((int)val);
}

/*
 * Parse the command line, derive the block geometry, run the benchmark
 * and return EXIT_SUCCESS.
 */
int
main(int argc, char *argv[])
{
	mr_op *op;
	int ch;

	progname = argv[0];
	while ((ch = getopt(argc, argv, "fgm:n:rt:w")) != -1) {
		switch (ch) {
		case 'f':
			fflag = 1;
			break;
		case 'g':
			gflag = 1;
			break;
		case 'm':
			/* Upper bound keeps the round-up inside int range. */
			msize = mr_parse_int(optarg, "buffer size", 1,
			    INT_MAX - (BLOCK_MIN * 2 - 1));
			msize = roundup2(msize, BLOCK_MIN * 2);
			break;
		case 'n':
			ncpu = mr_parse_int(optarg, "cpu count", 1,
			    MR_MAXCPU);
			break;
		case 'r':
			rflag = 1;
			break;
		case 't':
			ticks = (uint64_t)mr_parse_int(optarg, "tick count",
			    1, INT_MAX);
			break;
		case 'w':
			wflag = 1;
			break;
		default:
			usage();
		}
	}
	/* Both flags set, or neither: copy.  Otherwise the one given. */
	if (rflag == wflag)
		op = mr_rw;
	else if (rflag)
		op = mr_read;
	else
		op = mr_write;

	if (msize < BLOCK_SIZE)
		bsize = msize / 2;	/* space for two blocks for memcpy */
	else
		bsize = BLOCK_SIZE;
	printf("Number of CPUS: %d, Memory size: %d, Block size: %d, Billion ticks: %jd\n",
	    ncpu, msize, bsize, (intmax_t)ticks);
	blocks = msize / bsize;
	ticks *= TICK_INTERVAL;
	mr_start(op);
	return (EXIT_SUCCESS);
}