/*
 * 64-bit atomic counter test.
 *
 * Compilation:
 * $ gcc -O2 counter_test.c -o counter_test -lpthread
 *
 * With atomic(9) (amd64 only):
 * $ gcc -O2 counter_test.c -o counter_test -lpthread -DATOMIC
 *
 * With libatomic_ops installed (see below):
 * $ gcc -O2 counter_test.c -o counter_test -lpthread -DLIBATOMIC_OPS \
 *       -I/usr/local/include -L/usr/local/lib -latomic_ops
 *
 * With gcc builtins (see below):
 * $ gcc -O2 counter_test.c -o counter_test -lpthread -DGCC_ATOMIC
 */

#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/sysctl.h>
#include <sys/types.h>

#include <inttypes.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#ifdef ATOMIC
/* Internally supported atomic operations, see atomic(9).
 * Note that 64-bit atomics are only supported on 64-bit
 * platforms at the moment. */
#include <machine/atomic.h>
#endif

#ifdef LIBATOMIC_OPS
/* The atomic_ops library [1], installed from the sources of
 * devel/boehm-gc.  Note that this is not installed with the
 * port; it was done manually.
 *
 * [1] http://www.hpl.hp.com/research/linux/atomic_ops/ */
#include <atomic_ops.h>
#endif

/*
 * A type synonym to make the code more self-documenting.
 */
typedef uint64_t counter_t;

const counter_t count = 1000000ULL;

/* Start just below 2^32 (4294967296) so the counter crosses the 32-bit
 * boundary during the test and torn 32-bit updates become visible. */
const counter_t offset = 4294467296ULL;

counter_t global_counter;

struct test_config {
	char	*name;
	void	(*func)(counter_t *, counter_t);
	int	ncpu;
};

/* Pin the calling thread to the given CPU, see cpuset(2). */
int
set_affinity(int pnum)
{
	cpuset_t set;

	CPU_ZERO(&set);
	CPU_SET(pnum, &set);
	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
	    sizeof(cpuset_t), &set) != 0) {
		perror("cpuset_setaffinity()");
		return -1;
	}
	return 0;
}

/*
 * A naive method: no locking, no atomicity, but fast.
 */
void
simple_add(counter_t *p, counter_t v)
{
	(*p) += v;
}

#ifdef FPU_ATOMIC
/*
 * Use the FPU to juggle 64-bit values, since they can be loaded and
 * stored atomically through it even on i386.  Note that only the final
 * 64-bit store is atomic; the read-modify-write as a whole is not.
 */
void
fpu_add(counter_t *p, counter_t v)
{
	counter_t r = (*p) + v;

	/* fildll/fistpll move the full 64-bit value through the FPU
	 * stack, so the store to *p cannot be torn into two 32-bit
	 * writes. */
	__asm __volatile(
	    "	fildll	%1 ;"
	    "	fistpll	%0 ;"
	    : "=m" (*p)
	    : "m" (r));
}
#endif

#ifdef ATOMIC
/*
 * Use atomic(9).
 */
void
atomic_add(counter_t *p, counter_t v)
{
	atomic_add_64(p, v);
}
#endif

#ifdef LIBATOMIC_OPS
void
libatomic_add(counter_t *p, counter_t v)
{
	/* AO_t is word-sized, which matches uint64_t on 64-bit
	 * platforms. */
	AO_fetch_and_add((volatile AO_t *)p, v);
}
#endif
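#ifdef C11_ATOMIC
/*
 * A minimal additional sketch, not part of the original set of methods:
 * with a recent compiler, C11's <stdatomic.h> provides the same
 * fetch-and-add portably.  The C11_ATOMIC macro and the c11_add() name
 * are introduced here for illustration only; hook it into main() like
 * the other methods to measure it.
 */
#include <stdatomic.h>

void
c11_add(counter_t *p, counter_t v)
{
	/* Casting a plain uint64_t to _Atomic uint64_t assumes the two
	 * representations match, which holds on amd64; strictly
	 * portable code would declare the counter _Atomic upfront. */
	atomic_fetch_add((_Atomic uint64_t *)p, v);
}
#endif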
#ifdef GCC_ATOMIC
/*
 * Ask GCC to generate the atomics for the given platform.  Note that
 * this works on amd64 without problems, but on i386 it requires
 * -march=i586, since it uses cmpxchg8b, which was introduced with the
 * Pentium.
 */
void
gccatomic_add(counter_t *p, counter_t v)
{
	__sync_fetch_and_add(p, v);
}
#endif

struct timespec
diff(struct timespec begin, struct timespec end)
{
	struct timespec tmp;

	if ((end.tv_nsec - begin.tv_nsec) < 0) {
		tmp.tv_sec = end.tv_sec - begin.tv_sec - 1;
		tmp.tv_nsec = 1000000000 + end.tv_nsec - begin.tv_nsec;
	} else {
		tmp.tv_sec = end.tv_sec - begin.tv_sec;
		tmp.tv_nsec = end.tv_nsec - begin.tv_nsec;
	}
	return tmp;
}

void *
concurrent_inc(void *arg)
{
	struct test_config *cfg;
	void (*f)(counter_t *, counter_t);
	counter_t i;

	cfg = (struct test_config *)arg;
	f = cfg->func;
	if (set_affinity(cfg->ncpu) != 0)
		return NULL;
	for (i = 0; i < count; i++)
		f(&global_counter, 1);
	return NULL;
}

int
run_concurrent_inc(int n, struct test_config *cfg)
{
	pthread_t *threads;
	struct timespec start, stop, runtime;
	struct test_config *cfgs;
	int i;
	const counter_t expected = offset + (n * count);

	threads = (pthread_t *)malloc(sizeof(pthread_t) * n);
	if (threads == NULL) {
		perror("malloc(threads)");
		return -1;
	}
	cfgs = (struct test_config *)malloc(sizeof(struct test_config) * n);
	if (cfgs == NULL) {
		perror("malloc(cfgs)");
		free(threads);
		return -1;
	}
	global_counter = offset;
	clock_gettime(CLOCK_REALTIME_PRECISE, &start);
	for (i = 0; i < n; i++) {
		cfgs[i] = *cfg;
		cfgs[i].ncpu = i;
		if (pthread_create(&threads[i], NULL, concurrent_inc,
		    &cfgs[i]) != 0) {
			perror("pthread_create()");
			n = i;
			break;
		}
	}
	for (i = 0; i < n; i++)
		pthread_join(threads[i], NULL);
	clock_gettime(CLOCK_REALTIME_PRECISE, &stop);
	runtime = diff(start, stop);
	if (cfg->name != NULL) {
		printf("[%s] global_counter = %" PRIu64 "\n", cfg->name,
		    global_counter);
		printf("[%s] expected: %" PRIu64 "\n", cfg->name, expected);
		printf("[%s] %scorrect.\n", cfg->name,
		    expected != global_counter ? "NOT " : "");
		printf("[%s] time: %ld ms\n", cfg->name,
		    runtime.tv_sec * 1000 + runtime.tv_nsec / 1000000L);
		printf("\n");
	}
	free(threads);
	free(cfgs);
	return 0;
}

int
main(void)
{
	int procs;
	size_t sprocs;
	struct test_config cfg;

	procs = 1;
	sprocs = sizeof(procs);
	if (sysctlbyname("hw.ncpu", &procs, &sprocs, NULL, 0) != 0) {
		perror("sysctlbyname()");
		return EXIT_FAILURE;
	}

#if defined(__amd64__)
	printf("Compiled for x86_64.\n");
#elif defined(__i386__)
	printf("Compiled for x86.\n");
#endif

	printf("%d CPU%s detected, working with %d thread%s.\n",
	    procs, procs > 1 ? "s" : "", procs, procs > 1 ? "s" : "");
	if (procs < 2) {
		printf("Warning: there is no \"real\" multiprocessing "
		    "present, hence the results will likely not be valid.\n");
	}
	printf("\n");

	/* Warm up. */
	cfg.name = NULL, cfg.func = simple_add;
	if (run_concurrent_inc(procs, &cfg) != 0)
		return EXIT_FAILURE;

	cfg.name = "naive", cfg.func = simple_add;
	if (run_concurrent_inc(procs, &cfg) != 0)
		return EXIT_FAILURE;

#ifdef ATOMIC
	cfg.name = "atomic.h", cfg.func = atomic_add;
	if (run_concurrent_inc(procs, &cfg) != 0)
		return EXIT_FAILURE;
#endif

#ifdef LIBATOMIC_OPS
	cfg.name = "atomic_ops.h", cfg.func = libatomic_add;
	if (run_concurrent_inc(procs, &cfg) != 0)
		return EXIT_FAILURE;
#endif

#ifdef GCC_ATOMIC
	cfg.name = "gcc atomic", cfg.func = gccatomic_add;
	if (run_concurrent_inc(procs, &cfg) != 0)
		return EXIT_FAILURE;
#endif

#ifdef FPU_ATOMIC
	cfg.name = "fpu", cfg.func = fpu_add;
	if (run_concurrent_inc(procs, &cfg) != 0)
		return EXIT_FAILURE;
#endif

	return EXIT_SUCCESS;
}
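/*
 * Illustrative appendix, not exercised by main() above: the classic
 * correct baseline is a mutex-protected add.  A minimal sketch,
 * assuming one global lock is acceptable for the measurement; the
 * mutex_add() name and counter_lock are introduced here for
 * illustration.  Adding another test_config entry in main() would let
 * its cost be compared against the lock-free methods.
 */
static pthread_mutex_t counter_lock = PTHREAD_MUTEX_INITIALIZER;

void
mutex_add(counter_t *p, counter_t v)
{
	pthread_mutex_lock(&counter_lock);
	(*p) += v;
	pthread_mutex_unlock(&counter_lock);
}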