/*
 * Copyright (c) 2000 Alfred Perlstein
 * Author: Alfred Perlstein ,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Slab and mp caching allocator.
 *
 * The concepts used here are a combination of the slab, Dynix and Horde
 * allocators.
 *
 * Slab concepts:
 *   A slab cache-aligns blocks of allocated memory and provides quick
 *   allocate and free operations for fixed sized objects.  The most
 *   important part is that every object handed out comes either from
 *   memory initialized by a constructor or from memory that has been
 *   free()'d back to the same pool.  This reduces the amount of
 *   initialization needed per object, because a free()'d object is most
 *   likely still close to the state it was in at allocation time.
 *
 * Dynix/Horde:
 *   The Dynix approach keeps per-processor caches of memory blocks to
 *   reduce contention when allocating memory.  Horde improved on the Dynix
 *   allocator; it is essentially the same concept, except that where in
 *   Dynix each CPU free()'s memory back to its own private cache, Horde
 *   free()'s memory to the cache it was allocated from.  This can increase
 *   contention, but on the positive side it reduces false sharing because
 *   for the most part the CPU doing the free()'ing doesn't write (much)
 *   to the object.
 *
 * Locking:
 *   Giant is acquired whenever the slab allocator needs to dip into the
 *   kmem_malloc()/kmem_free() functions.
 *
 *   Each CPU has a private cache that gets locked whenever an allocation
 *   or free operation is done.
 *
 *   Each memcache-manager has a lock that is acquired whenever a private
 *   cache spills by either underflowing or overflowing.
 *
 *   Lock order is to acquire the per-cpu lock and then the memcache-manager
 *   lock.  This is needed so that concurrent free()'s don't get confused
 *   about which cache owns a particular memcache.
 *
 * Style:
 *   4 space tabs (:set ts=4), deal with it.
 */
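/*
 * Example usage (an illustrative sketch, not part of the allocator): the
 * "foo" names below are made up, and the MEMCACHE_* flag definitions live
 * in a header elsewhere.  The block only shows the calling conventions of
 * memcache_create(), memcache_alloc() and memcache_free() as defined later
 * in this file, so it is kept under #if 0.
 */
#if 0
struct foo {
	int	f_refs;
	char	f_data[120];
};

static struct memcache_mgr *foo_cache;

/*
 * Constructor convention: called with the base of a freshly allocated
 * memcache plus the number of objects it holds; return 0 on success.
 */
static int
foo_ctor(void *base, int nobjs)
{

	return (0);
}

static void
foo_init(void)
{

	/* max of 0: mm_max is only enforced when MEMCACHE_NOGROW is set */
	foo_cache = memcache_create("foo", sizeof(struct foo), 0,
	    foo_ctor, NULL, 0);
}

static struct foo *
foo_get(void)
{

	return (memcache_alloc(foo_cache));
}

static void
foo_put(struct foo *fp)
{

	/* the allocator finds the owning memcache from the address alone */
	memcache_free(fp);
}
#endif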
/*
 * XXX: this header list is a best guess at what the file needs:
 * malloc(9), mutexes, TAILQs and the kmem_map interfaces.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <machine/mutex.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>

#define NCPU 4
#warning NCPU bad

#define PAGES_PER_MEMCACHE	((1024*16) / PAGE_SIZE)

/* XXX: need a real way to get cacheline size */
#ifndef CACHELINESIZE
#define CACHELINESIZE	32
#endif

/* round an object size up to a multiple of the cacheline size */
#define CACHELINE_ROUNDUP(size) \
	((size) + (CACHELINESIZE - ((size) % CACHELINESIZE)))

/*
 * map a memcache base address to its slot in memcache_mgptrs[]
 * (a standalone example of this arithmetic is at the end of this file)
 */
#define MEMCACHE_MGP_ENTRY(memcachep) \
	(((char *)(memcachep) - kmembase) / (PAGES_PER_MEMCACHE * PAGE_SIZE))

struct memcache {
	TAILQ_ENTRY(memcache) m_lnk;	/* next on list */
	struct memcache_mgr *m_mgr;	/* back pointer to memcache manager */
	int m_free;			/* index of next free slot in m_blks */
	volatile int m_cpuowner;	/* -1 == general cache, 0-N == cpu */
	void *m_pageptr;		/* pointer to managed memory base */
	void *m_blks[1];		/* pointers to free blocks */
};

#define MEMCACHE_ENDP	(NULL)

TAILQ_HEAD(memcache_tqh, memcache);

struct memcache_pcpu {
	/* completely full (exhausted) memcaches */
	TAILQ_HEAD(, memcache) mp_full;
	/* free memcaches, partially free on the head, completely free on the tail */
	TAILQ_HEAD(, memcache) mp_free;
	int mp_freecnt;		/* number of free objects on mp_free */
	int mp_cpu;		/* which cpu this cache belongs to */
	/* protect from interrupts on same cpu and free()'s by other CPUs */
	struct mtx mp_mtx;
} __attribute__ ((aligned (CACHELINESIZE)));

struct memcache_mgr {
	/*
	 * XXX: needs to be broken down into cacheline-size areas to prevent
	 * false sharing for read-only fields, specifically for
	 * 'mm_spfreecntmax'
	 */
	char *mm_name;			/* cache name */
	int mm_flags;			/* flags */
	int mm_size;			/* size of objects */
	int mm_max;			/* max objects */
	int mm_nobj;			/* number of objects (rounded to memcache size) */
	int mm_freecnt;			/* number of free objects cached in the manager */
	int mm_freecntmax;		/* max free objects to cache in the manager */
	int mm_spfreecntmax;		/* max free objects to cache per-CPU */
	TAILQ_HEAD(, memcache) mm_free;	/* memcaches with free memory */
	int (*mm_ctor)(void *, int);	/* constructor */
	void (*mm_dtor)(void *, int);	/* destructor */
	struct mtx mm_mtx;
	struct memcache_pcpu mm_pcpu[NCPU];
};

#define MEMCACHE_MUTEX_INITIALIZE(m)	mtx_init(m, "memcache", MTX_DEF)

#define MEMCACHE_PCPU_LOCK(mpu)		mtx_enter(&(mpu)->mp_mtx, MTX_DEF)
#define MEMCACHE_PCPU_UNLOCK(mpu)	mtx_exit(&(mpu)->mp_mtx, MTX_DEF)

#define MEMCACHE_MGR_LOCK(mmp)		mtx_enter(&(mmp)->mm_mtx, MTX_DEF)
#define MEMCACHE_MGR_UNLOCK(mmp)	mtx_exit(&(mmp)->mm_mtx, MTX_DEF)

/*
 * XXX: needed because the kmem_* functions aren't mpsafe
 */
#define MEMCACHE_KERNEL_LOCK()		mtx_enter(&Giant, MTX_DEF)
#define MEMCACHE_KERNEL_UNLOCK()	mtx_exit(&Giant, MTX_DEF)

#define MEMCACHE_GET_PCPU(mmp)		(&(mmp)->mm_pcpu[cpuid])
#define MEMCACHE_GET_PCPU_NCPU(mmp, cpu) \
	(&(mmp)->mm_pcpu[cpu])

/* rough number of memcaches to keep around */
#define MEMCACHE_SLACK	4

/* number of objects per-memcache */
#define MEMCACHE_OBJS_CNT(objsize) \
	((PAGES_PER_MEMCACHE * PAGE_SIZE) / CACHELINE_ROUNDUP(objsize))

extern char *kmemlimit, *kmembase;

static MALLOC_DEFINE(M_MEMCACHE, "memcache", "memcache management structures");

static struct memcache **memcache_mgptrs;

struct memcache_mgr *
memcache_create(char *sname, int size, int max,
    int (*ctor)(void *, int), void (*dtor)(void *, int), int flags)
{
	struct memcache_mgr *ret;
	int i;

	ret = malloc(sizeof(*ret), M_MEMCACHE, M_WAITOK);
	ret->mm_flags = flags;
	ret->mm_name = sname;
	ret->mm_size = size;
	ret->mm_max = max;
	ret->mm_ctor = ctor;
	ret->mm_dtor = dtor;
	ret->mm_freecnt = 0;
	/* auto-tune max free? */
	ret->mm_freecntmax = ret->mm_spfreecntmax =
	    MEMCACHE_OBJS_CNT(size) * MEMCACHE_SLACK;
	TAILQ_INIT(&ret->mm_free);
	MEMCACHE_MUTEX_INITIALIZE(&ret->mm_mtx);
	for (i = 0; i < NCPU; i++) {
		struct memcache_pcpu *mpu;

		mpu = ret->mm_pcpu + i;
		mpu->mp_cpu = i;
		mpu->mp_freecnt = 0;
		TAILQ_INIT(&mpu->mp_full);
		TAILQ_INIT(&mpu->mp_free);
		MEMCACHE_MUTEX_INITIALIZE(&mpu->mp_mtx);
	}
	return (ret);
}

void
memcache_adjust(struct memcache_mgr *mmp, int max, int flags)
{

	MEMCACHE_MGR_LOCK(mmp);
	mmp->mm_flags |= flags;
	if (max)
		mmp->mm_max = max;
	MEMCACHE_MGR_UNLOCK(mmp);
}

static void
memcache_init(void)
{
	unsigned long size;

	size = sizeof(*memcache_mgptrs) *
	    ((kmemlimit - kmembase) / (PAGES_PER_MEMCACHE * PAGE_SIZE));
	memcache_mgptrs = malloc(size, M_MEMCACHE, M_WAITOK | M_ZERO);
}

static struct memcache *
memcache_getmemcache(struct memcache_mgr *mmp)
{
	int flags, nobjs, i;
	unsigned long size;
	char *memcache_data, *memcache_walk;
	struct memcache *ret;

	/* can we wait on allocation? */
	flags = (mmp->mm_flags & MEMCACHE_NOWAIT) != 0 ? M_NOWAIT : M_WAITOK;

	/* how many objects in the memcache? */
	nobjs = MEMCACHE_OBJS_CNT(mmp->mm_size);

	if (mmp->mm_flags & MEMCACHE_NOGROW &&
	    mmp->mm_nobj + nobjs > mmp->mm_max)
		return (NULL);

	/* size of the memcache management struct and free index */
	size = sizeof(*ret) + (nobjs + 1) * sizeof(void *);

	/* get space for the objects */
	MEMCACHE_KERNEL_LOCK();
	memcache_data = (char *)kmem_malloc(kmem_map,
	    PAGES_PER_MEMCACHE * PAGE_SIZE, flags);
	MEMCACHE_KERNEL_UNLOCK();
	if (memcache_data == NULL)
		goto failnofree;

	/* XXX: idle pre-zero? */
	if ((mmp->mm_flags & MEMCACHE_ZERO) != 0)
		bzero(memcache_data, PAGES_PER_MEMCACHE * PAGE_SIZE);

	ret = malloc(size, M_MEMCACHE, flags);
	if (ret == NULL) {
		MEMCACHE_KERNEL_LOCK();
		kmem_free(kmem_map, (vm_offset_t)memcache_data,
		    PAGES_PER_MEMCACHE * PAGE_SIZE);
		MEMCACHE_KERNEL_UNLOCK();
		goto failnofree;
	}

	/* call the constructor if needed */
	if (mmp->mm_ctor != NULL && mmp->mm_ctor(memcache_data, nobjs) != 0)
		goto failfree;

	/* initialize the free list */
	memcache_walk = memcache_data;
	for (i = 0; i < nobjs; i++) {
		ret->m_blks[i] = memcache_walk;
		memcache_walk += CACHELINE_ROUNDUP(mmp->mm_size);
	}
	ret->m_blks[nobjs] = MEMCACHE_ENDP;
	ret->m_free = nobjs;
	ret->m_pageptr = memcache_data;
	ret->m_mgr = mmp;
	memcache_mgptrs[MEMCACHE_MGP_ENTRY(memcache_data)] = ret;
	return (ret);

failfree:
	/*
	 * Order is somewhat important here: the backing pages should only be
	 * returned with kmem_free() after the memcache struct has been freed
	 * and its memcache_mgptrs[] entry has been NULL'ed out.  This
	 * prevents a new allocation from hashing to the same entry in
	 * memcache_mgptrs[] before we have cleared the entry.
	 */
	memcache_mgptrs[MEMCACHE_MGP_ENTRY(memcache_data)] = NULL;
	free(ret, M_MEMCACHE);
	MEMCACHE_KERNEL_LOCK();
	kmem_free(kmem_map, (vm_offset_t)memcache_data,
	    PAGES_PER_MEMCACHE * PAGE_SIZE);
	MEMCACHE_KERNEL_UNLOCK();
failnofree:
	if (mmp->mm_flags & MEMCACHE_PANIC)
		panic("failed allocation with MEMCACHE_PANIC");
	else
		return (NULL);
}

/*
 * Get an item from a memcache.  Must be called with the memcache_pcpu
 * locked, unless the memcache isn't yet associated with any memcache
 * management structures.  If the last object is taken, the memcache is
 * moved from the partially allocated list to the exhausted list.
 */
static void *
memcache_nibble(struct memcache *s)
{
	void *ret;

	ret = s->m_blks[--(s->m_free)];
	if (s->m_free == 0) {
		struct memcache_pcpu *mpu;

		mpu = MEMCACHE_GET_PCPU(s->m_mgr);
		TAILQ_REMOVE(&mpu->mp_free, s, m_lnk);
		TAILQ_INSERT_HEAD(&mpu->mp_full, s, m_lnk);
	}
	return (ret);
}

void *
memcache_alloc(struct memcache_mgr *mmp)
{
	struct memcache_pcpu *mpu;
	struct memcache *s;
	void *ret;

	/*
	 * do we have any memcaches with free objects local to ourselves?
	 */
	mpu = MEMCACHE_GET_PCPU(mmp);
	MEMCACHE_PCPU_LOCK(mpu);
	if ((s = TAILQ_FIRST(&mpu->mp_free)) != NULL) {
		ret = memcache_nibble(s);
		MEMCACHE_PCPU_UNLOCK(mpu);
		return (ret);
	}
	MEMCACHE_PCPU_UNLOCK(mpu);

	/*
	 * no free local memcaches.
	 * do we have any free memcaches in the general manager?
	 */
	MEMCACHE_MGR_LOCK(mmp);
	if ((s = TAILQ_FIRST(&mmp->mm_free)) != NULL) {
		/*
		 * the manager has a free memcache.
		 * grab it, take ownership, put it on our freelist and
		 * consume an object.
		 */
		TAILQ_REMOVE(&mmp->mm_free, s, m_lnk);
		MEMCACHE_MGR_UNLOCK(mmp);
		MEMCACHE_PCPU_LOCK(mpu);
		s->m_cpuowner = mpu->mp_cpu;
		TAILQ_INSERT_HEAD(&mpu->mp_free, s, m_lnk);
		ret = memcache_nibble(s);
		MEMCACHE_PCPU_UNLOCK(mpu);
		return (ret);
	}
	MEMCACHE_MGR_UNLOCK(mmp);

	/*
	 * no free memcaches in the general cache.
	 * get a new memcache.
	 */
	if ((mmp->mm_flags & MEMCACHE_NOGROW) != 0 ||
	    (s = memcache_getmemcache(mmp)) == NULL)
		return (NULL);

	/*
	 * normally calling memcache_nibble(s) would be a problem if s->m_lnk
	 * wasn't initialized, however the first allocation shouldn't exhaust
	 * the memcache, so the list manipulation in memcache_nibble() isn't
	 * reached and this is ok.
	 */
	ret = memcache_nibble(s);
	MEMCACHE_PCPU_LOCK(mpu);
	s->m_cpuowner = mpu->mp_cpu;
	TAILQ_INSERT_HEAD(&mpu->mp_free, s, m_lnk);
	MEMCACHE_PCPU_UNLOCK(mpu);
	return (ret);
}

void
memcache_free(void *p)
{
	int cpu;
	caddr_t memcache_data;
	struct memcache *s;
	struct memcache_mgr *mmp;
	struct memcache_pcpu *mpu;

	/* quiet compiler */
	mmp = NULL;
	mpu = NULL;

	KASSERT(kmembase <= (char *)p && (char *)p < kmemlimit,
	    ("memcache_free: address out of range: %p", p));

	/* shave down to the memcache boundary */
	memcache_data = (caddr_t)p;
	memcache_data -= (int)memcache_data % (PAGES_PER_MEMCACHE * PAGE_SIZE);

	/* do the lookup in the global memcache manager pointer array */
	s = memcache_mgptrs[MEMCACHE_MGP_ENTRY(memcache_data)];

	KASSERT(((char *)p - memcache_data) %
	    CACHELINE_ROUNDUP(s->m_mgr->mm_size) == 0,
	    ("memcache_free: unaligned free: %p", p));

	/*
	 * this is tricky: since we may be freeing to another CPU's cache we
	 * need to be aware of a race condition where we lock the wrong mutex
	 * because the memcache has migrated while we waited for the lock.
	 * If that happens we need to unlock and try again.
	 * In theory this shouldn't happen all that often.
	 */
retry_lock:
	/* get the memcache owner */
	cpu = s->m_cpuowner;
	if (cpu == -1) {
		/* memcache owned by the memcache_mgr */
		mmp = s->m_mgr;
		MEMCACHE_MGR_LOCK(mmp);
		if (cpu != s->m_cpuowner) {
			MEMCACHE_MGR_UNLOCK(mmp);
			goto retry_lock;
		}
	} else {
		/* memcache owned by a per-cpu manager */
		mmp = s->m_mgr;
		mpu = MEMCACHE_GET_PCPU_NCPU(mmp, cpu);
		MEMCACHE_PCPU_LOCK(mpu);
		if (cpu != s->m_cpuowner) {
			MEMCACHE_PCPU_UNLOCK(mpu);
			goto retry_lock;
		}
	}

	/*
	 * move to the free list instead of the exhausted list if necessary,
	 * this can only happen to memcaches on a per-cpu list
	 */
	if (s->m_free == 0) {
		TAILQ_REMOVE(&mpu->mp_full, s, m_lnk);
		TAILQ_INSERT_HEAD(&mpu->mp_free, s, m_lnk);
	}

	s->m_blks[(s->m_free)++] = p;

	/*
	 * if it's now completely free then stick it on the end of the tail
	 * queue it's living on
	 */
	if (s->m_blks[s->m_free] == MEMCACHE_ENDP) {
		if (cpu != -1) {
			TAILQ_REMOVE(&mpu->mp_free, s, m_lnk);
			TAILQ_INSERT_TAIL(&mpu->mp_free, s, m_lnk);
		} else {
			TAILQ_REMOVE(&mmp->mm_free, s, m_lnk);
			TAILQ_INSERT_TAIL(&mmp->mm_free, s, m_lnk);
		}
	}

	/*
	 * possibly free a chunk back to the manager
	 */
	if (cpu != -1) {
		mpu->mp_freecnt++;
		/*
		 * if we reached the max free threshold, move a memcache from
		 * the back of the per-cpu free queue to the manager,
		 * interlocking to switch m_cpuowner safely between queues,
		 * then give the manager a shot at freeing it by falling into
		 * the manager free code below.
		 */
		if (mpu->mp_freecnt > mmp->mm_spfreecntmax) {
			struct memcache *s2;

			/*
			 * too many locally cached objects, free to the
			 * general manager
			 */
			s2 = TAILQ_LAST(&mpu->mp_free, memcache_tqh);
			TAILQ_REMOVE(&mpu->mp_free, s2, m_lnk);
			mpu->mp_freecnt -= s2->m_free;
			MEMCACHE_MGR_LOCK(mmp);
			s2->m_cpuowner = -1;
			if (s2->m_blks[s2->m_free] == MEMCACHE_ENDP)
				TAILQ_INSERT_TAIL(&mmp->mm_free, s2, m_lnk);
			else
				TAILQ_INSERT_HEAD(&mmp->mm_free, s2, m_lnk);
			mmp->mm_freecnt += s2->m_free;
			MEMCACHE_PCPU_UNLOCK(mpu);
			/*
			 * fall into the manager free code
			 */
		} else {
			MEMCACHE_PCPU_UNLOCK(mpu);
			return;
		}
	} else {
		mmp->mm_freecnt++;
	}

	/*
	 * manager free code
	 */

	/*
	 * if we have too many free objects then possibly free a memcache
	 * back to the system
	 */
	if (mmp->mm_freecnt > mmp->mm_freecntmax &&
	    (mmp->mm_flags & MEMCACHE_STABLE) == 0) {
		struct memcache *s2;

		s2 = TAILQ_LAST(&mmp->mm_free, memcache_tqh);
		/*
		 * can we free it?
		 */
		if (s2->m_blks[s2->m_free] == MEMCACHE_ENDP) {
			TAILQ_REMOVE(&mmp->mm_free, s2, m_lnk);
			mmp->mm_freecnt -= s2->m_free;
			MEMCACHE_MGR_UNLOCK(mmp);
			memcache_mgptrs[MEMCACHE_MGP_ENTRY(s2->m_pageptr)] = NULL;
			if (mmp->mm_dtor != NULL)
				mmp->mm_dtor(s2->m_pageptr, s2->m_free);
			MEMCACHE_KERNEL_LOCK();
			kmem_free(kmem_map, (vm_offset_t)s2->m_pageptr,
			    PAGES_PER_MEMCACHE * PAGE_SIZE);
			MEMCACHE_KERNEL_UNLOCK();
			free(s2, M_MEMCACHE);
			return;
		}
	}
	MEMCACHE_MGR_UNLOCK(mmp);
	return;
}
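/*
 * Standalone sketch of the address arithmetic used by CACHELINE_ROUNDUP,
 * MEMCACHE_MGP_ENTRY and the "shave to memcache boundary" step in
 * memcache_free().  This is userland demonstration code with assumed
 * constants (4k pages, 32-byte cache lines, a fake kmembase) and is kept
 * under #if 0; it is not part of the allocator.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE		4096
#define CACHELINESIZE		32
#define PAGES_PER_MEMCACHE	((1024 * 16) / PAGE_SIZE)
#define MEMCACHE_BYTES		(PAGES_PER_MEMCACHE * PAGE_SIZE)
#define CACHELINE_ROUNDUP(size) \
	((size) + (CACHELINESIZE - ((size) % CACHELINESIZE)))

int
main(void)
{
	/*
	 * fake kmembase; it must itself be aligned to MEMCACHE_BYTES for
	 * the modulo trick in memcache_free() to land on a memcache boundary
	 */
	uintptr_t kmembase = 0x10000000;
	uintptr_t stride = CACHELINE_ROUNDUP(100);	/* 100 rounds up to 128 */

	/* pretend p is object #5 in memcache #3 of the submap (0-based) */
	uintptr_t p = kmembase + 3 * MEMCACHE_BYTES + 5 * stride;

	/* shave to the memcache boundary, as memcache_free() does */
	uintptr_t base = p - p % MEMCACHE_BYTES;
	/* MEMCACHE_MGP_ENTRY(): index into memcache_mgptrs[] */
	uintptr_t slot = (base - kmembase) / MEMCACHE_BYTES;

	printf("p=%#lx base=%#lx slot=%lu object=%lu\n",
	    (unsigned long)p, (unsigned long)base, (unsigned long)slot,
	    (unsigned long)((p - base) / stride));
	return (0);
}
#endif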