/* $Id$ */
/*
   Copyright (C) 2008 by David White <dave@whitevine.net>
   Part of the Battle for Wesnoth Project http://www.wesnoth.org/

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2
   or at your option any later version.
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY.

   See the COPYING file for more details.
*/

/*
   This file defines a custom allocator that is optimized for doing memory
   allocations for Wesnoth. Its primary consideration is space, though it
   should be pretty fast too.

   The largest consideration is meta-data: Wesnoth allocates many small
   chunks, and so we want to minimize per-chunk meta-data. Typical
   general-purpose allocators have a per-chunk overhead of one or two
   pointers. This allocator has no per-chunk overhead, just a total memory
   overhead of less than 2%.

   The allocator is designed to handle small chunks. We include dlmalloc's
   source, and allocations that are not considered small are simply punted
   to dlmalloc.

   Some basic terminology:

   - chunk: a single allocation, allocated with malloc.
   - block: a region of memory from which we allocate chunks. A block has a
     header followed by its data section. A block should be a multiple of
     the page size. A given block is dedicated to allocating chunks of a
     specific size, and all blocks are the same size (4096 bytes by
     default, which should be treated as the minimum).
   - superblock: one huge region, allocated up front, from which all blocks
     are allocated.
*/
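/*
   Layout sketch (an illustration added for clarity, not part of the
   original design notes):

   superblock:  [ block 0 ][ block 1 ][ block 2 ] ...   (BLOCK_SIZE each)
   block:       [ BlockHeader | chunk | chunk | chunk | ... ]

   All chunks within one block share a single size, so a chunk's owning
   block -- and therefore its size -- can be recomputed from the chunk's
   address alone, which is how the allocator avoids per-chunk meta-data.
*/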
#include <assert.h>
#include <inttypes.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void* dlmalloc(size_t size);
void* dlcalloc(size_t count, size_t size);
void* dlvalloc(size_t size);
void* dlrealloc(void* ptr, size_t size);
void dlfree(void* ptr);

#define BLOCK_SIZE (4096)

#define MAX_CHUNK_SIZE (256)
#define CHUNK_SIZE_STEP (sizeof(void*))
#define NUM_POOLS ((MAX_CHUNK_SIZE/CHUNK_SIZE_STEP) + 1)

// find the index of the pool that a chunk of size n will be allocated from.
#define GET_POOL_INDEX(n) ((n)/CHUNK_SIZE_STEP)
#define ROUNDUP_SIZE(n) (((n)%CHUNK_SIZE_STEP) ? ((n) + CHUNK_SIZE_STEP - ((n)%CHUNK_SIZE_STEP)) : (n))
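// Worked example (illustrative; assumes a 64-bit build where
// CHUNK_SIZE_STEP == 8): a 13-byte request is rounded up by
// ROUNDUP_SIZE(13) to 16, and GET_POOL_INDEX(16) == 2, so it is served
// from the pool of 16-byte chunks. With MAX_CHUNK_SIZE == 256 this gives
// NUM_POOLS == 33 (index 0 is never used, since malloc only routes sizes
// greater than zero into the pools).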

#define CUSTOM_MEMORY_SIZE (1024*1024*40)
uint8_t* begin_superblock_range = NULL;
uint8_t* begin_superblock = NULL;
uint8_t* end_superblock = NULL;
#define IS_OUR_PTR(ptr) ((uint8_t*)(ptr) >= begin_superblock_range && (uint8_t*)(ptr) < end_superblock)

pthread_t main_thread;

void init_custom_malloc()
{
	main_thread = pthread_self();

	// allocate the memory -- allocate an extra block at the end, so that
	// if the address we get back isn't block-aligned, we can advance
	// the pointer until it is.
	void* alloc = dlmalloc(CUSTOM_MEMORY_SIZE + BLOCK_SIZE);
	assert(alloc);
	begin_superblock = (uint8_t*)alloc;
	while(((uintptr_t)begin_superblock)%BLOCK_SIZE) {
		++begin_superblock;
	}

	end_superblock = begin_superblock + CUSTOM_MEMORY_SIZE;
	begin_superblock_range = begin_superblock;
}
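// Usage note (a sketch of the intended call order, not stated in the
// original): init_custom_malloc() must run once, from the main thread,
// before the first pooled allocation. Until then the superblock pointers
// are NULL, IS_OUR_PTR() is false for every pointer, and the malloc/free
// overrides below simply fall through to dlmalloc/dlfree.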

typedef struct BlockHeader {
	uint32_t check_a;
	struct Block* next;
	struct Block* prev;
	uint8_t* uninit;
	void* free_list;
	uint32_t chunk_size;
	uint32_t allocated_chunks;
	uint32_t check_b;
} BlockHeader;

typedef struct Block {
	BlockHeader header;
	char data[BLOCK_SIZE - sizeof(BlockHeader)];
} Block;

#define BLOCK_EMPTY(b) ((b)->header.allocated_chunks == 0)
#define BLOCK_FULL(b) ((b)->header.uninit == NULL && (b)->header.free_list == NULL)
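// Sanity check (an added sketch, assuming a C11 compiler): the data array
// is sized so a Block occupies exactly BLOCK_SIZE bytes, which is what
// lets get_block_from_chunk() below recover the header by rounding an
// address down to a BLOCK_SIZE boundary.
#if __STDC_VERSION__ >= 201112L
_Static_assert(sizeof(Block) == BLOCK_SIZE, "Block must be exactly BLOCK_SIZE bytes");
#endif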

void* allocate_chunk_from_block(Block* b)
{
	b->header.allocated_chunks++;
	if(b->header.uninit) {
		void* result = b->header.uninit;
		b->header.uninit += b->header.chunk_size;

		//check if we've run out of uninitialized elements.
		if(b->header.uninit + b->header.chunk_size > (uint8_t*)(b+1)) {
			b->header.uninit = NULL;
		}

		return result;
	}

	assert(b->header.free_list);
	void* result = b->header.free_list;
	b->header.free_list = *(void**)result;
	return result;
}
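// Note on the free-list encoding: a free chunk's first sizeof(void*) bytes
// hold the link to the next free chunk, so the list costs no memory beyond
// the chunks themselves. This is also why CHUNK_SIZE_STEP is sizeof(void*):
// every chunk is guaranteed large enough to hold the link.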

inline Block* get_block_from_chunk(void* chunk)
{
	int8_t* block_ptr = ((int8_t*)chunk) - ((uintptr_t)chunk)%BLOCK_SIZE;
	return (Block*)block_ptr;
}
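// Worked example (illustrative): with BLOCK_SIZE == 4096, a chunk at
// address 0x10A8 lies 0xA8 bytes into its block, so subtracting the
// remainder yields the Block at 0x1000. This only works because every
// block is BLOCK_SIZE-aligned within the superblock.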

Block* free_chunk_from_block(void* chunk)
{
	Block* block = get_block_from_chunk(chunk);
	block->header.allocated_chunks--;
	*(void**)chunk = block->header.free_list;
	block->header.free_list = chunk;

	return block;
}

Block* format_block(Block* ptr, int chunk_size)
{
	BlockHeader* block = &ptr->header;

	block->check_a = 0xFFFFFFFF;
	block->check_b = 0xFFFFFFFF;

	block->next = NULL;
	block->prev = NULL;
	block->uninit = (uint8_t*)(block+1);
	block->free_list = NULL;
	block->chunk_size = chunk_size;
	block->allocated_chunks = 0;
	return ptr;
}

Block* block_free_list = NULL;

Block* allocate_new_block(uint32_t chunk_size)
{
	if(block_free_list == NULL && begin_superblock >= end_superblock) {
		return NULL;
	}

	Block* block;

	if(block_free_list != NULL) {
		block = block_free_list;
		block_free_list = block->header.next;
	} else {
		block = (Block*)begin_superblock;
		begin_superblock += sizeof(Block);
	}

	return format_block(block, chunk_size);
}

void return_block_to_free_list(Block* block)
{
	block->header.next = block_free_list;
	block_free_list = block;
}

Block* block_pools[NUM_POOLS];

#define IS_BLOCK_ORPHAN(block) ((block)->header.next == NULL && (block)->header.prev == NULL && block_pools[GET_POOL_INDEX(block->header.chunk_size)] != (block))
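// A block is an "orphan" when it is linked into no pool: full blocks are
// unlinked (see make_block_orphan below) so allocation never scans past
// them. The third test is needed because the head of a pool also has a
// NULL prev pointer.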

void add_block_to_pool(Block* block)
{
	assert(block->header.chunk_size > 0 && block->header.chunk_size <= MAX_CHUNK_SIZE);
	Block** target = &block_pools[GET_POOL_INDEX(block->header.chunk_size)];
	block->header.next = *target;
	block->header.prev = NULL;
	if(*target) {
		(*target)->header.prev = block;
	}
	*target = block;
}

void make_block_orphan(Block* block)
{
	BlockHeader* header = &block->header;
	if(block_pools[GET_POOL_INDEX(header->chunk_size)] == block) {
		block_pools[GET_POOL_INDEX(header->chunk_size)] = header->next;
	}

	if(header->prev) {
		header->prev->header.next = header->next;
	}

	if(header->next) {
		header->next->header.prev = header->prev;
	}

	header->prev = NULL;
	header->next = NULL;
}

// A list of the chunks that were allocated in the main thread, but free()
// was called in another thread. We can't deallocate them from another
// thread, so we put them in this array. The main thread will free all
// these chunks whenever it can't immediately allocate memory.
void** free_chunks;
size_t nfree_chunks, capacity_free_chunks;
pthread_mutex_t free_chunks_mutex = PTHREAD_MUTEX_INITIALIZER;

//mutex to protect all calls to dlmalloc.
pthread_mutex_t dlmalloc_mutex = PTHREAD_MUTEX_INITIALIZER;

void free_memory(void* ptr);

void collect_memory_from_other_threads()
{
	pthread_mutex_lock(&free_chunks_mutex);
	size_t n;
	for(n = 0; n != nfree_chunks; ++n) {
		free_memory(free_chunks[n]);
	}

	nfree_chunks = 0;
	pthread_mutex_unlock(&free_chunks_mutex);
}

void free_memory_from_other_thread(void* ptr)
{
	pthread_mutex_lock(&free_chunks_mutex);

	if(nfree_chunks == capacity_free_chunks) {
		size_t new_capacity = capacity_free_chunks*2;
		if(new_capacity < 16) {
			new_capacity = 16;
		}

		pthread_mutex_lock(&dlmalloc_mutex);
		void** new_free_chunks = (void**)dlrealloc(free_chunks, sizeof(void*)*new_capacity);
		pthread_mutex_unlock(&dlmalloc_mutex);
		if(!new_free_chunks) {
			//growing failed; keep the old capacity so a later call can't
			//write past the end of the existing array.
			pthread_mutex_unlock(&free_chunks_mutex);
			fprintf(stderr, "DLREALLOC FAILED!\n");
			return;
		}

		free_chunks = new_free_chunks;
		capacity_free_chunks = new_capacity;
	}

	free_chunks[nfree_chunks++] = ptr;
	pthread_mutex_unlock(&free_chunks_mutex);
}

Block* get_block(uint32_t chunk_size)
{
	const int index = GET_POOL_INDEX(chunk_size);
	assert(index >= 0 && index < sizeof(block_pools)/sizeof(*block_pools));
	if(block_pools[index]) {
		return block_pools[index];
	}

	// free memory from other threads and then try again. This requires a
	// mutex lock, but this code should be rarely reached.
	collect_memory_from_other_threads();

	if(block_pools[index]) {
		return block_pools[index];
	}

	Block* block = allocate_new_block(chunk_size);
	if(block == NULL) {
		return block;
	}
	add_block_to_pool(block);
	return block;
}

void* allocate_memory(int32_t size)
{
	Block* block = get_block(size);
	if(block == NULL) {
		return NULL;
	}

	void* result = allocate_chunk_from_block(block);
	if(BLOCK_FULL(block)) {
		make_block_orphan(block);
	}

	return result;
}

void free_memory(void* ptr)
{
	Block* block = free_chunk_from_block(ptr);
	if(IS_BLOCK_ORPHAN(block)) {
		add_block_to_pool(block);
	} else if(BLOCK_EMPTY(block)) {
		//since the block is empty, return it to the global free list of
		//blocks, so it can be moved to a different pool.
		make_block_orphan(block);
		return_block_to_free_list(block);
	}
}
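// Taken together, allocate_memory() and free_memory() keep full blocks out
// of the pools: a block is orphaned the moment its last free chunk is
// handed out, re-pooled as soon as one of its chunks is freed, and pushed
// onto the global block free list once it is completely empty, so it can
// be reused for a different chunk size.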

void* malloc(size_t size)
{
	if(pthread_equal(pthread_self(), main_thread) && size > 0 && size <= MAX_CHUNK_SIZE) {
		size = ROUNDUP_SIZE(size);
		void* result = allocate_memory(size);
		if(result != NULL) {
			return result;
		}
	}

	pthread_mutex_lock(&dlmalloc_mutex);
	void* result = dlmalloc(size);
	pthread_mutex_unlock(&dlmalloc_mutex);
	return result;
}
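// Only the main thread ever allocates from the pools, which is what lets
// the pool code above run without any locking; other threads, and any
// request larger than MAX_CHUNK_SIZE, go straight to dlmalloc under its
// mutex.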

void* calloc(size_t count, size_t size)
{
	pthread_mutex_lock(&dlmalloc_mutex);
	void* result = dlcalloc(count, size);
	pthread_mutex_unlock(&dlmalloc_mutex);
	return result;
}

void* valloc(size_t size)
{
	pthread_mutex_lock(&dlmalloc_mutex);
	void* result = dlvalloc(size);
	pthread_mutex_unlock(&dlmalloc_mutex);
	return result;
}

void* realloc(void* ptr, size_t size)
{
	if(IS_OUR_PTR(ptr)) {
		if(size == 0) {
			free(ptr);
			return NULL;
		}

		void* new_memory = malloc(size);
		if(new_memory == NULL) {
			return NULL;
		}

		const size_t old_size = get_block_from_chunk(ptr)->header.chunk_size;
		const size_t nbytes = size < old_size ? size : old_size;
		memcpy(new_memory, ptr, nbytes);
		free(ptr);
		return new_memory;
	}

	pthread_mutex_lock(&dlmalloc_mutex);
	void* result = dlrealloc(ptr, size);
	pthread_mutex_unlock(&dlmalloc_mutex);
	return result;
}
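// Note that for pooled pointers the "old size" above is the chunk size of
// the owning block, which may exceed the size originally requested;
// copying up to chunk_size bytes is still safe because the whole chunk
// belongs to the caller's allocation.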

void free(void* ptr)
{
	if(IS_OUR_PTR(ptr)) {
		if(!pthread_equal(pthread_self(), main_thread)) {
			//this will queue up the free to be performed later in the
			//main thread when it wants more memory.
			free_memory_from_other_thread(ptr);
			return;
		}

		free_memory(ptr);
		return;
	}

	pthread_mutex_lock(&dlmalloc_mutex);
	dlfree(ptr);
	pthread_mutex_unlock(&dlmalloc_mutex);
}

#ifdef TEST_POOLED_ALLOC
int main()
{
	init_custom_malloc();

	void** items = NULL;
	int nitems = 0;

	while(nitems < 100000) {
		if(nitems) {
			int clear = rand()%nitems;
			while(--clear >= 0) {
				int len = rand()%1000;
				free(items[clear]);
				items[clear] = malloc(len);
				assert(items[clear]);
				memset(items[clear], 10, len);
			}
		}

		int i = nitems;
		nitems += rand()%100;
		items = realloc(items, sizeof(*items)*nitems);
		assert(items || nitems == 0);
		while(i != nitems) {
			int len = rand()%1000;
			items[i] = malloc(len);
			assert(items[i]);
			memset(items[i], 10, len);
			++i;
		}
	}

	while(nitems--) {
		free(items[nitems]);
	}

	free(items);

	return 0;
}
#endif
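// Build sketch for the self-test (the file names here are assumptions, as
// is the dlmalloc object; this file must be linked against a dlmalloc
// build whose entry points really are named dlmalloc/dlfree etc., as the
// prototypes at the top assume):
//
//   cc -DTEST_POOLED_ALLOC -O2 pool_alloc.c dlmalloc.o -lpthread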