#define _CRT_NONSTDC_NO_DEPRECATE
#include "thread_pool.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <typeinfo>
#include <stdexcept>
#include <climits>
#include "ProfilerApp.h"
#include "common/Utilities.h"
#define perr std::cerr
#define pout std::cout
#define printp printf
#define MONITOR_THREADPOOL_PERFORMANCE 0
#if 0
#define PROFILE_THREAD_START(X) PROFILE_START(X,3)
#define PROFILE_THREAD_START2(X) PROFILE_START2(X,3)
#define PROFILE_THREAD_STOP(X) PROFILE_STOP(X,3)
#define PROFILE_THREAD_STOP2(X) PROFILE_STOP2(X,3)
#else
#define PROFILE_THREAD_START(X) do {} while(0)
#define PROFILE_THREAD_START2(X) do {} while(0)
#define PROFILE_THREAD_STOP(X) do {} while(0)
#define PROFILE_THREAD_STOP2(X) do {} while(0)
#endif
// Include system dependent headers and define some functions
#ifdef __WORDSIZE
#define ARCH_SIZE __WORDSIZE
#elif defined(_WIN64)
#define ARCH_SIZE 64
#elif defined(_WIN32) // Note: WIN64 also defines WIN32
#define ARCH_SIZE 32
#endif
#ifdef USE_WINDOWS
#include <windows.h>
#define get_time(x) QueryPerformanceCounter(x)
#define get_frequency(f) QueryPerformanceFrequency(f)
#define get_diff(start,end,f) \
static_cast<double>(end.QuadPart-start.QuadPart)/static_cast<double>(f.QuadPart)
#define TIME_TYPE LARGE_INTEGER
#elif defined(USE_LINUX)
#include <sys/time.h>
#include <errno.h>
#define Sleep(x) usleep(1000*x)
#define get_time(x) gettimeofday(x,NULL);
#define get_frequency(f) (*f=timeval())
#define get_diff(start,end,f) 1e-6*static_cast<double>( \
0xF4240*(static_cast<int64_t>(end.tv_sec)-static_cast<int64_t>(start.tv_sec)) + \
(static_cast<int64_t>(end.tv_usec)-static_cast<int64_t>(start.tv_usec)) )
#define TIME_TYPE timeval
#elif defined(USE_MAC)
#include <sys/time.h>
#include <mach/mach.h>
#include <errno.h>
#define Sleep(x) usleep(1000*x)
#define get_time(x) gettimeofday(x,NULL);
#define get_frequency(f) (*f=timeval())
#define get_diff(start,end,f) 1e-6*static_cast<double>( \
0xF4240*(static_cast<int64_t>(end.tv_sec)-static_cast<int64_t>(start.tv_sec)) + \
(static_cast<int64_t>(end.tv_usec)-static_cast<int64_t>(start.tv_usec)) )
#define TIME_TYPE timeval
#ifndef ARCH_SIZE
#ifdef __LP64__
#define ARCH_SIZE 64
#else
#define ARCH_SIZE 32
#endif
#endif
#else
#error Unknown OS
#endif
// Check the ARCH_SIZE and set macros
// Note: ARCH_SIZE must match the number of bits in size_t
#if ARCH_SIZE == 64
// 64-bit macros
#elif ARCH_SIZE == 32
// 32-bit macros
#else
#error Cannot identify 32 vs 64-bit
#endif
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#if MONITOR_THREADPOOL_PERFORMANCE==1
static TIME_TYPE frequency; // Clock frequency (only used for windows)
static double total_add_work_time[3] = {0,0,0};
#endif
// Helper functions
template <class T> void quicksort(std::vector<T> &x);
static inline bool find_id(const std::vector<ThreadPool::thread_id_t> &x_in, const ThreadPool::thread_id_t &id );
// Function to generate a random size_t number (excluding 0 and ~0)
static size_t rand_size_t()
{
size_t key = 0;
double tmp = 1;
if ( sizeof(size_t)==4 ) {
while ( tmp < 4e9 ) {
key ^= rand()*0x9E3779B9; // 2^32*0.5*(sqrt(5)-1)
tmp *= RAND_MAX;
}
} else if ( sizeof(size_t)==8 ) {
while ( tmp < 1.8e19 ) {
key ^= rand()*0x9E3779B97F4A7C15; // 2^64*0.5*(sqrt(5)-1)
tmp *= RAND_MAX;
}
} else {
throw std::logic_error("Unhandled case");
}
if ( key==0 || (~key)==0 )
key = rand_size_t();
return key;
}
/******************************************************************
* Run some basic compile-time checks *
******************************************************************/
#if MAX_NUM_THREADS%64 != 0
// We use a bit array for d_active and d_cancel
#error MAX_NUM_THREADS must be a multiple of 64
#endif
#if MAX_NUM_THREADS >= 65535
// We store N_threads as a short int
#error MAX_NUM_THREADS must be < 65535
#endif
#if MAX_QUEUED >= 65535
// We store the indices into the queue list as short ints
#error MAX_QUEUED must be < 65535
#endif
/******************************************************************
* Convert a string to binary *
******************************************************************/
template<class T>
static inline std::string convert_binary(T x) {
char buffer[65];
T mask = ((T)1)<<(8*sizeof(T)-1);
for (size_t i=0; i<8*sizeof(T); i++) {
if ( ( x & mask ) == 0 )
buffer[i] = '0';
else
buffer[i] = '1';
mask >>= 1;
}
buffer[8*sizeof(T)] = 0;
return std::string(buffer);
}
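// Illustrative example (hedged; not used elsewhere in this file): the string
// is written most-significant bit first, so on a platform with 8-bit char
//   convert_binary<unsigned char>(0x05);  // returns "00000101"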
/******************************************************************
* Get/Set a bit *
******************************************************************/
static inline void set_bit( volatile ThreadPool::uint64* x, size_t index, bool val )
{
ThreadPool::uint64 mask = 0x01;
mask <<= index%64;
if ( val )
x[index/64] |= mask;
else
x[index/64] &= ~mask;
}
static inline bool get_bit( const volatile ThreadPool::uint64* x, size_t index )
{
ThreadPool::uint64 mask = 0x01;
mask <<= index%64;
return (x[index/64]&mask)!=0;
}
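// Illustrative example: the uint64 array is treated as a flat bit vector, so
// bit 66 lives in word 66/64 = 1 at position 66%64 = 2.
//   ThreadPool::uint64 x[2] = {0,0};
//   set_bit(x,66,true);       // x[1] becomes 0x4
//   bool b = get_bit(x,66);   // b is true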
/******************************************************************
* Some mutex helper functions *
******************************************************************/
#if defined(USE_LINUX) || defined(USE_MAC)
// Store a set of global attributes for the thread pool
static pthread_mutexattr_t threadpool_global_attr;
static int initialize_threadpool_global_attr() {
pthread_mutexattr_init(&threadpool_global_attr);
#ifdef __USE_UNIX98
pthread_mutexattr_settype( &threadpool_global_attr, PTHREAD_MUTEX_ERRORCHECK );
#endif
return 1;
}
static int threadpool_global_attr_dummy = 0;
static inline void throw_pthread_error( std::string msg, int value ) {
std::string code;
if ( value==0 ) {
code = "SUCCESS";
} else if ( value==EINVAL ) {
code = "EINVAL";
} else if ( value==EBUSY ) {
code = "EBUSY";
} else if ( value==EAGAIN ) {
code = "EAGAIN";
} else if ( value==EDEADLK ) {
code = "EDEADLK";
} else if ( value==EPERM ) {
code = "EPERM";
} else {
char tmp[100];
sprintf(tmp,"Unknown (%i)",value);
code = std::string(tmp);
}
throw std::logic_error(msg+code);
}
#endif
#ifdef USE_WINDOWS
static inline void lock_mutex( CRITICAL_SECTION *lock ) {
EnterCriticalSection(lock);
}
static inline void unlock_mutex( CRITICAL_SECTION *lock ) {
LeaveCriticalSection(lock);
}
static CRITICAL_SECTION* create_mutex( ) {
CRITICAL_SECTION *lock = new CRITICAL_SECTION;
if (!InitializeCriticalSectionAndSpinCount(lock,0x00000400) )
throw std::exception();
return lock;
}
static void destroy_mutex( CRITICAL_SECTION *lock ) {
DeleteCriticalSection(lock);
delete lock;
}
#elif defined(USE_LINUX) || defined(USE_MAC)
static inline void lock_mutex( pthread_mutex_t *lock ) {
int retval = pthread_mutex_lock(lock);
if ( retval != 0 )
throw_pthread_error("Error locking mutex: ",retval);
}
static inline void unlock_mutex( pthread_mutex_t *lock ) {
int retval = pthread_mutex_unlock(lock);
if ( retval != 0 )
throw_pthread_error("Error unlocking mutex: ",retval);
}
static pthread_mutex_t* create_mutex( ) {
pthread_mutex_t* lock = NULL;
#if defined(USE_LINUX) || defined(USE_MAC)
if (threadpool_global_attr_dummy!=1)
threadpool_global_attr_dummy = initialize_threadpool_global_attr();
#endif
// We are creating a new mutex
lock = new pthread_mutex_t;
int error = pthread_mutex_init(lock,&threadpool_global_attr);
if ( error != 0 )
throw_pthread_error("Error initializing mutex: ",error);
return lock;
}
static void destroy_mutex( pthread_mutex_t* lock ) {
pthread_mutex_destroy(lock);
delete lock;
}
#else
#error Unknown OS
#endif
/******************************************************************
* Mutex class *
******************************************************************/
Mutex::Mutex()
{
d_lock = create_mutex();
d_recursive = false;
d_count = new int;
d_lock_count = new int;
d_thread = new size_t;
*d_count = 1;
*d_lock_count = 0;
*d_thread = 0;
}
Mutex::Mutex(bool recursive)
{
d_lock = create_mutex();
d_recursive = recursive;
d_count = new int;
d_lock_count = new int;
d_thread = new size_t;
*d_count = 1;
*d_lock_count = 0;
*d_thread = 0;
}
Mutex::Mutex(const Mutex& rhs)
{
rhs.lock();
d_lock = rhs.d_lock;
d_count = rhs.d_count;
d_recursive = rhs.d_recursive;
d_lock_count = rhs.d_lock_count;
d_thread = rhs.d_thread;
++(*d_count);
rhs.unlock();
}
Mutex& Mutex::operator=(const Mutex& rhs)
{
if (this == &rhs) // protect against invalid self-assignment
return *this;
rhs.lock();
this->d_lock = rhs.d_lock;
this->d_count = rhs.d_count;
this->d_recursive = rhs.d_recursive;
this->d_lock_count = rhs.d_lock_count;
this->d_thread = rhs.d_thread;
++(*this->d_count);
rhs.unlock();
return *this;
}
Mutex::~Mutex()
{
lock();
bool destroy = (*d_count)==1;
(*d_count)--;
unlock();
if ( destroy ) {
delete d_count;
delete d_lock_count;
delete d_thread;
destroy_mutex(d_lock);
}
}
void Mutex::lock() const
{
// Check if we already own the lock
size_t id = ThreadPool::getThreadId();
if ( *d_lock_count>0 && *d_thread==id ) {
if ( !d_recursive )
throw std::logic_error("Lock is already locked and non-recursive");
// Increment the lock count and return
++(*d_lock_count);
return;
}
// Acquire the lock
lock_mutex(d_lock);
if ( *d_lock_count != 0 ) // If we are getting the lock, the count must be 0
throw std::logic_error("Internal error");
*d_lock_count = 1; // Change lock count after acquiring mutex
*d_thread = id;
}
bool Mutex::tryLock() const
{
// Check if we already own the lock
size_t id = ThreadPool::getThreadId();
if ( *d_lock_count>0 && *d_thread==id ) {
if ( !d_recursive )
return false;
// Increment the lock count and return
++(*d_lock_count);
return true;
}
// Try and acquire the lock
#ifdef USE_WINDOWS
bool success = TryEnterCriticalSection(d_lock)!=0;
#elif defined(USE_LINUX) || defined(USE_MAC)
bool success = pthread_mutex_trylock(const_cast<pthread_mutex_t*>(d_lock))==0;
#else
#error Unknown OS
#endif
if ( success ) {
if ( *d_lock_count != 0 ) // If we are getting the lock, the count must be 0
throw std::logic_error("Internal error");
*d_lock_count = 1; // Change lock count after acquiring mutex
*d_thread = id;
}
return success;
}
void Mutex::unlock() const
{
// Check if we already own the lock
size_t id = ThreadPool::getThreadId();
if ( *d_lock_count <= 0 )
throw std::logic_error("Trying to release a lock that has not been locked");
if ( *d_thread != id )
throw std::logic_error("Thread that does not own lock is attempting to release");
// Release the lock
--(*d_lock_count); // Change lock count before releasing mutex
if ( *d_lock_count == 0 ) {
*d_thread = 0;
unlock_mutex(d_lock);
}
}
bool Mutex::ownLock() const
{
size_t id = ThreadPool::getThreadId();
if ( *d_lock_count>0 && *d_thread==id )
return true;
return false;
}
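// Example usage of the Mutex class (an illustrative sketch; the names below
// are hypothetical and not part of this file). A recursive Mutex may be
// re-locked by the thread that owns it and is only released once every lock()
// has been matched by an unlock().
//   static Mutex counter_lock(true);   // recursive mutex
//   static int counter = 0;
//   void increment_twice() {
//       counter_lock.lock();
//       counter_lock.lock();           // ok: same thread, recursive
//       counter += 2;
//       counter_lock.unlock();
//       counter_lock.unlock();         // mutex released here
//   }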
/******************************************************************
* Functions to deal with the signaling *
******************************************************************/
#ifdef USE_WINDOWS
static inline bool SIGNAL_EVENT(HANDLE event) {
SetEvent(event);
return false;
}
#elif defined(USE_LINUX) || defined(USE_MAC)
static inline bool SIGNAL_EVENT(pthread_cond_t *event) {
int retval = pthread_cond_signal(event);
if ( retval != 0 ) {
perr << "Error signaling event\n";
return true;
}
return false;
}
#else
#error Not programmed
#endif
/******************************************************************
* Simple function to check if the parity is odd (true) or even *
******************************************************************/
static inline bool is_odd8(size_t x) { // This only works for 64-bit integers
x ^= (x >> 1);
x ^= (x >> 2);
x ^= (x >> 4);
x ^= (x >> 8);
x ^= (x >> 16);
x ^= (x >> 32);
return (x & 0x01) > 0;
}
template<class int_type>
static inline int count_bits(int_type x) {
int count = 0;
for (size_t i=0; i<8*sizeof(int_type); i++) {
if ( (x>>i)&0x01 )
++count;
}
return count;
}
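// Illustrative examples: count_bits counts the set bits directly, while
// is_odd8 folds a 64-bit word onto itself so the lowest bit holds the parity.
//   count_bits(0x05);   // returns 2 (two bits set)
//   is_odd8(0x05);      // returns false (even parity)
//   is_odd8(0x07);      // returns true  (three bits set)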
/******************************************************************
* Set the behavior of OS warnings *
******************************************************************/
static int global_OS_behavior = 0;
void ThreadPool::set_OS_warnings( int behavior )
{
ASSERT(behavior>=0&&behavior<=2);
global_OS_behavior = behavior;
}
static void OS_warning( const std::string& message )
{
if ( global_OS_behavior==0 ) {
pout << "Warning: " << message << std::endl;
} else if ( global_OS_behavior==2 ) {
perr << "Error: " << message << std::endl;
}
}
/******************************************************************
* Function to return the number of processors available *
******************************************************************/
int ThreadPool::getNumberOfProcessors()
{
#if defined(USE_LINUX) || defined(USE_MAC)
return sysconf( _SC_NPROCESSORS_ONLN );
#elif defined(USE_WINDOWS)
SYSTEM_INFO sysinfo;
GetSystemInfo( &sysinfo );
return static_cast<int>(sysinfo.dwNumberOfProcessors);
#else
#error Unknown OS
#endif
}
/******************************************************************
* Function to return the processor number of the current thread *
******************************************************************/
int ThreadPool::getCurrentProcessor()
{
#if defined(USE_LINUX)
return sched_getcpu()+1;
#elif defined(USE_MAC)
OS_warning("MAC does not support getCurrentProcessor");
return 0;
#elif defined(USE_WINDOWS)
return GetCurrentProcessorNumber()+1;
#else
#error Unknown OS
#endif
}
/******************************************************************
* Function to get/set the affinity of the current process *
******************************************************************/
std::vector<int> ThreadPool::getProcessAffinity()
{
std::vector<int> procs;
#ifdef USE_LINUX
#ifdef _GNU_SOURCE
cpu_set_t mask;
int error = sched_getaffinity(getpid(), sizeof(cpu_set_t), &mask );
if ( error!=0 )
throw std::logic_error("Error getting process affinity");
for (int i=0; i<(int)sizeof(cpu_set_t)*CHAR_BIT; i++) {
if ( CPU_ISSET(i,&mask) )
procs.push_back(i);
}
#else
#warning sched_getaffinity is not supported for this compiler/OS
OS_warning("sched_getaffinity is not supported for this compiler/OS");
procs.clear();
#endif
#elif defined(USE_MAC)
// MAC does not support getting or setting the affinity
OS_warning("MAC does not support getting the process affinity");
procs.clear();
#elif defined(USE_WINDOWS)
HANDLE hProc = GetCurrentProcess();
size_t procMask;
size_t sysMask;
PDWORD_PTR procMaskPtr = reinterpret_cast<PDWORD_PTR>(&procMask);
PDWORD_PTR sysMaskPtr = reinterpret_cast<PDWORD_PTR>(&sysMask);
GetProcessAffinityMask(hProc,procMaskPtr,sysMaskPtr);
for (int i=0; i<(int)sizeof(size_t)*CHAR_BIT; i++) {
if ( (procMask&0x1) != 0 )
procs.push_back(i);
procMask >>= 1;
}
#else
#error Unknown OS
#endif
return procs;
}
void ThreadPool::setProcessAffinity( std::vector<int> procs )
{
#ifdef USE_LINUX
#ifdef _GNU_SOURCE
cpu_set_t mask;
CPU_ZERO(&mask);
for (size_t i=0; i<procs.size(); i++)
CPU_SET(procs[i],&mask);
int error = sched_setaffinity(getpid(), sizeof(cpu_set_t), &mask );
if ( error!=0 )
throw std::logic_error("Error setting process affinity");
#else
#warning sched_setaffinity is not supported for this compiler/OS
OS_warning("sched_setaffinity is not supported for this compiler/OS");
procs.clear();
#endif
#elif defined(USE_MAC)
// MAC does not support getting or setting the affinity
OS_warning("Warning: MAC does not support setting the process affinity");
procs.clear();
#elif defined(USE_WINDOWS)
DWORD mask = 0;
for (size_t i=0; i<procs.size(); i++)
mask |= ((DWORD)1) << procs[i];
HANDLE hProc = GetCurrentProcess();
SetProcessAffinityMask( hProc, mask );
#else
#error Unknown OS
#endif
}
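// Illustrative sketch (Linux/Windows only; on Mac these calls just print a
// warning): restrict the current process to the first two processors it
// already owns.
//   std::vector<int> cpus = ThreadPool::getProcessAffinity();
//   if ( cpus.size() > 2 ) {
//       cpus.resize(2);
//       ThreadPool::setProcessAffinity(cpus);
//   }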
/******************************************************************
* Function to get the thread affinities *
******************************************************************/
#ifdef USE_WINDOWS
DWORD GetThreadAffinityMask(HANDLE thread)
{
DWORD mask = 1;
DWORD old = 0;
// try every CPU one by one until one works or none are left
while(mask)
{
old = SetThreadAffinityMask(thread, mask);
if(old)
{ // this one worked
SetThreadAffinityMask(thread, old); // restore original
return old;
}
else
{
if(GetLastError() != ERROR_INVALID_PARAMETER)
return 0; // fatal error, might as well throw an exception
}
mask <<= 1;
}
return 0;
}
#endif
std::vector<int> ThreadPool::getThreadAffinity()
{
std::vector<int> procs;
#ifdef USE_LINUX
#ifdef _GNU_SOURCE
cpu_set_t mask;
int error = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask );
if ( error!=0 )
throw std::logic_error("Error getting thread affinity");
for (int i=0; i<(int)sizeof(cpu_set_t)*CHAR_BIT; i++) {
if ( CPU_ISSET(i,&mask) )
procs.push_back(i);
}
#else
#warning pthread_getaffinity_np is not supported
OS_warning("pthread does not support pthread_getaffinity_np");
procs.clear();
#endif
#elif defined(USE_MAC)
// MAC does not support getting or setting the affinity
OS_warning("MAC does not support getting the thread affinity");
procs.clear();
#elif defined(USE_WINDOWS)
size_t procMask = GetThreadAffinityMask(GetCurrentThread());
for (int i=0; i<(int)sizeof(size_t)*CHAR_BIT; i++) {
if ( (procMask&0x1) != 0 )
procs.push_back(i);
procMask >>= 1;
}
#else
#error Unknown OS
#endif
return procs;
}
std::vector<int> ThreadPool::getThreadAffinity( int thread ) const
{
if ( thread >= getNumThreads() )
std::logic_error("Invalid thread number");
std::vector<int> procs;
#ifdef USE_LINUX
#ifdef _GNU_SOURCE
cpu_set_t mask;
int error = pthread_getaffinity_np(d_hThread[thread], sizeof(cpu_set_t), &mask );
if ( error!=0 )
throw std::logic_error("Error getting thread affinity");
for (int i=0; i<(int)sizeof(cpu_set_t)*CHAR_BIT; i++) {
if ( CPU_ISSET(i,&mask) )
procs.push_back(i);
}
#else
#warning pthread_getaffinity_np is not supported
OS_warning("pthread does not support pthread_getaffinity_np");
procs.clear();
#endif
#elif defined(USE_MAC)
// MAC does not support getting or setting the affinity
OS_warning("MAC does not support getting the thread affinity");
procs.clear();
#elif defined(USE_WINDOWS)
size_t procMask = GetThreadAffinityMask(d_hThread[thread]);
for (int i=0; i<(int)sizeof(size_t)*CHAR_BIT; i++) {
if ( (procMask&0x1) != 0 )
procs.push_back(i);
procMask >>= 1;
}
#else
#error Unknown OS
#endif
return procs;
}
/******************************************************************
* Function to set the thread affinity *
******************************************************************/
void ThreadPool::setThreadAffinity( std::vector<int> procs )
{
#ifdef USE_LINUX
#ifdef _GNU_SOURCE
cpu_set_t mask;
CPU_ZERO(&mask);
for (size_t i=0; i<procs.size(); i++)
CPU_SET(procs[i],&mask);
int error = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask );
if ( error!=0 )
throw std::logic_error("Error setting thread affinity");
#else
#warning pthread_getaffinity_np is not supported
OS_warning("pthread does not support pthread_setaffinity_np");
procs.clear();
#endif
#elif defined(USE_MAC)
// MAC does not support getting or setting the affinity
NULL_USE(procs);
OS_warning("MAC does not support setting the thread affinity");
#elif defined(USE_WINDOWS)
DWORD mask = 0;
for (size_t i=0; i<procs.size(); i++)
mask |= ((DWORD)1) << procs[i];
SetThreadAffinityMask( GetCurrentThread(), mask );
#else
#error Unknown OS
#endif
}
void ThreadPool::setThreadAffinity( int thread, std::vector<int> procs ) const
{
if ( thread >= getNumThreads() )
std::logic_error("Invalid thread number");
#ifdef USE_LINUX
#ifdef __USE_GNU
cpu_set_t mask;
CPU_ZERO(&mask);
for (size_t i=0; i<procs.size(); i++)
CPU_SET(procs[i],&mask);
int error = pthread_setaffinity_np(d_hThread[thread], sizeof(cpu_set_t), &mask );
if ( error!=0 )
throw std::logic_error("Error setting thread affinity");
#else
#warning pthread_getaffinity_np is not supported
OS_warning("pthread does not support pthread_setaffinity_np");
procs.clear();
#endif
#elif defined(USE_MAC)
// MAC does not support getting or setting the affinity
NULL_USE(procs);
OS_warning("MAC does not support getting the process affinity");
#elif defined(USE_WINDOWS)
DWORD mask = 0;
for (size_t i=0; i<procs.size(); i++)
mask |= ((DWORD)1) << procs[i];
SetThreadAffinityMask( d_hThread[thread], mask );
#else
#error Unknown OS
#endif
}
/******************************************************************
* Function to perform some basic checks before we start *
******************************************************************/
void ThreadPool::check_startup(size_t size0)
{
// Check the size of the class to make sure that we don't have any
// byte alignment problems between a library implementation and a calling package
size_t size1 = sizeof(ThreadPool);
size_t size2 = ((size_t)&d_NULL_HEAD)-((size_t)this)+sizeof(size_t);
size_t size3 = ((size_t)&d_NULL_TAIL)-((size_t)this)+sizeof(size_t);
if ( size0!=size1 || size1<size2 || size1<size3 )
throw std::logic_error("Internal data format problem");
// Check the size of variables
if ( sizeof(ThreadPool::uint64)!=8 )
throw std::logic_error("uint64 is not 64 bits");
if ( sizeof(AtomicOperations::int32_atomic)!=4 )
throw std::logic_error("AtomicOperations::int32_atomic is not 32 bits");
if ( sizeof(AtomicOperations::int64_atomic)!=8 )
throw std::logic_error("AtomicOperations::int32_atomic is not 64 bits");
// Check getting/setting a bit
uint64 x[2] = {0x0,0x7};
set_bit(x,2,true);
set_bit(x,66,false);
if ( x[0]!=4 || x[1]!=3 || !get_bit(x,2) || get_bit(x,66) )
throw std::logic_error("Getting/setting a bit failed");
// Check the thread id
bool pass = true;
ThreadPool::thread_id_t id;
if ( id.getPriority()!=-128 )
pass = false;
id.reset(3,564,NULL);
if ( id.getPriority()!=3 || id.getLocalID()!=564 )
pass = false;
if ( count_bits(0x0)!=0 || count_bits(0x03)!=2 )
pass = false;
if ( count_bits(~((size_t)0)) != 8*sizeof(size_t) )
pass = false;
if ( sizeof(size_t)==8 ) {
if ( is_odd8(0x0) || !is_odd8(0x02) || is_odd8(0x03) )
pass = false;
if ( is_odd8(~((size_t)0)) || !is_odd8(MAXID64))
pass = false;
for (size_t i=0; i<1024; i++) {
if ( (count_bits(MAXID64-i)%2==1) != is_odd8(MAXID64-i) ) {
printp("%i %i %s\n",count_bits(MAXID64-i),is_odd8(MAXID64-i)?1:0,
convert_binary<unsigned long long int>(MAXID64-i).c_str());
pass = false;
}
}
}
initialize_id();
advance_id(); advance_id();
ThreadPool::thread_id_t id2;
id2.reset(3,d_id_assign,NULL);
if ( isValid(id) || !isValid(id2) )
pass = false;
if ( !pass ) {
throw std::logic_error("Thread pool failed to initialize");
}
}
/******************************************************************
* Function to initialize the thread pool *
******************************************************************/
void ThreadPool::initialize( const int N, const char* affinity, int N_procs, const int* procs )
{
// Get the clock frequency
#if MONITOR_THREADPOOL_PERFORMANCE==1
get_frequency( &frequency );
#endif
// Initialize the header/tail
d_NULL_HEAD = rand_size_t();
d_NULL_TAIL = d_NULL_HEAD;
for (int i=0; i<MAX_NUM_THREADS; i++)
d_hThread[i] = 0;
// Initialize the variables to NULL values
d_id_assign = 0;
d_signal_empty = false;
d_signal_count = 0;
d_N_threads = 0;
d_num_active = 0;
d_queue_size = 0;
d_N_wait = 0;
for (int i=0; i<MAX_NUM_THREADS; i++)
d_ThreadId[i] = ~((size_t)0);
memset((void*)d_active,0,MAX_NUM_THREADS/8);
memset((void*)d_cancel,0,MAX_NUM_THREADS/8);
for (int i=0; i<MAX_QUEUED; i++) {
d_queue_ids[i].reset();
d_queue_list[i].reset();
d_queue_list[i].position = i;
d_queue_list[i].prev = i-1;
d_queue_list[i].next = i+1;
}
d_queue_head = -1;
d_queue_free = 0;
for (int i=0; i<MAX_WAIT; i++)
d_wait[i] = NULL;
d_wait_finished = 0;
d_lock_queue = 0;
for (int i=0; i<MAX_NUM_THREADS; i++)
d_hThread[i] = 0;
#if defined(USE_LINUX) || defined(USE_MAC)
d_queue_not_empty = 0;
#endif
// Initialize the id
initialize_id();
// Create the mutex lock and signal variables
d_lock_queue = create_mutex();
#ifdef USE_WINDOWS
d_wait_finished = CreateEvent(NULL,FALSE,FALSE,NULL);
#elif defined(USE_LINUX) || defined(USE_MAC)
d_queue_not_empty = new pthread_cond_t;
d_wait_finished = new pthread_cond_t;
int error = pthread_cond_init(d_queue_not_empty,NULL);
if ( error != 0 )
perr << "Error creating d_queue_not_empty\n";
error = pthread_cond_init(d_wait_finished,NULL);
if ( error != 0 )
perr << "Error creating d_wait_finished\n";
#else
#error Not programmed
#endif
// Create the threads
setNumThreads(N,affinity,N_procs,procs);
}
/******************************************************************
* This is the destructor *
******************************************************************/
ThreadPool::~ThreadPool() {
if ( !is_valid(this) )
throw std::logic_error("Thread pool is not valid");
// Destroy the threads
setNumThreads(0);
// Delete all remaining data
destroy_mutex(d_lock_queue);
#ifdef USE_WINDOWS
CloseHandle(d_wait_finished);
#elif defined(USE_LINUX) || defined(USE_MAC)
pthread_cond_destroy(d_wait_finished);
pthread_cond_destroy(d_queue_not_empty);
delete d_queue_not_empty; d_queue_not_empty=NULL;
delete d_wait_finished; d_wait_finished=NULL;
#else
#error Not programmed
#endif
d_N_threads = -1;
d_NULL_HEAD = 0;
d_NULL_TAIL = 0;
// Print the performance metrics
#if MONITOR_THREADPOOL_PERFORMANCE==1
printp("ThreadPool Performance:\n");
printp("add_work: %e %e %e\n",total_add_work_time[0],total_add_work_time[1],total_add_work_time[2]);
#endif
}
/******************************************************************
* Check if the pointer points to a valid thread pool object *
******************************************************************/
bool ThreadPool::is_valid( const ThreadPool* tpool )
{
if ( tpool == NULL )
return false;
if ( tpool->d_N_threads<0 || tpool->d_N_threads>MAX_NUM_THREADS )
return false;
if ( tpool->d_NULL_HEAD==0 || tpool->d_NULL_HEAD!=tpool->d_NULL_TAIL )
return false;
return true;
}
/******************************************************************
* This function creates the threads in the thread pool *
******************************************************************/
void ThreadPool::setNumThreads( int num_worker_threads,
const char* affinity2, int N_procs, const int* procs )
{
// Check if we are a member thread
if ( isMemberThread() )
throw std::logic_error("Member threads are not allowed to change the number of threads in the pool");
// Determine the number of threads we need to create or destroy
if ( num_worker_threads > MAX_NUM_THREADS ) {
printp("Warning: Maximum Number of Threads is %i\n",MAX_NUM_THREADS);
printp(" Only that number will be created\n");
num_worker_threads = MAX_NUM_THREADS;
} else if ( num_worker_threads < 0 ) {
printp("Error: cannot have a negitive number of threads\n");
printp(" Setting the number of threads to 0\n");
num_worker_threads = 0;
}
int d_N_threads_diff = num_worker_threads-d_N_threads;
if ( d_N_threads_diff > 0 ) {
// Create new threads
lock_mutex(d_lock_queue);
// Check that no threads are in the process of being deleted
for (int i=0; i<MAX_NUM_THREADS/64; i++) {
if ( d_cancel[i] != 0 )
throw std::logic_error("Threads are being created and destroyed at the same time");
}
// Create the thread attributes (linux only)
#if defined(USE_LINUX) || defined(USE_MAC)
pthread_attr_t attr;
pthread_attr_init(&attr);
//int ptmp;
//pthread_attr_setstacksize(&attr,2097152); // Default stack size is 8MB
//pthread_attr_setschedpolicy(&attr,1);
//pthread_attr_getschedpolicy(&attr,&ptmp);
//pout << "getschedpolicy = " << ptmp << std::endl;
#endif
// Create the threads
void **tmp = new void*[2*d_N_threads_diff];
int j = d_N_threads;
for (int i=0; i<d_N_threads_diff; i++) {
d_N_threads++;
tmp[0+2*i] = this;
tmp[1+2*i] = reinterpret_cast<void*>(static_cast<size_t>(j));
bool error = false;
set_bit(d_cancel,j,true);
#ifdef USE_WINDOWS
d_hThread[j] = (HANDLE)_beginthread( create_new_thread, 0, (void *) &tmp[2*i]);
error = d_hThread[j]==(HANDLE)(-1);
#elif defined(USE_LINUX) || defined(USE_MAC)
int rtn = pthread_create( &d_hThread[j], &attr, (void *(*)(void*)) create_new_thread, (void *) &tmp[2*i]);
error = rtn!=0;
#else
#error Not programmed
#endif
if ( error ) {
pout << "Warning: Only able to create " << i << " threads\n";
break;
}
j++;
}
// Wait for all of the threads to finish initialization
while ( 1 ) {
unlock_mutex(d_lock_queue);
Sleep(25);
lock_mutex(d_lock_queue);
bool wait = false;
for (int i=0; i<MAX_NUM_THREADS/64; i++) {
if ( d_cancel[i] != 0 )
wait = true;
}
if ( !wait )
break;
}
// Delete the thread attributes (linux only)
#if defined(USE_LINUX) || defined(USE_MAC)
pthread_attr_destroy(&attr);
#endif
// Release the lock
unlock_mutex(d_lock_queue);
Sleep(25);
delete [] tmp;
} else if ( d_N_threads_diff < 0 ) {
// Reduce the number of threads
if ( num_worker_threads==0 ) {
// Special case if we want to delete all of the threads
wait_pool_finished();
}
// Lock the mutex for the deletion of existing threads
lock_mutex(d_lock_queue);
// Tell the threads to shutdown
for (int i=0; i>d_N_threads_diff; i--)
set_bit(d_cancel,d_N_threads-1+i,true);
#ifdef USE_WINDOWS
// Release the lock
unlock_mutex(d_lock_queue);
// Wake all threads to process the shutdown (Doesn't require blocking)
for (int i=0; i<d_N_threads; i++) {
ResumeThread(d_hThread[i]);
}
#elif defined(USE_LINUX) || defined(USE_MAC)
// Wake all threads to process the shutdown
int error = pthread_cond_broadcast(d_queue_not_empty);
if ( error != 0 )
perr << "Error in signaling thread";
// Release the lock
unlock_mutex(d_lock_queue);
#else
#error Not programmed
#endif
Sleep(25);
// Wait for all of the threads to close
#ifdef USE_WINDOWS
int j = d_N_threads+d_N_threads_diff;
WaitForMultipleObjects( -d_N_threads_diff, &d_hThread[j], 1, 10000 );
#elif defined(USE_LINUX) || defined(USE_MAC)
for (int i=0; i>d_N_threads_diff; i--) {
int rtn = pthread_join(d_hThread[d_N_threads-1+i],NULL);
if ( rtn != 0 ) {
perr << "error\n";
perr << "Error joining threads";
}
}
#else
#error Not programmed
#endif
for (int i=0; i>d_N_threads_diff; i--) {
set_bit(d_cancel,d_N_threads-1+i,false);
d_hThread[d_N_threads-1+i] = 0;
d_ThreadId[d_N_threads-1+i] = ~((size_t)0);
}
d_N_threads += d_N_threads_diff;
}
if ( d_N_threads == 0 )
return;
// Get the default thread affinity to use
std::vector<int> cpus;
int tmp = global_OS_behavior;
global_OS_behavior = 1;
OS_warning("Dummy message (should not print)");
try {
cpus = ThreadPool::getProcessAffinity();
} catch(...) {
pout << "Warning: Unable to get default cpus for thread affinities\n";
}
if ( !cpus.empty() && N_procs>0 ) {
cpus.resize(N_procs);
for (int i=0; i<N_procs; i++)
cpus[i] = procs[i];
}
// Set the affinity model and the associated thread affinities
// Note: not all OS's support setting the thread affinities
std::vector<std::vector<int> > t_procs(d_N_threads);
std::string affinity(affinity2);
if ( cpus.empty() ) {
// We do not have a list of cpus to use, do nothing (OS not supported)
} else if ( affinity=="none" ) {
// We are using the default thread affinities (all threads get all procs of the program)
for (int i=0; i<d_N_threads; i++)
t_procs[i] = cpus;
} else if ( affinity=="independent" ) {
// We want to use an independent set of processors for each thread
if ( (int) cpus.size() == d_N_threads ) {
// The number of cpus matches the number of threads
for (int i=0; i<d_N_threads; i++)
t_procs[i] = std::vector<int>(1,cpus[i]);
} else if ( (int) cpus.size() > d_N_threads ) {
// There are more cpus than threads, so threads will use more than one processor
int N_procs_thread = (cpus.size()+d_N_threads-1)/d_N_threads;
size_t k = 0;
for (int i=0; i<d_N_threads; i++) {
for (int j=0; j<N_procs_thread && k<cpus.size(); j++) {
t_procs[i].push_back( cpus[k] );
k++;
}
}
} else {
// There are fewer cpus than threads, threads will share a processor
int N_threads_proc = (cpus.size()+d_N_threads-1)/cpus.size();
for (int i=0; i<d_N_threads; i++)
t_procs[i].push_back( cpus[i/N_threads_proc] );
}
} else {
global_OS_behavior = tmp;
throw std::logic_error("Unknown affinity model");
}
try {
for (int i=0; i<d_N_threads; i++) {
ThreadPool::setThreadAffinity( i, t_procs[i] );
std::vector<int> cpus2 = getThreadAffinity( i );
if ( cpus2 != t_procs[i] )
pout << "Warning: error setting affinities (failed to set)\n";
}
} catch (...) {
pout << "Warning: error setting affinities (exception)\n";
}
global_OS_behavior = tmp;
}
/******************************************************************
* Function to return the thread number of the current thread *
******************************************************************/
int ThreadPool::getThreadNumber() const
{
size_t id = getThreadId();
int index = 0;
for (int i=0; i<d_N_threads; i++) {
if ( d_ThreadId[i]==id )
index = i+1;
}
return index;
}
/******************************************************************
* Get an item in the work queue that is ready to be processed *
******************************************************************/
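// The queue is kept as a doubly-linked list ordered by the thread_id_t
// comparison (priority first). get_work_item walks the list from the head;
// for each candidate check_dependecies descends into its unfinished
// dependencies: finished dependencies are pruned from the work item,
// unstarted ones are returned so they run first, and if any dependency is
// still executing the candidate is skipped for now.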
short int ThreadPool::get_work_item( )
{
const thread_id_t *ids = const_cast<const thread_id_t*>(d_queue_ids);
const queue_list_struct *list = const_cast<const queue_list_struct*>(d_queue_list);
short int index = d_queue_head;
short int index2 = check_dependecies(list,ids,index);
while ( index2==-1 && index!=-1 ) {
index = d_queue_list[index].next;
index2 = index==-1 ? -1:check_dependecies(list,ids,index);
}
return index2;
}
inline short int ThreadPool::check_dependecies( const ThreadPool::queue_list_struct *list,
const thread_id_t *queue, short int index )
{
if ( index==-1 )
return -1;
WorkItem* work = reinterpret_cast<WorkItem*>(queue[index].d_work);
// Loop through the dependencies, removing any that have finished,
// and search for any that have not started (keeping the one with the fewest dependencies)
size_t N_active = 0;
thread_id_t* ids = work->d_ids;
short int index2 = index;
int N_dependencies = static_cast<int>(work->d_N_ids);
for (int i=N_dependencies-1; i>=0; i--) {
WorkItem* work2 = reinterpret_cast<WorkItem*>(ids[i].d_work);
char state = work2->d_state;
if ( state==0 ) {
// We found a new potential item to process
index2 = work2->d_tpool_index;
index2 = check_dependecies(list,queue,index2);
if ( index2 != -1 )
break;
} else if ( state==1 || state==-1 ) {
// We found an item that is processing
N_active++;
} else if ( state==2 ) {
// The item has finished
ids[i].reset();
std::swap(ids[i],ids[work->d_N_ids-1]);
work->d_N_ids--;
continue;
}
}
if ( N_active>0 ) {
// Some dependencies are working, choose a different work item
index2 = -1;
}
return index2;
}
/******************************************************************
* This is the function that controls the individual thread and *
* allows it to do work. *
******************************************************************/
void ThreadPool::tpool_thread(int thread_id)
{
if ( getThreadId()==0 )
throw std::logic_error("Invalid thread id");
bool shutdown = false;
bool printInfo = false;
d_ThreadId[thread_id] = getThreadId();
// Acquire mutex
lock_mutex(d_lock_queue);
if ( get_bit(d_active,thread_id) )
throw std::logic_error("Thread cannot already be active");
d_num_active++;
set_bit(d_active,thread_id,true);
set_bit(d_cancel,thread_id,false);
if ( printInfo ) {
// Print the pid
printp("pid = %i\n",(int)getpid());
// Print the processor affinities for the process
try {
std::vector<int> cpus = ThreadPool::getProcessAffinity();
printp("%i cpus for current thread: ",(int)cpus.size());
for (size_t i=0; i<cpus.size(); i++)
printp("%i ",cpus[i]);
printp("\n");
} catch(...) {
printp("Unable to get process affinity\n");
}
}
// Check for shutdown
shutdown = false;
//pout << "Thread initialized\n";
PROFILE_THREAD_START("thread active");
while ( !shutdown ) {
// Check if there is work to do
if ( d_queue_size>0 ) {
// Get next work item to process
short int work_index = ThreadPool::get_work_item();
if ( work_index==-1 ) {
unlock_mutex(d_lock_queue);
Sleep(0);
lock_mutex(d_lock_queue);
continue;
}
// Remove the work item from the queue
#ifdef D_DEBUG
short int cur = d_queue_list[work_index].position;
#endif
short int next = d_queue_list[work_index].next;
short int prev = d_queue_list[work_index].prev;
if ( prev==-1 ) {
d_queue_head = next;
} else {
d_queue_list[prev].next = next;
}
if ( next!=-1 ) {
d_queue_list[next].prev = prev;
}
--d_queue_size;
#ifdef D_DEBUG
if ( cur!=work_index || ( d_queue_size>0 && d_queue_head==-1 ) )
throw std::logic_error("Internal error with threadpool");
#endif
thread_id_t work_id = const_cast<thread_id_t&>(d_queue_ids[work_index]);
d_queue_ids[work_index].reset();
d_queue_list[work_index].reset();
d_queue_list[work_index].next = d_queue_free;
d_queue_free = work_index;
WorkItem* work = reinterpret_cast<WorkItem*>(work_id.d_work);
work->d_state = -1;
// Release mutex
unlock_mutex(d_lock_queue);
// Start work here
PROFILE_THREAD_START("thread working");
work->run();
if ( work->d_state!=2 ) { throw std::logic_error("Work item is not changing state"); }
PROFILE_THREAD_STOP("thread working");
// Work finished, acquire mutex and remove it from the active list
lock_mutex(d_lock_queue);
// Check if any threads are waiting on the current work item
for (int i=0; i<d_N_wait; i++) {
wait_event_struct* wait = const_cast<wait_event_struct*>(d_wait[i]);
bool found = false;
if ( wait->ids.empty() ) {
// Special case where we just want to wait for any work items to finish
found = true;
} else {
found = find_id( wait->ids, work_id );
}
if ( found ) {
wait_type event = 0;
volatile int* count = &(wait->count);
if ( *count == 1 )
event = const_cast<wait_type>(wait->wait_event);
--(*count);
if ( event != 0 )
SIGNAL_EVENT(event);
}
}
// Check the signal count and signal if desired
if ( d_signal_count > 0 ) {
--d_signal_count;
if ( d_signal_count == 0 )
SIGNAL_EVENT(d_wait_finished);
}
} else {
int N_active = --d_num_active;
set_bit(d_active,thread_id,false);
// Alert main thread that a thread finished processing
if ( N_active==0 ) {
if ( d_signal_empty ) {
SIGNAL_EVENT(d_wait_finished);
d_signal_empty = false;
}
}
// Wait for work
PROFILE_THREAD_STOP2("thread active");
#ifdef USE_WINDOWS
unlock_mutex(d_lock_queue);
SuspendThread(d_hThread[thread_id]);
lock_mutex(d_lock_queue);
#elif defined(USE_LINUX) || defined(USE_MAC)
pthread_cond_wait(d_queue_not_empty,d_lock_queue);
#endif
PROFILE_THREAD_START2("thread active");
++d_num_active;
set_bit(d_active,thread_id,true);
}
// Check if there is a shutdown requested
shutdown = get_bit(d_cancel,thread_id);
}
PROFILE_THREAD_STOP("thread active");
d_num_active--;
set_bit(d_active,thread_id,false);
// Release mutex
unlock_mutex(d_lock_queue);
return;
}
/******************************************************************
* This is the function that adds work to the thread pool *
* Note: this version uses a last in - first out work scheduling. *
******************************************************************/
void ThreadPool::add_work( size_t N, ThreadPool::WorkItem* work[],
const int* priority, ThreadPool::thread_id_t* ids )
{
#if MONITOR_THREADPOOL_PERFORMANCE
TIME_TYPE start_time_local;
get_time(&start_time_local);
#endif
// If we have a very long list, break it up into smaller pieces to keep the threads busy
const size_t block_size = MAX_QUEUED/4;
if ( N > block_size ) {
size_t N_sets = (N+block_size-1)/block_size;
for (size_t i=0; i<N_sets; i++) {
size_t index = i*block_size;
size_t N2 = std::min<size_t>(block_size,N-index);
add_work( N2, &work[index], &priority[index], &ids[index] );
}
return;
}
// Create the thread ids (can be done without blocking)
for (size_t i=0; i<N; i++) {
ids[i].reset(priority[i],advance_id(),work[i]);
work[i]->d_tpool_index = -2;
}
// If there are no threads, perform the work immediately
if ( d_N_threads < 1 ) {
for (size_t i=0; i<N; i++) {
work[i]->run();
}
return;
}
// Wait for enough room in the queue (reading d_queue_size without the lock is fine since the estimate does not need to be exact)
if ( N > static_cast<size_t>(MAX_QUEUED-d_queue_size) ) {
int N_wait = static_cast<int>( N - (MAX_QUEUED-d_queue_size) );
while ( N_wait > 0 ) {
d_signal_count = static_cast<unsigned char>(std::min(N_wait,255));
#ifdef USE_WINDOWS
DWORD ret = WaitForSingleObject( d_wait_finished, INFINITE );
#elif defined(USE_LINUX) || defined(USE_MAC)
lock_mutex(d_lock_queue);
if ( d_signal_count > 0 )
pthread_cond_wait(d_wait_finished,d_lock_queue);
unlock_mutex(d_lock_queue);
#else
#error Not programmed
#endif
N_wait = static_cast<int>( N - (MAX_QUEUED-d_queue_size) );
}
}
// Get the lock and add the work items
lock_mutex(d_lock_queue);
#if MONITOR_THREADPOOL_PERFORMANCE
TIME_TYPE stop_time_local;
get_time(&stop_time_local);
total_add_work_time[0] += get_diff(start_time_local,stop_time_local,frequency);
#endif
// Next create the work items and add them to the queue
for (size_t i=0; i<N; i++) {
queue_list_struct *work_item = const_cast<queue_list_struct*>(&d_queue_list[d_queue_free]);
d_queue_free = work_item->next;
work_item->next = -1;
work_item->prev = -1;
d_queue_ids[work_item->position] = ids[i];
reinterpret_cast<WorkItem*>(ids[i].d_work)->d_tpool_index = work_item->position;
if ( d_queue_head==-1 ) {
d_queue_head = work_item->position;
} else if ( ids[i] > d_queue_ids[d_queue_list[d_queue_head].position] ) {
work_item->next = d_queue_head;
d_queue_list[d_queue_head].prev = work_item->position;
d_queue_head = work_item->position;
} else {
short int prev = d_queue_head;
short int cur = d_queue_list[prev].next;
while ( cur!=-1 ) {
if ( d_queue_ids[cur] < ids[i] )
break;
prev = cur;
cur = d_queue_list[prev].next;
}
work_item->prev = prev;
work_item->next = cur;
if ( cur != -1 )
d_queue_list[cur].prev = work_item->position;
d_queue_list[prev].next = work_item->position;
}
++d_queue_size;
}
int num_active2 = d_num_active; // Copy the number of active threads to a local variable
unlock_mutex(d_lock_queue);
#if MONITOR_THREADPOOL_PERFORMANCE
get_time(&stop_time_local);
total_add_work_time[1] += get_diff(start_time_local,stop_time_local,frequency);
#endif
// Activate sleeping threads
#ifdef USE_WINDOWS
for (int i=0; i<d_N_threads; i++) {
if ( num_active2 == d_N_threads ) {
// All threads are active, no need to activate
break;
} else if ( d_queue_size == 0 ) {
// Queue is empty, no need to activate
break;
} else if ( !get_bit(d_active,i) ) {
// Thread is inactive, wake it
ResumeThread(d_hThread[i]);
}
}
#elif defined(USE_LINUX) || defined(USE_MAC)
if ( num_active2 == d_N_threads ) {
// All threads are active, no need to wake anybody
} else if ( d_queue_size == 0 ) {
// Queue is empty, no need to activate
} else if ( N == 1 ) {
// Added 1 item to the queue, wake 1 worker
int error = pthread_cond_signal(d_queue_not_empty);
if ( error != 0 )
perr << "Error in signaling thread";
} else {
// Added multiple items to the queue, wake all workers
int error = pthread_cond_broadcast(d_queue_not_empty);
if ( error != 0 )
perr << "Error in signaling thread";
}
#endif
#if MONITOR_THREADPOOL_PERFORMANCE
get_time(&stop_time_local);
total_add_work_time[2] += get_diff(start_time_local,stop_time_local,frequency);
#endif
}
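// Illustrative sketch of queuing work (hedged: 'tpool' and the WorkItem
// objects are hypothetical; only the add_work signature above is taken from
// this file):
//   ThreadPool::WorkItem* work[10];   // filled with user-defined work items
//   int priority[10] = {0};           // all items at the same priority
//   ThreadPool::thread_id_t ids[10];  // filled in by add_work
//   tpool.add_work( 10, work, priority, ids );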
/******************************************************************
* This function checks if the work item has finished *
******************************************************************/
bool ThreadPool::isFinished(ThreadPool::thread_id_t id) const
{
if ( !isValid(id) ) {
// The thread id is not valid
return false;
}
return reinterpret_cast<WorkItem*>(id.d_work)->d_state==2;
}
/******************************************************************
* This function removes a finished work item *
******************************************************************/
ThreadPool::WorkItem* ThreadPool::getFinishedWorkItem(ThreadPool::thread_id_t id) const
{
if ( !isValid(id) )
return NULL;
if ( reinterpret_cast<WorkItem*>(id.d_work)->d_state!=2 )
return NULL;
// Return the result
WorkItem* work = reinterpret_cast<WorkItem*>(id.d_work);
return work;
}
/******************************************************************
* This function waits for some of the work items to finish *
******************************************************************/
static inline void check_finished( size_t N_work, const ThreadPool::thread_id_t *ids, size_t& N_finished, bool* finished)
{
for (size_t k=0; k<N_work; k++) {
if ( !finished[k] && ids[k].finished() ) {
N_finished++;
finished[k] = true;
}
}
}
int ThreadPool::wait_some(size_t N_work, const ThreadPool::thread_id_t *ids, size_t N_wait, bool* finished) const
{
// Check the inputs
if ( N_wait<=0 || N_wait>N_work ) {
printp("Invalid arguments in thread pool wait (%i,%i)\n",(int)N_work,(int)N_wait);
return -1;
}
size_t N_finished = 0;
memset(finished,0,N_work*sizeof(bool));
// Check that all the ids are valid
size_t next_id = d_id_assign-1;
for (size_t k=0; k<N_work; k++) {
if ( !ids[k].initialized() ) {
finished[k] = true;
N_finished++;
}
size_t local_id = ids[k].getLocalID();
bool test = local_id==0 || local_id>MAXID64 || local_id<=next_id;
test = test && !finished[k];
if ( test ) {
printp("Invalid ids for wait\n");
return -1;
}
}
// Check which ids have finished
check_finished(N_work,ids,N_finished,finished);
// If enough ids have finished return
if ( N_finished >= N_wait ) {
return 0;
}
// Acquire the lock and update the finished list
// It is possible that in the time required to acquire the lock, the work items may finish
lock_mutex(d_lock_queue);
check_finished(N_work,ids,N_finished,finished);
if ( N_finished >= N_wait ) {
unlock_mutex(d_lock_queue);
return 0;
}
// Create the wait event struct
wait_event_struct* tmp = new wait_event_struct(&wait_pool);
wait_type event = tmp->wait_event;
tmp->count = static_cast<int>(N_wait-N_finished);
tmp->ids.reserve(N_wait-N_finished);
for (size_t k=0; k<N_work; k++) {
if ( !finished[k] )
tmp->ids.push_back(ids[k]);
}
quicksort(tmp->ids);
d_wait[d_N_wait] = tmp;
d_N_wait++;
// Wait for a signal indicating that a thread has finished
#ifdef USE_WINDOWS
unlock_mutex(d_lock_queue);
DWORD ret = WaitForSingleObject( event, INFINITE );
lock_mutex(d_lock_queue);
#elif defined(USE_LINUX) || defined(USE_MAC)
pthread_cond_wait(event,d_lock_queue);
#endif
// Check for remaining references to the wait struct and delete the structure
for (int k=0; k<d_N_wait; k++) {
if ( d_wait[k] == tmp ) {
for (int m=k+1; m<d_N_wait; m++)
d_wait[m-1] = d_wait[m];
d_wait[d_N_wait-1] = NULL;
break;
}
}
d_N_wait--;
delete tmp;
unlock_mutex(d_lock_queue);
// Update the ids that have finished
check_finished(N_work,ids,N_finished,finished);
if ( N_finished<N_wait && N_work!=0 ) {
throw std::logic_error("Internal error: failed to wait");
}
return 0;
}
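// Illustrative sketch (hedged, continuing the hypothetical example after
// add_work): block until at least half of the queued items have completed.
//   bool finished[10];
//   tpool.wait_some( 10, ids, 5, finished );
//   // finished[k] is now true for every id that has completed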
/******************************************************************
* This function waits for all of the threads to finish their work *
******************************************************************/
void ThreadPool::wait_pool_finished() const
{
// First check that we are not one of the threads
if ( isMemberThread() ) {
throw std::logic_error("Member thread attempted to call wait_pool_finished");
}
lock_mutex(d_lock_queue);
// Wait for all threads to finish their work
while ( d_num_active>0 || d_queue_size>0 ) {
d_signal_empty = true;
#ifdef USE_WINDOWS
unlock_mutex(d_lock_queue);
DWORD ret = WaitForSingleObject( d_wait_finished, INFINITE );
lock_mutex(d_lock_queue);
#elif defined(USE_LINUX) || defined(USE_MAC)
pthread_cond_wait(d_wait_finished,d_lock_queue);
#else
#error Not programmed
#endif
}
d_signal_empty = false;
unlock_mutex(d_lock_queue);
}
/******************************************************************
* These functions create the unique id to assign each work item *
* If id is a 32-bit number we have 4e9 possible work items *
* If id is a 64-bit number we have 9e19 possible work items and *
* we have some checking that will catch some invalid ids *
******************************************************************/
inline void ThreadPool::initialize_id()
{
// Note that the best option is to use a 64-bit integer
if ( sizeof(size_t)==8 ) {
// Set the starting value to 2^56-3
d_id_assign = MAXID64;
} else if ( sizeof(size_t)==4 ) {
// Set the starting value to 2^32-3
d_id_assign = MAXID32;
} else {
throw std::logic_error("Internal error: failed to initialize ids");
}
}
inline size_t ThreadPool::advance_id()
{
size_t id = AtomicOperations::atomic_decrement( &d_id_assign );
if ( id==0 )
throw std::logic_error("Ran out of valid ids");
return id;
}
/******************************************************************
* Function to check if the current thread is a member thread *
******************************************************************/
inline bool ThreadPool::isMemberThread() const
{
size_t id = getThreadId();
for (int i=0; i<d_N_threads; i++) {
if ( id==d_ThreadId[i] )
return true;
}
return false;
}
/******************************************************************
* Member functions of wait_event_struct *
******************************************************************/
ThreadPool::wait_event_struct::wait_event_struct( wait_pool_struct* wait_pool )
{
count = 0;
ThreadId = getThreadId();
d_wait_pool = wait_pool;
wait_event = d_wait_pool->pop();
}
ThreadPool::wait_event_struct::~wait_event_struct( )
{
d_wait_pool->push(wait_event);
}
/******************************************************************
* Member functions of wait_pool_struct *
******************************************************************/
ThreadPool::wait_pool_struct::wait_pool_struct( )
{
d_size = 16;
d_count = 0;
d_pool = new wait_type[d_size];
memset(const_cast<wait_type*>(d_pool),0,d_size*sizeof(wait_type));
d_lock = create_mutex( );
}
ThreadPool::wait_pool_struct::~wait_pool_struct( )
{
for (size_t i=0; i<d_count; i++) {
#ifdef USE_WINDOWS
CloseHandle(d_pool[i]);
#elif defined(USE_LINUX) || defined(USE_MAC)
pthread_cond_destroy(d_pool[i]);
delete d_pool[i];
#else
#error Not programmed
#endif
}
delete [] d_pool;
destroy_mutex( d_lock );
d_size = 0;
d_count = 0;
d_pool = 0;
d_lock = 0;
}
void ThreadPool::wait_pool_struct::push( ThreadPool::wait_type event )
{
lock_mutex(d_lock);
if ( d_count >= d_size ) {
volatile wait_type* tmp = d_pool;
d_pool = new wait_type[2*d_size];
memset((void*)d_pool,0,2*d_size*sizeof(wait_type));
memcpy((void*)d_pool,(void*)tmp,d_size*sizeof(wait_type));
delete [] tmp; // free the old pool (d_pool now points to the new array)
d_size = 2*d_size;
}
d_pool[d_count] = event;
++d_count;
unlock_mutex(d_lock);
}
ThreadPool::wait_type ThreadPool::wait_pool_struct::pop( )
{
lock_mutex(d_lock);
wait_type event = 0;
if ( d_count == 0 ) {
#ifdef USE_WINDOWS
event = CreateEvent(NULL,FALSE,FALSE,NULL);
#elif defined(USE_LINUX) || defined(USE_MAC)
event = new pthread_cond_t;
int error = pthread_cond_init(event,NULL);
if ( error != 0 )
throw std::logic_error("Error creating wait_event");
#else
#error Not programmed
#endif
} else {
event = d_pool[d_count-1];
--d_count;
}
unlock_mutex(d_lock);
return event;
}
/******************************************************************
* templated quicksort routine *
******************************************************************/
template <class T>
void quicksort(std::vector<T> &x)
{
int n = (int) x.size();
if ( n <= 1 )
return;
T *arr = &x[0];
bool test;
int i, ir, j, jstack, k, l, istack[100];
T a, tmp_a;
jstack = 0;
l = 0;
ir = n-1;
while (1) {
if ( ir-l < 7 ) { // Insertion sort when subarray small enough.
for ( j=l+1; j<=ir; j++ ) {
a = arr[j];
test = true;
for (i=j-1; i>=0; i--) {
if ( arr[i] < a ) {
arr[i+1] = a;
test = false;
break;
}
arr[i+1] = arr[i];
}
if ( test ) {
i = l-1;
arr[i+1] = a;
}
}
if ( jstack==0 )
return;
ir = istack[jstack]; // Pop stack and begin a new round of partitioning.
l = istack[jstack-1];
jstack -= 2;
} else {
k = (l+ir)/2; // Choose median of left, center and right elements as partitioning
// element a. Also rearrange so that a(l) < a(l+1) < a(ir).
tmp_a = arr[k];
arr[k] = arr[l+1];
arr[l+1] = tmp_a;
if ( arr[l]>arr[ir] ) {
tmp_a = arr[l];
arr[l] = arr[ir];
arr[ir] = tmp_a;
}
if ( arr[l+1] > arr[ir] ) {
tmp_a = arr[l+1];
arr[l+1] = arr[ir];
arr[ir] = tmp_a;
}
if ( arr[l] > arr[l+1] ) {
tmp_a = arr[l];
arr[l] = arr[l+1];
arr[l+1] = tmp_a;
}
// Scan up to find element > a
j = ir;
a = arr[l+1]; // Partitioning element.
for (i=l+2; i<=ir; i++) {
if ( arr[i]<a )
continue;
while ( arr[j]>a ) // Scan down to find element < a.
j--;
if ( j < i )
break; // Pointers crossed. Exit with partitioning complete.
tmp_a = arr[i]; // Exchange elements of both arrays.
arr[i] = arr[j];
arr[j] = tmp_a;
}
arr[l+1] = arr[j]; // Insert partitioning element in both arrays.
arr[j] = a;
jstack += 2;
// Push pointers to larger subarray on stack, process smaller subarray immediately.
if ( ir-i+1 >= j-l ) {
istack[jstack] = ir;
istack[jstack-1] = i;
ir = j-1;
} else {
istack[jstack] = j-1;
istack[jstack-1] = l;
l = i;
}
}
}
}
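// Illustrative example: the routine sorts in place in ascending order and only
// needs operator< and operator> on T.
//   std::vector<int> v;
//   v.push_back(3); v.push_back(1); v.push_back(2);
//   quicksort(v);   // v is now {1,2,3}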
/************************************************************************
* Function to find the id in a sorted vector *
************************************************************************/
inline bool find_id(const std::vector<ThreadPool::thread_id_t> &x_in, const ThreadPool::thread_id_t &id )
{
if ( x_in.empty() )
return false;
size_t n = x_in.size();
const ThreadPool::thread_id_t *x = &x_in[0]; // Use the pointer for speed
if ( n<4 ) {
for (size_t i=0; i<n; i++) {
if ( x[i] == id )
return true;
}
}
// Check if value is within the range of x
if ( id == x[0] )
return true;
if ( id < x[0] )
return false;
if ( id == x[n-1] )
return true;
if ( id > x[n-1] )
return false;
// Perform the search
size_t lower = 0;
size_t upper = n-1;
size_t index;
while ( (upper-lower) != 1 ) {
index = (upper+lower)/2;
if ( x[index] == id )
return true;
if ( x[index] >= id )
upper = index;
else
lower = index;
}
return false;
}
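// Note: find_id assumes x_in is sorted in ascending order (wait_some sorts the
// ids with quicksort before storing them); very short lists are handled with a
// linear scan before falling through to the binary search.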
/************************************************************************
* Function to add dependencies to the work item *
************************************************************************/
void ThreadPool::WorkItem::add_dependencies( size_t N, const ThreadPool::thread_id_t* ids)
{
if ( d_tpool_index != -1 ) {
// The item has already been added to the threadpool,
// we are not allowed to add dependencies
throw std::logic_error("Cannot add dependency to work item once it has been added the the threadpool");
}
if ( static_cast<size_t>(d_N_ids)+N > 0xFFFF ) {
throw std::logic_error("Cannot add more than 65000 dependencies");
}
for (size_t i=0; i<N; i++) {
if ( !ids[i].finished() ) {
if ( d_N_ids >= d_size ) {
thread_id_t* tmp = d_ids;
unsigned int N2 = d_size;
if ( N2 == 0 ) { N2 = 8; }
while ( N2 <= d_N_ids )
N2 *= 2;
d_ids = new thread_id_t[N2];
for (size_t j=0; j<d_N_ids; j++)
std::swap(d_ids[j],tmp[j]);
delete [] tmp;
d_size = N2;
}
d_ids[d_N_ids] = ids[i];
d_N_ids++;
}
}
}
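// Illustrative sketch (hedged: names are hypothetical): dependencies must be
// attached before the item is handed to the thread pool.
//   // 'item' is a user-defined WorkItem*, 'prev_ids' holds n_prev ids
//   // returned by earlier calls to add_work
//   item->add_dependencies( n_prev, prev_ids );
//   int priority = 0;
//   ThreadPool::thread_id_t new_id;
//   tpool.add_work( 1, &item, &priority, &new_id );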