From a04317d766ccaf20c1948b3aced9e3b2ecbd27e5 Mon Sep 17 00:00:00 2001
From: Mark Berrill <berrillma@ornl.gov>
Date: Wed, 5 Jul 2017 12:08:21 -0400
Subject: [PATCH] Updating threadpool class

---
 common/StackTrace.cpp                | 1698 ++++++++++++++++---
 common/StackTrace.h                  |  187 ++-
 tests/lbpm_color_simulator.h         |   15 +-
 threadpool/atomic_helpers.cpp        |   28 +-
 threadpool/atomic_helpers.h          |  657 +++++---
 threadpool/atomic_list.h             |  185 +++
 threadpool/atomic_list.hpp           |  242 +++
 threadpool/test/CMakeLists.txt       |    1 +
 threadpool/test/test_atomic.cpp      |  250 +--
 threadpool/test/test_atomic_list.cpp |  210 +++
 threadpool/test/test_thread_pool.cpp | 1296 ++++++++-------
 threadpool/thread_pool.cpp           | 2244 ++++++++++----------------
 threadpool/thread_pool.h             |  768 +++++----
 threadpool/thread_pool.hpp           | 1352 ++++------------
 14 files changed, 5020 insertions(+), 4113 deletions(-)
 create mode 100644 threadpool/atomic_list.h
 create mode 100644 threadpool/atomic_list.hpp
 create mode 100644 threadpool/test/test_atomic_list.cpp
 mode change 100755 => 100644 threadpool/test/test_thread_pool.cpp
 mode change 100755 => 100644 threadpool/thread_pool.cpp
 mode change 100755 => 100644 threadpool/thread_pool.h
diff --git a/common/StackTrace.cpp b/common/StackTrace.cpp
index 9e3fdda6..9786644e 100644
--- a/common/StackTrace.cpp
+++ b/common/StackTrace.cpp
@@ -1,126 +1,424 @@
-#include "StackTrace.h"
+#include "common/StackTrace.h"
 
-#include <iostream>
-#include <sstream>
-#include <cstring>
 #include <algorithm>
-#if __cplusplus > 199711L
-    #include <mutex>
-#endif
+#include <csignal>
+#include <cstring>
+#include <iostream>
+#include <set>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <stdexcept>
+#include <thread>
+#include <memory>
+#include <random>
 
 
-// Detect the OS and include system dependent headers
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) || defined(_MSC_VER)
-    // Note: windows has not been testeds
+// Detect the OS
+// clang-format off
+#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) || defined( _MSC_VER )
     #define USE_WINDOWS
-    #include <iostream>
-    #include <windows.h>
-    #include <process.h>
-    #include <stdio.h>   
-    #include <tchar.h>
-    #include <psapi.h>
-    #include <DbgHelp.h>
-    //#pragma comment(lib, psapi.lib) //added
-    //#pragma comment(linker, /DEFAULTLIB:psapi.lib)
-#elif defined(__APPLE__)
+    #define NOMINMAX
+#elif defined( __APPLE__ )
     #define USE_MAC
-    #include <sys/time.h>
-    #include <sys/sysctl.h>
-    #include <signal.h>
-    #include <execinfo.h>
-    #include <dlfcn.h>
-    #include <mach/mach.h>
-    #include <sys/types.h>
-    #include <sys/sysctl.h>
-    #include <unistd.h>
-    #include <sched.h>
-#elif defined(__linux) || defined(__unix) || defined(__posix)
+    #define USE_NM
+#elif defined( __linux ) || defined( __unix ) || defined( __posix )
     #define USE_LINUX
     #define USE_NM
-    #include <sys/time.h>
-    #include <time.h>
-    #include <execinfo.h>
-    #include <dlfcn.h>
-    #include <malloc.h>
-    #include <unistd.h>
-    #include <sched.h>
 #else
     #error Unknown OS
 #endif
+// clang-format on
+
+
+// Include/detect MPI
+// clang-format off
+#ifndef USE_MPI
+    #ifdef USE_EXT_MPI
+        #define USE_MPI
+    #elif defined(__has_include)
+        #if __has_include("mpi.h")
+            #define USE_MPI
+        #endif
+    #endif
+#endif
+#ifdef USE_MPI
+    #include "mpi.h"
+#endif
+// clang-format on
+
+
+// Include system dependent headers
+// clang-format off
+// Detect the OS and include system dependent headers
+#ifdef USE_WINDOWS
+    #include <windows.h>
+    #include <dbghelp.h>
+    #include <DbgHelp.h>
+    #include <TlHelp32.h>
+    #include <Psapi.h>
+    #include <process.h>
+    #include <stdio.h>
+    #include <tchar.h>
+    #pragma comment( lib, "version.lib" ) // for "VerQueryValue"
+#else
+    #include <dlfcn.h>
+    #include <execinfo.h>
+    #include <sched.h>
+    #include <sys/time.h>
+    #include <time.h>
+    #include <unistd.h>
+    #include <sys/syscall.h>
+#endif
+#ifdef USE_MAC
+    #include <mach-o/dyld.h>
+    #include <mach/mach.h>
+    #include <sys/sysctl.h>
+    #include <sys/types.h>
+#endif
+// clang-format on
 
 
 #ifdef __GNUC__
-    #define USE_ABI
-    #include <cxxabi.h>
+#define USE_ABI
+#include <cxxabi.h>
 #endif
 
+
 #ifndef NULL_USE
-    #define NULL_USE(variable) do {                         \
-        if(0) {char *temp = (char *)&variable; temp++;}     \
-    }while(0)
+#define NULL_USE( variable )                 \
+    do {                                     \
+        if ( 0 ) {                           \
+            char *temp = (char *) &variable; \
+            temp++;                          \
+        }                                    \
+    } while ( 0 )
 #endif
 
 
+// Set the callstack signal (parenthesized so the macro expands safely inside larger expressions)
+#ifdef SIGRTMIN
+    #define CALLSTACK_SIG ( SIGRTMIN + 4 )
+#else
+    #define CALLSTACK_SIG SIGUSR1
+    #define SIGRTMIN SIGUSR1
+    #define SIGRTMAX SIGUSR1
+#endif
+
+
+// Utility to break a string into its lines (the '\n' separators are not kept)
+static inline std::vector<std::string> breakString( const std::string& str )
+{
+    std::vector<std::string> lines;
+    const size_t total = str.length();
+    for ( size_t begin = 0; begin < total; ) {
+        // find() yields npos once no newline is left; clamp that to the end of the string
+        const size_t end = std::min( str.find( '\n', begin ), total );
+        lines.emplace_back( str, begin, end - begin );
+        begin = end + 1;
+    }
+    return lines;
+}
+
+
 // Utility to strip the path from a filename
-inline std::string stripPath( const std::string& filename )
+static inline std::string stripPath( const std::string &filename )
 {
-    if ( filename.empty() ) { return std::string(); }
-    int i=0;
-    for (i=(int)filename.size()-1; i>=0&&filename[i]!=47&&filename[i]!=92; i--) {}
-    i = std::max(0,i+1);
-    return filename.substr(i);
+    // An empty name has nothing to strip
+    if ( filename.empty() )
+        return std::string();
+    // Position of the last '/' (ASCII 47) or '\' (ASCII 92), if there is one
+    const size_t sep = filename.find_last_of( "/\\" );
+    if ( sep == std::string::npos )
+        return filename;
+    return filename.substr( sep + 1 );
 }
 
 
 // Inline function to subtract two addresses returning the absolute difference
-inline void* subtractAddress( void* a, void* b ) {
-    return reinterpret_cast<void*>( std::abs(
-        reinterpret_cast<long long int>(a)-reinterpret_cast<long long int>(b) ) );
+static inline void *subtractAddress( void *a, void *b )
+{
+    const long long int delta = reinterpret_cast<long long int>( a ) - reinterpret_cast<long long int>( b );
+    return reinterpret_cast<void *>( std::abs( delta ) );
+}
+
+
+#ifdef USE_WINDOWS
+static BOOL __stdcall readProcMem( HANDLE hProcess,
+                                   DWORD64 qwBaseAddress,
+                                   PVOID lpBuffer,
+                                   DWORD nSize,
+                                   LPDWORD lpNumberOfBytesRead )
+{
+    SIZE_T st = 0; // stays 0 if ReadProcessMemory fails without storing a byte count
+    BOOL bRet = ReadProcessMemory( hProcess, (LPVOID) qwBaseAddress, lpBuffer, nSize, &st );
+    *lpNumberOfBytesRead = (DWORD) st; // narrow SIZE_T to the DWORD the caller's interface expects
+    return bRet;
+}
+static inline std::string getCurrentDirectory()
+{
+    char path[1024] = { 0 }; // zero-filled, so the result is empty if the call stores nothing
+    GetCurrentDirectoryA( sizeof( path ), path );
+    return std::string( path );
+}
+namespace StackTrace {
+BOOL GetModuleListTH32( HANDLE hProcess, DWORD pid );
+BOOL GetModuleListPSAPI( HANDLE hProcess );
+DWORD LoadModule( HANDLE hProcess, LPCSTR img, LPCSTR mod, DWORD64 baseAddr, DWORD size );
+void LoadModules();
+};
+#endif
+
+
+// Functions to copy raw bytes into (copy_in) or out of (copy_out) a buffer; both return the advanced buffer pointer
+static inline char* copy_in( size_t N, const void* data, char *ptr )
+{
+    // memcpy returns its destination, so step that pointer past the N bytes just written
+    return static_cast<char *>( memcpy( ptr, data, N ) ) + N;
+}
+static inline const char* copy_out( size_t N, void* data, const char *ptr )
+{
+    memcpy( data, ptr, N );    // Read the next N bytes of the buffer into data
+    return &ptr[N];            // First byte that has not been consumed yet
+}
+
+
+/****************************************************************************
+*  Utility to call system command and return output                         *
+****************************************************************************/
+#ifdef USE_WINDOWS
+#define popen _popen
+#define pclose _pclose
+#endif
+std::string StackTrace::exec( const std::string& cmd, int& code )
+{
+    code = -1;                       // Reported when the command cannot be started or waited for
+    signal( SIGCHLD, SIG_DFL );     // Clear child exited
+    FILE* pipe = popen(cmd.c_str(), "r");
+    if ( pipe == nullptr )
+        return std::string();
+    std::string result;
+    result.reserve(1024);
+    // Loop on fgets itself (not feof) so that a read error also ends the loop
+    char buffer[256];
+    while ( fgets( buffer, sizeof( buffer ), pipe ) != nullptr )
+        result += buffer;
+    const int status = pclose( pipe );   // -1 if the wait status is unavailable
+    if ( status != -1 )
+        code = WEXITSTATUS(status);
+    return result;
+}
 
 
 /****************************************************************************
 *  stack_info                                                               *
 ****************************************************************************/
+bool StackTrace::stack_info::operator==( const StackTrace::stack_info& rhs ) const
+{
+    // Entries match when the addresses agree, or when both address2 and
+    //    the object name agree
+    const bool same_address = ( address == rhs.address );
+    const bool same_offset  = ( address2 == rhs.address2 ) && ( object == rhs.object );
+    return same_address || same_offset;
+}
+bool StackTrace::stack_info::operator!=( const StackTrace::stack_info& rhs ) const
+{
+    return !( *this == rhs );
+}
 std::string StackTrace::stack_info::print() const
 {
     char tmp[32];
-    sprintf(tmp,"0x%016llx:  ",reinterpret_cast<unsigned long long int>(address));
-    std::string stack(tmp);
-    sprintf(tmp,"%i",line);
-    std::string line_str(tmp);
-    stack += stripPath(object);
-    stack.resize(std::max<size_t>(stack.size(),38),' ');
+    sprintf( tmp, "0x%016llx:  ", reinterpret_cast<unsigned long long int>( address ) ); // address as 16 hex digits
+    std::string stack( tmp );
+    sprintf( tmp, "%i", line );
+    std::string line_str( tmp );
+    stack += stripPath( object ); // object name without its directory
+    stack.resize( std::max<size_t>( stack.size(), 38 ), ' ' ); // pad with spaces to at least 38 chars (never shrinks)
     stack += "  " + function;
-    if ( !filename.empty() && line>0 ) {
-        stack.resize(std::max<size_t>(stack.size(),70),' ');
-        stack += "  " + stripPath(filename) + ":" + line_str;
+    if ( !filename.empty() && line > 0 ) { // file and line both known: append "file:line"
+        stack.resize( std::max<size_t>( stack.size(), 72 ), ' ' ); // pad to at least 72 chars
+        stack += "  " + stripPath( filename ) + ":" + line_str;
     } else if ( !filename.empty() ) {
-        stack.resize(std::max<size_t>(stack.size(),70),' ');
-        stack += "  " + stripPath(filename);
-    } else if ( line>0 ) {
+        stack.resize( std::max<size_t>( stack.size(), 72 ), ' ' ); // pad to at least 72 chars
+        stack += "  " + stripPath( filename ); // file known but no line number
+    } else if ( line > 0 ) { // only the line number is known
         stack += " : " + line_str;
     }
     return stack;
 }
+size_t StackTrace::stack_info::size() const
+{
+    return sizeof(void*[2]) + sizeof(int[4]) + filename.size() + function.size() + object.size(); // bytes written by pack()
+}
+char* StackTrace::stack_info::pack( char* ptr ) const
+{
+    // Fixed-size header (addresses, string lengths, line) followed by the raw string characters
+    const int len_object   = static_cast<int>( object.size() );
+    const int len_function = static_cast<int>( function.size() );
+    const int len_filename = static_cast<int>( filename.size() );
+    ptr = copy_in( sizeof(void*), &address,  ptr );
+    ptr = copy_in( sizeof(void*), &address2, ptr );
+    ptr = copy_in( sizeof(int), &len_object,   ptr );
+    ptr = copy_in( sizeof(int), &len_function, ptr );
+    ptr = copy_in( sizeof(int), &len_filename, ptr );
+    ptr = copy_in( sizeof(int), &line, ptr );
+    ptr = copy_in( len_object,   object.data(),   ptr );
+    ptr = copy_in( len_function, function.data(), ptr );
+    return copy_in( len_filename, filename.data(), ptr );
+}
+const char* StackTrace::stack_info::unpack( const char* ptr )
+{
+    int Nobj = 0, Nfun = 0, Nfile = 0;
+    ptr = copy_out( sizeof(void*), &address,  ptr );
+    ptr = copy_out( sizeof(void*), &address2, ptr );
+    ptr = copy_out( sizeof(int), &Nobj,  ptr );
+    ptr = copy_out( sizeof(int), &Nfun,  ptr );
+    ptr = copy_out( sizeof(int), &Nfile, ptr );
+    ptr = copy_out( sizeof(int), &line,  ptr );
+    object.resize( Nobj );
+    function.resize( Nfun );
+    filename.resize( Nfile );
+    ptr = copy_out( Nobj,  &object[0],   ptr );   // operator[] (unlike front()) is defined for an empty string
+    ptr = copy_out( Nfun,  &function[0], ptr );
+    ptr = copy_out( Nfile, &filename[0], ptr );
+    return ptr;
+}
+std::vector<char> StackTrace::stack_info::packArray( const std::vector<stack_info>& data )
+{
+    // Total bytes: an int holding the entry count, then every packed entry
+    size_t bytes = sizeof(int);
+    for ( const auto& entry : data )
+        bytes += entry.size();
+    std::vector<char> buffer( bytes, 0 );
+    const int count = data.size();
+    char* cursor = copy_in( sizeof(int), &count, buffer.data() );
+    for ( const auto& entry : data )
+        cursor = entry.pack( cursor );
+    return buffer;
+}
+std::vector<StackTrace::stack_info> StackTrace::stack_info::unpackArray( const char* ptr )
+{
+    int count;    // The buffer starts with the number of packed entries
+    const char* cursor = copy_out( sizeof(int), &count, ptr );
+    std::vector<stack_info> entries( count );
+    for ( auto& entry : entries )
+        cursor = entry.unpack( cursor );
+    return entries;
+}
+static std::vector<char> pack( const std::vector<std::vector<StackTrace::stack_info>>& data )
+{
+    // Layout: [N] then, for each of the N stacks, [M] followed by its M packed entries
+    size_t bytes = sizeof(int);
+    for ( const auto& trace : data ) {
+        bytes += sizeof(int);
+        for ( const auto& frame : trace )
+            bytes += frame.size();
+    }
+    std::vector<char> out( bytes, 0 );
+    const int N = data.size();
+    char* cursor = copy_in( sizeof(int), &N, out.data() );
+    for ( const auto& trace : data ) {
+        const int M = trace.size();
+        cursor = copy_in( sizeof(int), &M, cursor );
+        for ( const auto& frame : trace )
+            cursor = frame.pack( cursor );
+    }
+    return out;
+}
+static std::vector<std::vector<StackTrace::stack_info>> unpack( const std::vector<char>& in )
+{
+    // Reads the stack count, then for each stack its entry count and its entries
+    int N;
+    const char* cursor = copy_out( sizeof(int), &N, in.data() );
+    std::vector<std::vector<StackTrace::stack_info>> data( N );
+    for ( auto& trace : data ) {
+        int M;
+        cursor = copy_out( sizeof(int), &M, cursor );
+        trace.resize( M );
+        for ( auto& frame : trace )
+            cursor = frame.unpack( cursor );
+    }
+    return data;
+}
+
+
+/****************************************************************************
+*  multi_stack_info                                                         *
+****************************************************************************/
+/*static int maxDepth( const StackTrace::multi_stack_info& stack )
+{
+    int depth = 0;
+    for ( auto child : stack.children )
+        depth = std::max<int>( depth, maxDepth( child ) );
+    return depth+1;
+}*/
+// Print this node and (recursively) its children, one string per output line;
+//    prefix is prepended to every line that is generated
+std::vector<std::string> StackTrace::multi_stack_info::print( const std::string& prefix ) const
+{
+    std::vector<std::string> text;
+    if ( stack == stack_info() ) {
+        // Empty root node: print the children at this level, keeping the caller's prefix
+        for ( const auto& child : children ) {
+            auto tmp = child.print( prefix );
+            text.insert( text.end(), tmp.begin(), tmp.end() );
+        }
+        return text;
+    }
+    // First line: "<prefix>[count] <stack entry>"
+    text.push_back( prefix + "[" + std::to_string( N ) + "] " + stack.print() );
+    // Children are indented by two extra columns
+    const std::string prefix2 = prefix + "  ";
+    for ( size_t i=0; i<children.size(); i++ ) {
+        const auto tmp = children[i].print( );
+        // Every child except the last gets a '|' on its continuation lines (j>0)
+        const bool connect = children.size()>1 && i<children.size()-1;
+        for ( size_t j=0; j<tmp.size(); j++ ) {
+            std::string child_line = prefix2 + tmp[j];
+            if ( connect && j>0 )
+                child_line[prefix2.size()] = '|';
+            text.push_back( child_line );
+        }
+    }
+    return text;
+}
+void StackTrace::multi_stack_info::add( size_t len, const stack_info *stack )
+{
+    if ( len == 0 )
+        return;
+    // Entries are consumed from the back of the array: stack[len-1] is matched against the children
+    const stack_info& top = stack[len-1];
+    size_t k = 0;
+    while ( k < children.size() && !( children[k].stack == top ) )
+        k++;
+    if ( k == children.size() ) {
+        // No child holds this entry yet: append a fresh one (its count is incremented below)
+        children.resize( k+1 );
+        children[k].N = 0;
+        children[k].stack = top;
+    }
+    children[k].N++;
+    // Recursing with len-1 == 0 returns immediately, which ends the chain
+    children[k].add( len-1, stack );
+}
 
 
 /****************************************************************************
 *  Function to find an entry                                                *
 ****************************************************************************/
 template <class TYPE>
-inline size_t findfirst( const std::vector<TYPE>& X, TYPE Y )
+inline size_t findfirst( const std::vector<TYPE> &X, TYPE Y )
 {
     if ( X.empty() )
         return 0;
     size_t lower = 0;
-    size_t upper = X.size()-1;
+    size_t upper = X.size() - 1;
     if ( X[lower] >= Y )
         return lower;
     if ( X[upper] < Y )
         return upper;
-    while ( (upper-lower) != 1 ) {
-        size_t value = (upper+lower)/2;
+    while ( ( upper - lower ) != 1 ) {
+        size_t value = ( upper + lower ) / 2;
         if ( X[value] >= Y )
             upper = value;
         else
@@ -136,40 +434,46 @@ inline size_t findfirst( const std::vector<TYPE>& X, TYPE Y )
 *    exccessive calls to nm.  This function also uses a lock to ensure      *
 *    thread safety.                                                         *
 ****************************************************************************/
-#if __cplusplus <= 199711L
-    class mutex_class {
-      public:
-        void lock() {}
-        void unlock() {}
-    };
-    mutex_class getSymbols_mutex;
-#else
-    std::mutex getSymbols_mutex;
-#endif
+std::mutex getSymbols_mutex;
 struct global_symbols_struct {
-    std::vector<void*> address;
+    std::vector<void *> address;
     std::vector<char> type;
     std::vector<std::string> obj;
     int error;
 } global_symbols;
-static std::string get_executable()
+std::string StackTrace::getExecutable()
 {
     std::string exe;
-    try { 
-        #ifdef USE_LINUX
-            char *buf = new char[0x10000];
-            int len = ::readlink("/proc/self/exe",buf,0x10000);
-            if ( len!=-1 ) {
-                buf[len] = '\0';
-                exe = std::string(buf);
-            }
-            delete [] buf;
-        #endif
-    } catch (...) {}
+    try {
+#ifdef USE_LINUX
+        char *buf = new char[0x10000];
+        int len   = ::readlink( "/proc/self/exe", buf, 0x10000 - 1 ); // keep one byte for the terminator
+        if ( len != -1 ) {
+            buf[len] = '\0';
+            exe      = std::string( buf );
+        }
+        delete[] buf;
+#elif defined( USE_MAC )
+        uint32_t size = 0x10000;
+        char *buf     = new char[size];
+        memset( buf, 0, size );
+        if ( _NSGetExecutablePath( buf, &size ) == 0 )
+            exe = std::string( buf );
+        delete[] buf;
+#elif defined( USE_WINDOWS )
+        DWORD size = 0x10000;
+        char *buf  = new char[size];
+        memset( buf, 0, size );
+        GetModuleFileNameA( nullptr, buf, size - 1 ); // ANSI variant matches the char buffer; last byte stays 0
+        exe = std::string( buf );
+        delete[] buf;
+#endif
+    } catch ( ... ) {
+    }
     return exe;
 }
-std::string global_exe_name = get_executable();
-static const global_symbols_struct& getSymbols2(  )
+std::string global_exe_name = StackTrace::getExecutable();
+static const global_symbols_struct &getSymbols2()
 {
     static bool loaded = false;
     static global_symbols_struct data;
@@ -178,212 +482,1088 @@ static const global_symbols_struct& getSymbols2(  )
         getSymbols_mutex.lock();
         if ( !loaded ) {
             loaded = true;
-            #ifdef USE_NM
-                try { 
-                    char cmd[1024];
-                    sprintf(cmd,"nm --demangle --numeric-sort %s",global_exe_name.c_str());
-                    FILE *in = popen(cmd,"r");
-                    if ( in==NULL ) {
-                        data.error = -2;
-                        return data;
-                    }
-                    char *buf = new char[0x100000];
-                    while ( fgets(buf,0xFFFFF,in)!=NULL ) {
-                        if ( buf[0]==' ' || buf==NULL )
-                            continue;
-                        char *a = buf;
-                        char *b = strchr(a,' ');  if (b==NULL) {continue;}  b[0] = 0;  b++;
-                        char *c = strchr(b,' ');  if (c==NULL) {continue;}  c[0] = 0;  c++;
-                        char *d = strchr(c,'\n');  if ( d ) { d[0]=0; }
-                        size_t add = strtoul(a,NULL,16);
-                        data.address.push_back( reinterpret_cast<void*>(add) );
-                        data.type.push_back( b[0] );
-                        data.obj.push_back( std::string(c) );
-                    }
-                    pclose(in);
-                    delete [] buf;
-                } catch (...) {
-                    data.error = -3;
+#ifdef USE_NM
+            try {
+                std::string cmd;    // built as a string so a long executable path cannot overflow a fixed buffer
+#ifdef USE_LINUX
+                cmd = "nm -n --demangle " + global_exe_name;
+#elif defined( USE_MAC )
+                cmd = "nm -n " + global_exe_name + " | c++filt";
+#else
+#error Unknown OS using nm
+#endif
+                int code;
+                auto output = breakString( StackTrace::exec( cmd, code ) );
+                for ( auto& line : output ) {
+                    if ( line.empty() )
+                        continue;
+                    if ( line[0] == ' ' )
+                        continue;
+                    char *a = &line[0];    // the line is split in place, so it must be writable
+                    char *b = strchr( a, ' ' );
+                    if ( b == nullptr )
+                        continue;
+                    b[0] = 0;
+                    b++;
+                    char *c = strchr( b, ' ' );
+                    if ( c == nullptr )
+                        continue;
+                    c[0] = 0;
+                    c++;
+                    char *d = strchr( c, '\n' );
+                    if ( d )
+                        d[0]   = 0;
+                    size_t add = strtoul( a, nullptr, 16 );
+                    data.address.push_back( reinterpret_cast<void *>( add ) );
+                    data.type.push_back( b[0] );
+                    data.obj.push_back( std::string( c ) );
                 }
-                data.error = 0;
-            #else
-                data.error = -1;
-            #endif
+                data.error = 0;    // only reached when no exception was thrown
+            } catch ( ... ) {
+                data.error = -3;
+            }
+#else
+            data.error = -1;
+#endif
         }
         getSymbols_mutex.unlock();
     }
     return data;
 }
-int StackTrace::getSymbols( std::vector<void*>& address, std::vector<char>& type, 
-    std::vector<std::string>& obj )
+int StackTrace::getSymbols(
+    std::vector<void *> &address, std::vector<char> &type, std::vector<std::string> &obj )
 {
-    const global_symbols_struct& data = getSymbols2();
-    address = data.address;
-    type = data.type;
-    obj = data.obj;
+    const global_symbols_struct &data = getSymbols2();
+    address                           = data.address;
+    type                              = data.type;
+    obj                               = data.obj;
     return data.error;
 }
 
 
 /****************************************************************************
-*  Function to get the current call stack                                   *
+*  Function to get call stack info                                          *
 ****************************************************************************/
-static void getFileAndLine( StackTrace::stack_info& info )
+#ifdef USE_MAC
+static void *loadAddress( const std::string& object )
 {
-    #if defined(USE_LINUX) || defined(USE_MAC)
-        void *address = info.address;
-        if ( info.object.find(".so")!=std::string::npos )
-            address = info.address2;
-        char buf[4096];
-        sprintf(buf, "addr2line -C -e %s -f -i %lx 2> /dev/null",
-            info.object.c_str(),reinterpret_cast<unsigned long int>(address));
-        FILE* f = popen(buf, "r");
-        if (f == NULL)
+    static std::map<std::string,void*> obj_map;    // image file name -> header address, filled on first use
+    if ( obj_map.empty() ) {
+        uint32_t numImages = _dyld_image_count();
+        for ( uint32_t i = 0; i < numImages; i++ ) {
+            const struct mach_header *header = _dyld_get_image_header( i );
+            const char *name                 = _dyld_get_image_name( i );
+            const char *p                    = strrchr( name, '/' );
+            struct mach_header *address      = const_cast<struct mach_header *>( header );
+            obj_map.insert( std::pair<std::string, void *>( p ? p + 1 : name, address ) ); // no '/' => use the whole name
+            // printf("   module=%s, address=%p\n", p + 1, header);
+        }
+    }
+    auto it       = obj_map.find( object );
+    void *address = nullptr;
+    if ( it != obj_map.end() ) {
+        address = it->second;
+    } else {
+        it = obj_map.find( stripPath( object ) );    // retry with the directory part removed
+        if ( it != obj_map.end() )
+            address = it->second;
+    }
+    // printf("%s: 0x%016llx\n",object.c_str(),address);
+    return address;
+}
+static std::tuple<std::string, std::string, std::string, int> split_atos( const std::string &buf )
+{
+    if ( buf.empty() )
+        return std::tuple<std::string, std::string, std::string, int>();
+    // Get the function
+    size_t index = buf.find( " (in " );
+    if ( index == std::string::npos )
+        return std::make_tuple(    // whole line is the function; callers pass lines without a trailing newline
+            buf, std::string(), std::string(), 0 );
+    std::string fun = buf.substr( 0, index );
+    std::string tmp = buf.substr( index + 5 );
+    // Get the object
+    index           = tmp.find( ')' );
+    std::string obj = tmp.substr( 0, index );
+    tmp             = tmp.substr( index + 1 );
+    // Get the filename and line number
+    size_t p1 = tmp.find( '(' );
+    size_t p2 = tmp.find( ')' );
+    tmp       = tmp.substr( p1 + 1, p2 - p1 - 1 );
+    index     = tmp.find( ':' );
+    std::string file;
+    int line = 0;
+    if ( index != std::string::npos ) {
+        file = tmp.substr( 0, index );
+        line = atoi( tmp.c_str() + index + 1 );    // atoi yields 0 instead of throwing on malformed text
+    } else if ( p1 != std::string::npos ) {
+        file = tmp;
+    }
+    return std::make_tuple( fun, obj, file, line );
+}
+#endif
+#ifdef USE_LINUX
+    typedef uint64_t uint_p;
+#elif defined(USE_MAC)
+    typedef unsigned long uint_p;
+#endif
+#if defined( USE_LINUX ) || defined( USE_MAC )
+// Build "<s1><s2><s3><addr> <addr> ... <s4>"; addresses are widened to unsigned long long to match %llx
+static inline std::string generateCmd( const std::string& s1,
+    const std::string& s2, const std::string& s3,
+    const std::vector<void*>& addresses, const std::string& s4 )
+{
+    std::string cmd = s1 + s2 + s3;
+    for ( void *address : addresses ) {
+        char tmp[32];
+        snprintf( tmp, sizeof( tmp ), "%llx ", static_cast<unsigned long long>( reinterpret_cast<uint_p>( address ) ) );
+        cmd += tmp;
+    }
+    return cmd + s4;
+}
+#endif
+// clang-format off
+static void getFileAndLineObject( std::vector<StackTrace::stack_info*> &info )
+{
+    if ( info.empty() )
+        return;
+    // This gets the file and line numbers for multiple stack lines in the same object
+    #if defined( USE_LINUX )
+        // Create the call command
+        std::vector<void*> address_list(info.size(),nullptr);
+        for (size_t i=0; i<info.size(); i++) {
+            address_list[i] = info[i]->address;
+            if ( info[i]->object.find( ".so" ) != std::string::npos )
+                address_list[i] = info[i]->address2; 
+        }
+        std::string cmd = generateCmd( "addr2line -C -e ", info[0]->object,
+            " -f -i ", address_list, " 2> /dev/null" );
+        // Get the function/line/file
+        int code;
+        auto cmd_output = StackTrace::exec( cmd, code );
+        auto output = breakString( cmd_output );
+        if ( output.size() != 2*info.size() ) // NOTE(review): "-i" may add lines for inlined frames and trip this check - confirm
             return;
-        buf[4095] = 0;
-        // get function name
-        char *rtn = fgets(buf,4095,f);
-        if ( info.function.empty() && rtn==buf ) {
-            info.function = std::string(buf);
-            info.function.resize(std::max<size_t>(info.function.size(),1)-1);
+        // Add the results to info
+        for (size_t i=0; i<info.size(); i++) {
+            // get function name
+            if ( info[i]->function.empty() )
+                info[i]->function = output[2*i+0];
+            // get file and line
+            const std::string &loc = output[2*i+1];
+            if ( !loc.empty() && loc[0] != '?' ) {
+                // Split "file:line" at the first ':' without reading past the end of the text
+                const size_t j = loc.find( ':' );
+                info[i]->filename = loc.substr( 0, j );
+                if ( j != std::string::npos )
+                    info[i]->line = atoi( loc.c_str() + j + 1 );
+            }
         }
-        // get file and line
-        rtn = fgets(buf,4095,f);
-        if ( buf[0]!='?' && buf[0]!=0 && rtn==buf ) {
-            size_t i = 0;
-            for (i=0; i<4095 && buf[i]!=':'; i++) { }
-            info.filename = std::string(buf,i);
-            info.line = atoi(&buf[i+1]);
+    #elif defined( USE_MAC )
+        // Create the call command
+        void* load_address = loadAddress( info[0]->object );
+        if ( load_address == nullptr )
+            return;
+        std::vector<void*> address_list(info.size(),nullptr);
+        for (size_t i=0; i<info.size(); i++)
+            address_list[i] = info[i]->address;
+        // Call atos to get the object info
+        char tmp[64];
+        sprintf( tmp, " -l %lx ", (uint_p) load_address );
+        std::string cmd = generateCmd( "atos -o ", info[0]->object,
+            tmp, address_list, " 2> /dev/null" );
+        // Get the function/line/file
+        int code;
+        auto cmd_output = StackTrace::exec( cmd, code );
+        auto output = breakString( cmd_output );
+        if ( output.size() != info.size() )
+            return;
+        // Parse the output for function, file and line info
+        for ( size_t i=0; i<info.size(); i++) {
+            auto data = split_atos( output[i] );
+            if ( info[i]->function.empty() )
+                info[i]->function = std::get<0>(data);
+            if ( info[i]->object.empty() )
+                info[i]->object = std::get<1>(data);
+            if ( info[i]->filename.empty() )
+                info[i]->filename = std::get<2>(data);
+            if ( info[i]->line==0 )
+                info[i]->line = std::get<3>(data);
         }
-        pclose(f);
     #endif
 }
-
-// Try to use the global symbols to decode info about the stack
-static void getDataFromGlobalSymbols( StackTrace::stack_info& info )
+static void getFileAndLine( std::vector<StackTrace::stack_info> &info )
 {
-    const global_symbols_struct& data = getSymbols2();
-    if ( data.error==0 ) {
-        size_t index = findfirst(global_symbols.address,info.address);
+    // Group pointers to the stack entries by the object they belong to, so that
+    //    each object is handled by a single call below
+    std::map<std::string,std::vector<StackTrace::stack_info*>> by_object;
+    for ( auto& item : info )
+        by_object[item.object].push_back( &item );
+    // Resolve the file/line information one object at a time
+    for ( auto it = by_object.begin(); it != by_object.end(); ++it ) {
+        getFileAndLineObject( it->second );
+    }
+}
+// Try to use the global symbols to decode info about the stack
+static void getDataFromGlobalSymbols( StackTrace::stack_info &info )
+{
+    const global_symbols_struct &data = getSymbols2();
+    if ( data.error == 0 ) {
+        size_t index = findfirst( data.address, info.address );    // search the table that getSymbols2 returned
         if ( index > 0 )
-            info.object = global_symbols.obj[index-1];
+            info.object = data.obj[index - 1];
         else
             info.object = global_exe_name;
     }
 }
-StackTrace::stack_info StackTrace::getStackInfo( void* address )
+static void signal_handler( int sig ) // installed for SIGINT by getStackInfo while it gathers stack info
 {
-    StackTrace::stack_info info;
-    info.address = address;
-    #ifdef _GNU_SOURCE
-        Dl_info dlinfo;
-        if ( !dladdr(address, &dlinfo) ) {
-            getDataFromGlobalSymbols( info );
-            getFileAndLine(info);
-            return info;
-        }
-        info.address2 = subtractAddress(info.address,dlinfo.dli_fbase);
-        info.object = std::string(dlinfo.dli_fname);
-        #if defined(USE_ABI)
-            int status;
-            char *demangled = abi::__cxa_demangle(dlinfo.dli_sname,NULL,0,&status);
-            if ( status == 0 && demangled!=NULL ) {
-                info.function = std::string(demangled);
-            } else if ( dlinfo.dli_sname!=NULL ) {
-                info.function = std::string(dlinfo.dli_sname);
+    printf("Signal caught acquiring stack (%i)\n",sig); // NOTE(review): printf is not async-signal-safe - confirm this is acceptable
+    StackTrace::setErrorHandlers( [](std::string,StackTrace::terminateType) { exit( -1 ); } ); // registers a callback that calls exit(-1)
+}
+StackTrace::stack_info StackTrace::getStackInfo( void *address )
+{   // Single-address convenience wrapper around the vector overload
+    return getStackInfo( std::vector<void*>{ address } ).front();
+}
+std::vector<StackTrace::stack_info> StackTrace::getStackInfo( const std::vector<void*>& address ) // element i of the result describes address[i]
+{
+    // Temporarily handle signals to prevent recursion on the stack
+    auto prev_handler = signal( SIGINT, signal_handler );
+    // Get the detailed stack info
+    std::vector<StackTrace::stack_info> info(address.size());
+    try {
+        #ifdef USE_WINDOWS
+            IMAGEHLP_SYMBOL64 pSym[1024]; // NOTE(review): presumably oversized so the symbol name can extend past the first struct - confirm
+            memset( pSym, 0, sizeof( pSym ) );
+            pSym->SizeOfStruct  = sizeof( IMAGEHLP_SYMBOL64 );
+            pSym->MaxNameLength = 1024;
+
+            IMAGEHLP_MODULE64 Module;
+            memset( &Module, 0, sizeof( Module ) );
+            Module.SizeOfStruct = sizeof( Module );
+
+            HANDLE pid = GetCurrentProcess();
+
+            for (size_t i=0; i<address.size(); i++) {
+                info[i].address = address[i];
+                DWORD64 address2 = reinterpret_cast<DWORD64>( address[i] );
+                DWORD64 offsetFromSymbol;
+                if ( SymGetSymFromAddr( pid, address2, &offsetFromSymbol, pSym ) != FALSE ) {
+                    char name[8192]={0};
+                    DWORD rtn = UnDecorateSymbolName( pSym->Name, name, sizeof(name)-1, UNDNAME_COMPLETE );
+                    if ( rtn == 0 ) // undecorating failed: keep the raw symbol name
+                        info[i].function = std::string(pSym->Name);
+                    else
+                        info[i].function = std::string(name);
+                } else {
+                    printf( "ERROR: SymGetSymFromAddr (%d,%p)\n", GetLastError(), address2 ); // NOTE(review): %p receives a DWORD64, not a pointer
+                }
+
+                // Get line number
+                IMAGEHLP_LINE64 Line;
+                memset( &Line, 0, sizeof( Line ) );
+                Line.SizeOfStruct = sizeof( Line );
+                DWORD offsetFromLine;
+                if ( SymGetLineFromAddr64( pid, address2, &offsetFromLine, &Line ) != FALSE ) {
+                    info[i].line     = Line.LineNumber;
+                    info[i].filename = std::string( Line.FileName );
+                } else {
+                    info[i].line     = 0;
+                    info[i].filename = std::string();
+                }
+
+                // Get the object
+                if ( SymGetModuleInfo64( pid, address2, &Module ) != FALSE ) {
+                    //info[i].object = std::string( Module.ModuleName );
+                    info[i].object = std::string( Module.LoadedImageName );
+                    //info[i].baseOfImage = Module.BaseOfImage;
+                }
             }
-            free(demangled);
         #else
-            if ( dlinfo.dli_sname!=NULL )
-                info.function = std::string(dlinfo.dli_sname);
+            for (size_t i=0; i<address.size(); i++) {
+                info[i].address = address[i];
+                #if defined(_GNU_SOURCE) || defined(USE_MAC)
+                    Dl_info dlinfo;
+                    if ( !dladdr( info[i].address, &dlinfo ) ) { // dladdr failed: fall back to the global symbol table
+                        getDataFromGlobalSymbols( info[i] );
+                        continue;
+                    }
+                    info[i].address2 = subtractAddress( info[i].address, dlinfo.dli_fbase );
+                    info[i].object   = std::string( dlinfo.dli_fname );
+                    #if defined( USE_ABI )
+                        int status;
+                        char *demangled = abi::__cxa_demangle( dlinfo.dli_sname, nullptr, nullptr, &status );
+                        if ( status == 0 && demangled != nullptr ) {
+                            info[i].function = std::string( demangled );
+                        } else if ( dlinfo.dli_sname != nullptr ) { // demangling failed: keep the mangled name
+                            info[i].function = std::string( dlinfo.dli_sname );
+                        }
+                        free( demangled );
+                    #else
+                        if ( dlinfo.dli_sname != NULL )
+                            info[i].function = std::string( dlinfo.dli_sname );
+                    #endif
+                #else
+                    getDataFromGlobalSymbols( info[i] );
+                #endif
+            }
+            // Get the filename / line numbers for each item on the stack
+            getFileAndLine( info );
         #endif
-    #else
-        getDataFromGlobalSymbols( info );
-    #endif
-    // Get the filename / line number
-    getFileAndLine(info);
+    } catch ( ... ) { // best effort: exceptions are swallowed and whatever was decoded so far is returned
+    }
+    signal( SIGINT, prev_handler ) ; // restore the SIGINT handler that was active on entry
     return info;
 }
 
-std::vector<StackTrace::stack_info>  StackTrace::getCallStack()
+
+/****************************************************************************
+*  Function to get the backtrace                                            *
+****************************************************************************/
+#if defined( USE_LINUX ) || defined( USE_MAC )
+static std::vector<void*> thread_backtrace; // trace recorded by _callstack_signal_handler
+static bool thread_backtrace_finished; // set by the signal handlers, polled by the requesting thread; NOTE(review): plain bool, not atomic
+static std::mutex thread_backtrace_mutex; // held by the requesting thread around each signal/poll sequence
+static void _callstack_signal_handler( int, siginfo_t*, void* ) // runs on the signalled thread
 {
-    std::vector<StackTrace::stack_info>  stack_list;
-    #if defined(USE_LINUX) || defined(USE_MAC)
+    thread_backtrace = StackTrace::backtrace( ); // backtrace of the thread executing this handler
+    thread_backtrace_finished = true;
+}
+#endif
+std::vector<void*> StackTrace::backtrace( std::thread::native_handle_type tid ) // raw stack addresses of thread tid
+{
+    std::vector<void*> trace;
+    #if defined( USE_LINUX ) || defined( USE_MAC )
         // Get the trace
-        void *trace[100];
-        memset(trace,0,100*sizeof(void*));
-        int trace_size = backtrace(trace,100);
-        stack_list.reserve(trace_size);
-        for (int i=0; i<trace_size; ++i)
-            stack_list.push_back(getStackInfo(trace[i]));
-    #elif defined(USE_WINDOWS)
-        #ifdef DBGHELP
-            ::CONTEXT lContext;
-            ::ZeroMemory( &lContext, sizeof( ::CONTEXT ) );
-            ::RtlCaptureContext( &lContext );
-            ::STACKFRAME64 lFrameStack;
-            ::ZeroMemory( &lFrameStack, sizeof( ::STACKFRAME64 ) );
-            lFrameStack.AddrPC.Offset = lContext.Rip;
-            lFrameStack.AddrFrame.Offset = lContext.Rbp;
-            lFrameStack.AddrStack.Offset = lContext.Rsp;
-            lFrameStack.AddrPC.Mode = lFrameStack.AddrFrame.Mode = lFrameStack.AddrStack.Mode = AddrModeFlat;
-            #ifdef _M_IX86
-                DWORD MachineType = IMAGE_FILE_MACHINE_I386;
-            #endif
-            #ifdef _M_X64
-                DWORD MachineType = IMAGE_FILE_MACHINE_AMD64;
-            #endif
-            #ifdef _M_IA64
-                DWORD MachineType = IMAGE_FILE_MACHINE_IA64;
-            #endif
-            while ( 1 ) {
-                int rtn = ::StackWalk64( MachineType, ::GetCurrentProcess(), ::GetCurrentThread(), 
-                    &lFrameStack, MachineType == IMAGE_FILE_MACHINE_I386 ? 0 : &lContext,
-                    NULL, &::SymFunctionTableAccess64, &::SymGetModuleBase64, NULL );
-                if( !rtn )
-                    break;
-                if( lFrameStack.AddrPC.Offset == 0 )
-                    break;
-                ::MEMORY_BASIC_INFORMATION lInfoMemory;
-                ::VirtualQuery( ( ::PVOID )lFrameStack.AddrPC.Offset, &lInfoMemory, sizeof( lInfoMemory ) );
-                if ( lInfoMemory.Type==MEM_PRIVATE )
-                    continue;
-                ::DWORD64 lBaseAllocation = reinterpret_cast< ::DWORD64 >( lInfoMemory.AllocationBase );
-                ::TCHAR lNameModule[ 1024 ];
-                ::HMODULE hBaseAllocation = reinterpret_cast< ::HMODULE >( lBaseAllocation );
-                ::GetModuleFileName( hBaseAllocation, lNameModule, 1024 );
-                PIMAGE_DOS_HEADER lHeaderDOS = reinterpret_cast<PIMAGE_DOS_HEADER>( lBaseAllocation );
-                if ( lHeaderDOS==NULL )
-                    continue;
-                PIMAGE_NT_HEADERS lHeaderNT = reinterpret_cast<PIMAGE_NT_HEADERS>( lBaseAllocation + lHeaderDOS->e_lfanew );
-                PIMAGE_SECTION_HEADER lHeaderSection = IMAGE_FIRST_SECTION( lHeaderNT );
-                ::DWORD64 lRVA = lFrameStack.AddrPC.Offset - lBaseAllocation;
-                ::DWORD64 lNumberSection = ::DWORD64();
-                ::DWORD64 lOffsetSection = ::DWORD64();
-                for( int lCnt = ::DWORD64(); lCnt < lHeaderNT->FileHeader.NumberOfSections; lCnt++, lHeaderSection++ ) {
-                    ::DWORD64 lSectionBase = lHeaderSection->VirtualAddress;
-                    ::DWORD64 lSectionEnd = lSectionBase + std::max<::DWORD64>(
-                        lHeaderSection->SizeOfRawData, lHeaderSection->Misc.VirtualSize );
-                    if( ( lRVA >= lSectionBase ) && ( lRVA <= lSectionEnd ) ) {
-                        lNumberSection = lCnt + 1;
-                        lOffsetSection = lRVA - lSectionBase;
-                        //break;
-                    }
-                }
-                StackTrace::stack_info info;
-                info.object = lNameModule;
-                info.address = reinterpret_cast<void*>(lRVA);
-                char tmp[20];
-                sprintf(tmp,"0x%016llx",static_cast<unsigned long long int>(lOffsetSection));
-                info.function = std::to_string(lNumberSection) + ":" + std::string(tmp);
-                stack_list.push_back(info);
+        if ( tid == pthread_self() ) { // calling thread: walk our own stack directly
+            trace.resize(1000,nullptr); // at most 1000 frames are captured
+            int trace_size = ::backtrace( trace.data(), trace.size() );
+            trace.resize (trace_size );
+        } else {
+            // Note: this will get the backtrace, but terminates the thread in the process!!!
+            thread_backtrace_mutex.lock();
+            struct sigaction sa;
+            sigfillset(&sa.sa_mask);
+            sa.sa_flags = SA_SIGINFO;
+            sa.sa_sigaction = _callstack_signal_handler;
+            sigaction(CALLSTACK_SIG, &sa, NULL); // install the handler that records the signalled thread's stack
+            thread_backtrace_finished = false;
+            pthread_kill( tid, CALLSTACK_SIG ); // ask thread tid to run the handler
+            auto t1 = std::chrono::high_resolution_clock::now();
+            auto t2 = std::chrono::high_resolution_clock::now();
+            while ( !thread_backtrace_finished && std::chrono::duration<double>(t2-t1).count()<0.1 ) { // poll until the handler finishes or 0.1 s elapse
+                std::this_thread::yield();
+                t2 = std::chrono::high_resolution_clock::now();
             }
+            std::swap( trace, thread_backtrace ); // take whatever the handler recorded
+            thread_backtrace_finished = false;
+            thread_backtrace_mutex.unlock();
+        }
+    #elif defined( USE_WINDOWS )
+        #if defined(DBGHELP)
+
+            // Load the modules for the stack trace
+            LoadModules();
+
+            // Initialize stackframe for first call
+            ::CONTEXT context;
+            memset( &context, 0, sizeof( context ) );
+            context.ContextFlags = CONTEXT_FULL;
+            RtlCaptureContext( &context ); // NOTE(review): captures the calling thread's context even when tid is another thread - confirm
+            STACKFRAME64 frame; // in/out stackframe
+            memset( &frame, 0, sizeof( frame ) );
+            #ifdef _M_IX86
+                DWORD imageType = IMAGE_FILE_MACHINE_I386;
+                frame.AddrPC.Offset    = context.Eip;
+                frame.AddrPC.Mode      = AddrModeFlat;
+                frame.AddrFrame.Offset = context.Ebp;
+                frame.AddrFrame.Mode   = AddrModeFlat;
+                frame.AddrStack.Offset = context.Esp;
+                frame.AddrStack.Mode   = AddrModeFlat;
+            #elif _M_X64
+                DWORD imageType = IMAGE_FILE_MACHINE_AMD64;
+                frame.AddrPC.Offset    = context.Rip;
+                frame.AddrPC.Mode      = AddrModeFlat;
+                frame.AddrFrame.Offset = context.Rsp; // NOTE(review): frame address uses Rsp (the removed code used Rbp) - confirm intended
+                frame.AddrFrame.Mode   = AddrModeFlat;
+                frame.AddrStack.Offset = context.Rsp;
+                frame.AddrStack.Mode   = AddrModeFlat;
+            #elif _M_IA64
+                DWORD imageType = IMAGE_FILE_MACHINE_IA64;
+                frame.AddrPC.Offset     = context.StIIP;
+                frame.AddrPC.Mode       = AddrModeFlat;
+                frame.AddrFrame.Offset  = context.IntSp;
+                frame.AddrFrame.Mode    = AddrModeFlat;
+                frame.AddrBStore.Offset = context.RsBSP;
+                frame.AddrBStore.Mode   = AddrModeFlat;
+                frame.AddrStack.Offset  = context.IntSp;
+                frame.AddrStack.Mode    = AddrModeFlat;
+            #else
+                #error "Platform not supported!"
+            #endif
+
+            trace.reserve( 1000 );
+            auto pid = GetCurrentProcess();
+            for ( int frameNum = 0; frameNum<1024; ++frameNum ) { // walk at most 1024 frames
+                BOOL rtn = StackWalk64( imageType, pid, tid, &frame, &context, readProcMem,
+                                        SymFunctionTableAccess, SymGetModuleBase64, NULL );
+                if ( !rtn ) {
+                    printf( "ERROR: StackWalk64 (%p)\n", frame.AddrPC.Offset ); // NOTE(review): %p receives a DWORD64, not a pointer
+                    break;
+                }
+
+                if ( frame.AddrPC.Offset != 0 )
+                    trace.push_back( reinterpret_cast<void*>( frame.AddrPC.Offset ) );
+
+                if ( frame.AddrReturn.Offset == 0 ) // no return address: end of the stack
+                    break;
+            }
+            SetLastError( ERROR_SUCCESS );
         #endif
     #else
         #warning Stack trace is not supported on this compiler/OS
     #endif
-    return stack_list;
+    return trace;
+}
+std::vector<void*> StackTrace::backtrace()
+{
+    // Raw backtrace of the calling thread
+    return backtrace( thisThread() );
+}
+std::vector<std::vector<void *>> StackTrace::backtraceAll()
+{
+    // Handles of the threads reported by activeThreads()
+    const auto handles = activeThreads();
+    // One raw backtrace per handle, in the iteration order of the set
+    std::vector<std::vector<void *>> traces;
+    for ( const auto &handle : handles )
+        traces.push_back( backtrace( handle ) );
+    return traces;
+}
+
+
+/****************************************************************************
+*  Function to get the list of all active threads                           *
+****************************************************************************/
+#if defined( USE_LINUX )
+static std::thread::native_handle_type thread_handle;   // written by the handler below, read by activeThreads
+static void _activeThreads_signal_handler( int )
+{
+    // Runs on the signalled thread: publish its handle, then flag completion
+    thread_handle = StackTrace::thisThread();
+    thread_backtrace_finished = true;
+}
+static inline int get_tid( int pid, const std::string& line )
+{
+    // Parses one line of "ps -T -p <pid>" output, whose first two blank-separated
+    // columns are the process id and the thread id.  Returns the thread id when
+    // the first column equals pid and -1 otherwise.  Tokens are converted with
+    // atoi, so a non-numeric token (e.g. in the header line) reads as 0.
+    size_t pos = 0;
+    auto next_int = [&line, &pos]() {
+        // Skip leading blanks, then take everything up to the next blank
+        pos = std::min( line.find_first_not_of( ' ', pos ), line.size() );
+        const size_t end = std::min( line.find( ' ', pos ), line.size() );
+        // A std::string token is always properly terminated, whatever its length
+        const std::string token = line.substr( pos, end - pos );
+        pos = end;
+        return atoi( token.c_str() );
+    };
+    // First column: process id, second column: thread id
+    if ( next_int() != pid )
+        return -1;
+    return next_int();
+}
+#endif
+std::thread::native_handle_type StackTrace::thisThread( ) // native handle of the calling thread
+{
+    #if defined( USE_LINUX ) || defined( USE_MAC )
+        return pthread_self();
+    #elif defined( USE_WINDOWS )
+        return GetCurrentThread(); // NOTE(review): presumably a pseudo-handle that is only meaningful on the calling thread - confirm against the Win32 docs
+    #else
+        #warning Stack trace is not supported on this compiler/OS
+        return std::thread::native_handle_type(); // value-initialized placeholder on unsupported platforms
+    #endif
+}
+std::set<std::thread::native_handle_type> StackTrace::activeThreads( )
+{
+    // Returns the native handles of the threads of this process; the calling
+    // thread is always included, other threads are found on a best-effort basis
+    std::set<std::thread::native_handle_type> threads;
+    #if defined( USE_LINUX )
+        std::set<int> tid;              // Thread ids parsed from the "ps -T" output
+        int pid = getpid();
+        char cmd[128];
+        sprintf( cmd, "ps -T -p %i", pid );
+        signal( SIGCHLD, SIG_DFL );     // Clear child exited
+        int code;
+        auto output = breakString( exec( cmd, code ) );
+        for ( const auto& line : output ) {
+            int tid2 = get_tid( pid, line );
+            if ( tid2 != -1 )
+                tid.insert( tid2 );
+        }
+        tid.erase( syscall(SYS_gettid) );   // The calling thread is added at the end
+        signal( CALLSTACK_SIG, _activeThreads_signal_handler );
+        for ( auto tid2 : tid ) {
+            std::lock_guard<std::mutex> lock( thread_backtrace_mutex );
+            thread_backtrace_finished = false;
+            thread_handle = thisThread();   // Value that is inserted if the thread never answers
+            syscall( SYS_tgkill, pid, tid2, CALLSTACK_SIG );   // The handler stores the thread's handle
+            const auto t1 = std::chrono::high_resolution_clock::now();   // Wait at most 0.1 s
+            while ( !thread_backtrace_finished &&
+                    std::chrono::duration<double>( std::chrono::high_resolution_clock::now() - t1 ).count() < 0.1 )
+                std::this_thread::yield();
+            threads.insert( thread_handle );
+        }
+    #elif defined( USE_MAC )
+        printf("activeThreads not finished\n");
+    #elif defined( USE_WINDOWS )
+        // Open a handle to every other thread owned by this process; the set stores
+        // handles (as used by backtrace), so the thread ids cannot be inserted directly
+        const DWORD dwOwnerPID = GetCurrentProcessId();
+        HANDLE hThreadSnap = CreateToolhelp32Snapshot( TH32CS_SNAPTHREAD, 0 );
+        if ( hThreadSnap != INVALID_HANDLE_VALUE ) {
+            THREADENTRY32 te32;
+            te32.dwSize = sizeof( THREADENTRY32 );   // Must be set before Thread32First
+            BOOL ok = Thread32First( hThreadSnap, &te32 );
+            if ( !ok )
+                printf( "ERROR: Thread32First (%lu)\n", static_cast<unsigned long>( GetLastError() ) );
+            for ( ; ok; ok = Thread32Next( hThreadSnap, &te32 ) ) {   // Walks all threads in the system
+                if ( te32.th32OwnerProcessID != dwOwnerPID || te32.th32ThreadID == GetCurrentThreadId() )
+                    continue;
+                HANDLE hThread = OpenThread( THREAD_ALL_ACCESS, FALSE, te32.th32ThreadID );
+                if ( hThread != nullptr )
+                    threads.insert( hThread );   // Left open: callers use the handle afterwards
+            }
+            CloseHandle( hThreadSnap );   // Must clean up the snapshot object!
+        }
+    #else
+        #warning activeThreads is not yet supported on this compiler/OS
+    #endif
+    threads.insert( thisThread() );
+    return threads;
+}
+// clang-format on
+
+
+/****************************************************************************
+*  Function to get the current call stack                                   *
+****************************************************************************/
+std::vector<StackTrace::stack_info> StackTrace::getCallStack()
+{
+    // Capture the raw addresses of the calling thread's stack and decode
+    // them with getStackInfo
+    return getStackInfo( StackTrace::backtrace() );
+}
+std::vector<StackTrace::stack_info> StackTrace::getCallStack( std::thread::native_handle_type id )
+{
+    // Capture the raw addresses of the stack of thread id and decode them
+    // with getStackInfo
+    return getStackInfo( StackTrace::backtrace( id ) );
+}
+static StackTrace::multi_stack_info generateMultiStack( const std::vector<std::vector<void*>>& thread_backtrace )
+{
+    // Collect the distinct addresses over all threads so that each one is
+    // decoded only once
+    std::set<void*> unique_ptrs;
+    for ( const auto &thread_trace : thread_backtrace )
+        unique_ptrs.insert( thread_trace.begin(), thread_trace.end() );
+    const std::vector<void*> ptr_list( unique_ptrs.begin(), unique_ptrs.end() );
+    const auto decoded = StackTrace::getStackInfo( ptr_list );
+    // Index the decoded records by address
+    std::map<void*,StackTrace::stack_info> lookup;
+    for ( size_t k = 0; k < ptr_list.size(); ++k )
+        lookup.insert( std::make_pair( ptr_list[k], decoded[k] ) );
+    // Assemble the per-thread stacks and add them to the result
+    StackTrace::multi_stack_info result;
+    for ( const auto &thread_trace : thread_backtrace ) {
+        // Threads without a trace contribute nothing
+        if ( thread_trace.empty() )
+            continue;
+        std::vector<StackTrace::stack_info> frames;
+        frames.reserve( thread_trace.size() );
+        for ( auto ptr : thread_trace )
+            frames.push_back( lookup[ptr] );
+        result.add( frames.size(), frames.data() );
+    }
+    return result;
+}
+StackTrace::multi_stack_info StackTrace::getAllCallStacks( )
+{
+    // Take a raw backtrace of every active thread ...
+    const auto traces = backtraceAll();
+    // ... and combine the decoded stacks into one multi-stack structure
+    // (generateMultiStack ignores threads whose trace is empty)
+    return generateMultiStack( traces );
 }
 
 
 
+/****************************************************************************
+*  Function to get system search paths                                      *
+****************************************************************************/
+std::string StackTrace::getSymPaths()
+{
+    // Builds the ';'-separated symbol search path that LoadModules passes to
+    // SymInitialize.  Only filled in on Windows; empty on other platforms.
+    std::string paths;
+#ifdef USE_WINDOWS
+    char temp[1024];
+    // Reads an environment variable into temp; true if a non-zero length was reported
+    auto getEnv = [&temp]( const char *name ) {
+        memset( temp, 0, sizeof( temp ) );
+        return GetEnvironmentVariableA( name, temp, sizeof( temp ) - 1 ) > 0;
+    };
+    // Create the path list (separated by ';' )
+    paths = std::string( ".;" );
+    paths.reserve( 1000 );
+    // Add the current directory
+    paths += getCurrentDirectory() + ";";
+    // Now add the path for the main-module:
+    memset( temp, 0, sizeof( temp ) );
+    if ( GetModuleFileNameA( nullptr, temp, sizeof( temp ) - 1 ) > 0 ) {
+        for ( char *p = ( temp + strlen( temp ) - 1 ); p >= temp; --p ) {
+            // locate the rightmost path separator
+            if ( ( *p == '\\' ) || ( *p == '/' ) || ( *p == ':' ) ) {
+                *p = 0;
+                break;
+            }
+        }
+        if ( strlen( temp ) > 0 )
+            paths += std::string( temp ) + ";";
+    }
+    // Add the symbol paths configured through the environment
+    if ( getEnv( "_NT_SYMBOL_PATH" ) )
+        paths += std::string( temp ) + ";";
+    if ( getEnv( "_NT_ALTERNATE_SYMBOL_PATH" ) )
+        paths += std::string( temp ) + ";";
+    if ( getEnv( "SYSTEMROOT" ) ) {
+        paths += temp;
+        paths += ";";
+        // also add the "system32"-directory:
+        paths += temp;
+        paths += "\\system32;";
+    }
+    // Symbol server with its cache in <SYSTEMDRIVE>\websymbols.  The whole
+    // "SRV*<cache>*<url>" entry is one path element, so no ';' may follow "SRV*"
+    if ( getEnv( "SYSTEMDRIVE" ) ) {
+        paths += "SRV*" + std::string( temp ) +
+                 "\\websymbols*http://msdl.microsoft.com/download/symbols;";
+    } else {
+        paths += "SRV*c:\\websymbols*http://msdl.microsoft.com/download/symbols;";
+    }
+#endif
+    return paths;
+}
+
+
+/****************************************************************************
+*  Load modules for windows                                                 *
+****************************************************************************/
+#ifdef USE_WINDOWS
+BOOL StackTrace::GetModuleListTH32( HANDLE hProcess, DWORD pid ) // loads symbols for the modules of process pid via ToolHelp32; TRUE if at least one module was enumerated
+{
+    // CreateToolhelp32Snapshot()
+    typedef HANDLE( __stdcall * tCT32S )( DWORD dwFlags, DWORD th32ProcessID );
+    // Module32First()
+    typedef BOOL( __stdcall * tM32F )( HANDLE hSnapshot, LPMODULEENTRY32 lpme );
+    // Module32Next()
+    typedef BOOL( __stdcall * tM32N )( HANDLE hSnapshot, LPMODULEENTRY32 lpme );
+
+    // try both dlls...
+    const TCHAR *dllname[] = { _T("kernel32.dll"), _T("tlhelp32.dll") };
+    HINSTANCE hToolhelp    = nullptr;
+    tCT32S pCT32S          = nullptr;
+    tM32F pM32F            = nullptr;
+    tM32N pM32N            = nullptr;
+
+    HANDLE hSnap;
+    MODULEENTRY32 me;
+    me.dwSize = sizeof( me );
+
+    for ( size_t i = 0; i < ( sizeof( dllname ) / sizeof( dllname[0] ) ); i++ ) {
+        hToolhelp = LoadLibrary( dllname[i] );
+        if ( hToolhelp == nullptr )
+            continue;
+        pCT32S = (tCT32S) GetProcAddress( hToolhelp, "CreateToolhelp32Snapshot" );
+        pM32F  = (tM32F) GetProcAddress( hToolhelp, "Module32First" );
+        pM32N  = (tM32N) GetProcAddress( hToolhelp, "Module32Next" );
+        if ( ( pCT32S != nullptr ) && ( pM32F != nullptr ) && ( pM32N != nullptr ) )
+            break; // found the functions!
+        FreeLibrary( hToolhelp ); // this dll lacks one of the functions: release it and try the next
+        hToolhelp = nullptr;
+    }
+
+    if ( hToolhelp == nullptr ) // neither dll provided all three functions
+        return FALSE;
+
+    hSnap = pCT32S( TH32CS_SNAPMODULE, pid );
+    if ( hSnap == (HANDLE) -1 ) {
+        FreeLibrary( hToolhelp );
+        return FALSE;
+    }
+
+    bool keepGoing = !!pM32F( hSnap, &me );
+    int cnt        = 0;
+    while ( keepGoing ) {
+        LoadModule( hProcess, me.szExePath, me.szModule, (DWORD64) me.modBaseAddr, me.modBaseSize ); // result ignored; NOTE(review): the fields are TCHAR while LoadModule takes LPCSTR - confirm for UNICODE builds
+        cnt++; // counts enumerated modules, whether or not LoadModule succeeded
+        keepGoing = !!pM32N( hSnap, &me );
+    }
+    CloseHandle( hSnap );
+    FreeLibrary( hToolhelp );
+    if ( cnt <= 0 )
+        return FALSE;
+    return TRUE;
+}
+DWORD StackTrace::LoadModule(
+    HANDLE hProcess, LPCSTR img, LPCSTR mod, DWORD64 baseAddr, DWORD size )
+{
+    // Registers one module with the symbol handler (SymLoadModule).  Returns
+    // ERROR_SUCCESS, ERROR_NOT_ENOUGH_MEMORY if the names could not be copied,
+    // or GetLastError() if SymLoadModule failed.
+    CHAR *szImg  = _strdup( img );
+    CHAR *szMod  = _strdup( mod );
+    DWORD result = ERROR_SUCCESS;
+    if ( ( szImg == nullptr ) || ( szMod == nullptr ) ) {
+        result = ERROR_NOT_ENOUGH_MEMORY;
+    } else {
+        if ( SymLoadModule( hProcess, 0, szImg, szMod, baseAddr, size ) == 0 )
+            result = GetLastError();
+    }
+    // The version / pdb lookups below are informational and do not affect the result
+    ULONGLONG fileVersion = 0;
+    if ( szImg != nullptr ) {
+        // try to retrieve the file-version:
+        VS_FIXEDFILEINFO *fInfo = nullptr;
+        DWORD dwHandle;
+        DWORD dwSize = GetFileVersionInfoSizeA( szImg, &dwHandle );
+        LPVOID vData = ( dwSize > 0 ) ? malloc( dwSize ) : nullptr;
+        if ( vData != nullptr ) {
+            if ( GetFileVersionInfoA( szImg, dwHandle, dwSize, vData ) != 0 ) {
+                UINT len;
+                TCHAR szSubBlock[] = _T("\\");
+                if ( VerQueryValue( vData, szSubBlock, (LPVOID *) &fInfo, &len ) != 0 && fInfo != nullptr )
+                    fileVersion = ( (ULONGLONG) fInfo->dwFileVersionLS ) +
+                                  ( (ULONGLONG) fInfo->dwFileVersionMS << 32 );
+            }
+            free( vData );
+        }
+        // Retrieve some additional info about the module.  The struct is zeroed
+        // first so its name fields are well defined even if the query fails
+        IMAGEHLP_MODULE64 Module;
+        memset( &Module, 0, sizeof( Module ) );
+        Module.SizeOfStruct = sizeof( IMAGEHLP_MODULE64 );
+        SymGetModuleInfo64( hProcess, baseAddr, &Module );
+        LPCSTR pdbName = Module.LoadedImageName;
+        if ( Module.LoadedPdbName[0] != 0 )
+            pdbName = Module.LoadedPdbName;
+        (void) pdbName;         // not used further at present
+        (void) fileVersion;     // not used further at present
+    }
+    free( szImg );   // free( nullptr ) is a no-op
+    free( szMod );
+    return result;
+}
+BOOL StackTrace::GetModuleListPSAPI( HANDLE hProcess )
+{
+    // Loads the symbols of every module of hProcess found through the PSAPI
+    // enumeration; the result is non-zero if at least one module was listed
+    HMODULE modules[1024];
+    DWORD bytesNeeded;
+    if ( !EnumProcessModules( hProcess, modules, sizeof( modules ), &bytesNeeded ) )
+        return false;
+    if ( bytesNeeded > sizeof( modules ) ) {
+        printf( "Insufficient memory allocated in GetModuleListPSAPI\n" );
+        return false;
+    }
+    char imagePath[8192];
+    char baseName[8192];
+    const DWORD N_modules = bytesNeeded / sizeof( modules[0] );
+    for ( DWORD k = 0; k < N_modules; ++k ) {
+        // Base address and size of the module
+        MODULEINFO details;
+        GetModuleInformation( hProcess, modules[k], &details, sizeof( details ) );
+        // Image file name (the buffer is cleared before the query)
+        imagePath[0] = 0;
+        GetModuleFileNameExA( hProcess, modules[k], imagePath, sizeof( imagePath ) );
+        // Module name (the buffer is cleared before the query)
+        baseName[0] = 0;
+        GetModuleBaseNameA( hProcess, modules[k], baseName, sizeof( baseName ) );
+        // Failures are reported but do not stop the enumeration
+        const DWORD status = LoadModule( hProcess, imagePath, baseName, (DWORD64) details.lpBaseOfDll, details.SizeOfImage );
+        if ( status != ERROR_SUCCESS )
+            printf( "ERROR: LoadModule (%d)\n", status );
+    }
+    return N_modules != 0;
+}
+void StackTrace::LoadModules() // one-time initialization of the symbol handler and module list
+{
+    static bool modules_loaded = false; // NOTE(review): plain flag, no synchronization for concurrent first calls
+    if ( !modules_loaded ) {
+        modules_loaded = true; // set before the work is done, so a failed attempt is not retried
+
+        // Get the search paths for symbols
+        std::string paths = StackTrace::getSymPaths();
+
+        // Initialize the symbols
+        if ( SymInitialize( GetCurrentProcess(), paths.c_str(), FALSE ) == FALSE )
+            printf( "ERROR: SymInitialize (%d)\n", GetLastError() );
+
+        DWORD symOptions = SymGetOptions();
+        symOptions |= SYMOPT_LOAD_LINES | SYMOPT_FAIL_CRITICAL_ERRORS;
+        symOptions     = SymSetOptions( symOptions ); // the value stored back is not read again
+        char buf[1024] = { 0 };
+        if ( SymGetSearchPath( GetCurrentProcess(), buf, sizeof( buf ) ) == FALSE ) // buf is not used further; only a failure is reported
+            printf( "ERROR: SymGetSearchPath (%d)\n", GetLastError() );
+
+        // First try to load modules from toolhelp32
+        BOOL loaded = StackTrace::GetModuleListTH32( GetCurrentProcess(), GetCurrentProcessId() );
+
+        // Try to load from Psapi
+        if ( !loaded )
+            loaded = StackTrace::GetModuleListPSAPI( GetCurrentProcess() ); // the final value of loaded is not checked
+    }
+}
+#endif
+
+
+/****************************************************************************
+*  Get the signal name                                                      *
+****************************************************************************/
+std::string StackTrace::signalName( int sig )
+{   // Text description of the signal as reported by strsignal
+    return strsignal( sig );
+}
+std::vector<int> StackTrace::allSignalsToCatch()
+{   // Signals 1-31 plus [SIGRTMIN,SIGRTMAX] without SIGKILL/SIGSTOP, in ascending order
+    std::set<int> sigs;
+    for ( int s = SIGRTMIN; s <= SIGRTMAX; ++s )
+        sigs.insert( s );
+    for ( int s = 1; s < 32; ++s )
+        sigs.insert( s );
+    sigs.erase( SIGSTOP );
+    sigs.erase( SIGKILL );
+    return std::vector<int>( sigs.begin(), sigs.end() );
+}
+std::vector<int> StackTrace::defaultSignalsToCatch()
+{
+    // Everything from allSignalsToCatch() (already unique and ascending)
+    // except window-changed and continue, which are not caught by default
+    auto sigs = allSignalsToCatch();
+    sigs.erase( std::remove_if( sigs.begin(), sigs.end(), []( int s ) { return s == SIGWINCH || s == SIGCONT; } ), sigs.end() );
+    return sigs;
+}
+
+
+/****************************************************************************
+*  Set the signal handlers                                                  *
+****************************************************************************/
+static std::function<void( std::string, StackTrace::terminateType )> abort_fun;
+static std::string rethrow() // message of the exception being handled (USE_LINUX only; empty string otherwise)
+{
+    std::string last_message;
+#ifdef USE_LINUX
+    try {
+        static int tried_throw = 0; // only the first call ever attempts the rethrow
+        if ( tried_throw == 0 ) {
+            tried_throw = 1;
+            throw; // rethrow the current exception so the handlers below can inspect it
+        }
+        // No active exception
+    } catch ( const std::exception &err ) {
+        // Caught an exception derived from std::exception
+        last_message = err.what();
+    } catch ( ... ) {
+        // Caught an unknown exception
+        last_message = "unknown exception occurred.";
+    }
+#endif
+    return last_message;
+}
+static void term_func_abort( int sig )
+{
+    // Signal handler: report the name of the signal through abort_fun
+    const std::string msg = "Caught signal: " + StackTrace::signalName( sig );
+    abort_fun( msg, StackTrace::terminateType::signal );
+}
+static std::set<int> signals_set = std::set<int>();
+static void term_func()
+{   // Fetch the exception message first, then reset the signals and report
+    const std::string report = "Unhandled exception:\n" + rethrow();
+    StackTrace::clearSignals();
+    abort_fun( report, StackTrace::terminateType::exception );
+}
+void StackTrace::clearSignal( int sig )
+{
+    // Only signals recorded in signals_set are reset to the default action
+    if ( signals_set.erase( sig ) == 0 )
+        return;
+    signal( sig, SIG_DFL );
+}
+void StackTrace::clearSignals()
+{
+    for ( auto sig : signals_set )
+        signal( sig, SIG_DFL );
+    signals_set.clear();
+}
+void StackTrace::setSignals( const std::vector<int>& signals, void (*handler) (int) )
+{
+    for ( auto sig : signals ) {
+        signal( sig, handler );
+        signals_set.insert( sig );
+    }
+}
+void StackTrace::setErrorHandlers(
+    std::function<void( std::string, StackTrace::terminateType )> abort )
+{
+    abort_fun = abort;
+    std::set_terminate( term_func );
+    setSignals( defaultSignalsToCatch(), &term_func_abort );
+    std::set_unexpected( term_func );
+}
+
+
+/****************************************************************************
+*  Global call stack functionality                                          *
+****************************************************************************/
+#ifdef USE_MPI
+static MPI_Comm globalCommForGlobalCommStack = MPI_COMM_NULL;
+static std::shared_ptr<std::thread> globalMonitorThread;
+static bool stopGlobalMonitorThread = false;
+static void runGlobalMonitorThread()
+{
+    // Monitor loop run on the background thread: polls globalCommForGlobalCommStack for
+    // stack requests (tag 1) and replies with the packed stacks of all other threads.
+    // NOTE(review): stopGlobalMonitorThread is a plain bool written by another thread in
+    // globalCallStackFinalize(); consider std::atomic<bool> to avoid a data race.
+    while ( !stopGlobalMonitorThread ) {
+        // Check for any messages
+        int flag = 0;
+        MPI_Status status;
+        int err = MPI_Iprobe( MPI_ANY_SOURCE, 1, globalCommForGlobalCommStack, &flag, &status );
+        if ( err != MPI_SUCCESS ) {
+            printf("Internal error in StackTrace::getGlobalCallStacks::runGlobalMonitorThread\n");
+            break;
+        } else if ( flag != 0 ) {
+            // We received a request; its payload is the tag to use for the reply
+            int src_rank = status.MPI_SOURCE;
+            int tag = 0;
+            MPI_Recv( &tag, 1, MPI_INT, src_rank, 1, globalCommForGlobalCommStack, &status );
+            // Get a trace of all threads (except this)
+            auto threads = StackTrace::activeThreads( );
+            threads.erase( StackTrace::thisThread( ) );
+            if ( threads.empty() )
+                continue;   // NOTE(review): no reply is sent; the requester waits for its timeout
+            // Get the stack trace of each thread
+            std::vector<std::vector<StackTrace::stack_info>> stack;
+            for ( auto thread : threads )
+                stack.push_back( StackTrace::getCallStack( thread ) );
+            // Pack and send the data
+            auto data = pack( stack );
+            int count = data.size();
+            MPI_Send( data.data(), count, MPI_CHAR, src_rank, tag, globalCommForGlobalCommStack );
+        } else {
+            // No requests received
+            std::this_thread::sleep_for( std::chrono::milliseconds(50) );
+        }
+    }
+}
+void StackTrace::globalCallStackInitialize( MPI_Comm comm )
+{
+    // Keep a private duplicate of comm for the stack request/response messages
+    MPI_Comm_dup( comm, &globalCommForGlobalCommStack );
+    // Launch the background thread that services requests until finalize is called
+    stopGlobalMonitorThread = false;
+    globalMonitorThread.reset( new std::thread( runGlobalMonitorThread ) );
+}
+void StackTrace::globalCallStackFinalize( )
+{
+    // Stop and join the monitor thread (if one was started), then free the duplicated comm
+    stopGlobalMonitorThread = true;
+    if ( globalMonitorThread != nullptr )
+        globalMonitorThread->join();
+    globalMonitorThread.reset();
+    if ( globalCommForGlobalCommStack != MPI_COMM_NULL )   // handle is not testable as a bool
+        MPI_Comm_free( &globalCommForGlobalCommStack );
+}
+StackTrace::multi_stack_info StackTrace::getGlobalCallStacks( )
+{
+    // Check if we properly initialized the comm
+    if ( globalMonitorThread == nullptr ) {
+        printf("Warning: getGlobalCallStacks called without call to globalCallStackInitialize\n");
+        return getAllCallStacks( );
+    }
+    if ( activeThreads().size()==1 ) {
+        printf("Warning: getAllCallStacks not supported on this OS, defaulting to basic call stack\n");
+        return getAllCallStacks( );
+    }
+    // Signal all processes that we want their stack for all threads (reply tag is random)
+    int rank = 0;
+    int size = 1;
+    MPI_Comm_size( globalCommForGlobalCommStack, &size );
+    MPI_Comm_rank( globalCommForGlobalCommStack, &rank );
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> dis(2,0x7FFF);
+    int tag = dis(gen);
+    std::vector<MPI_Request> sendRequest( size, MPI_REQUEST_NULL );
+    // NOTE(review): sendRequest is never waited on/freed, yet tag must outlive the sends
+    for (int i=0; i<size; i++) {
+        if ( i != rank )
+            MPI_Isend( &tag, 1, MPI_INT, i, 1, globalCommForGlobalCommStack, &sendRequest[i] );
+    }
+    // Get the trace for the current process
+    auto threads = StackTrace::activeThreads( );
+    threads.erase( globalMonitorThread->native_handle() );
+    StackTrace::multi_stack_info multistack;
+    for ( auto thread : threads ) {
+        auto stack = StackTrace::getCallStack( thread );
+        multistack.add( stack.size(), stack.data() );
+    }
+    // Receive the backtrace for all processes/threads (give up after max_time seconds)
+    int N_finished = 1;
+    auto start = std::chrono::steady_clock::now();
+    double time = 0;
+    const double max_time = 2.0 + size*20e-3;
+    while ( N_finished<size && time<max_time ) {
+        int flag = 0;
+        MPI_Status status;
+        int err = MPI_Iprobe( MPI_ANY_SOURCE, tag, globalCommForGlobalCommStack, &flag, &status );
+        if ( err != MPI_SUCCESS ) {
+            printf("Internal error in StackTrace::getGlobalCallStacks\n");
+            break;
+        } else if ( flag != 0 ) {
+            // We received a response
+            int src_rank = status.MPI_SOURCE;
+            int count;
+            MPI_Get_count( &status, MPI_CHAR, &count );
+            std::vector<char> data( count, 0 );
+            MPI_Recv( data.data(), count, MPI_CHAR, src_rank, tag, globalCommForGlobalCommStack, &status );
+            auto stack_list = unpack( data );
+            for ( const auto& stack : stack_list )
+                multistack.add( stack.size(), stack.data() );
+            N_finished++;
+        } else {
+            auto stop = std::chrono::steady_clock::now();
+            time = std::chrono::duration<double>( stop - start ).count();   // fractional seconds
+            std::this_thread::yield();
+        }
+    }
+    return multistack;
+}
+#else
+void StackTrace::globalCallStackInitialize( MPI_Comm )
+{
+}
+void StackTrace::globalCallStackFinalize( )
+{
+}
+StackTrace::multi_stack_info StackTrace::getGlobalCallStacks( )
+{
+    return getAllCallStacks( );
+}
+#endif
+
diff --git a/common/StackTrace.h b/common/StackTrace.h
index 1a5d1dac..f3ca5698 100644
--- a/common/StackTrace.h
+++ b/common/StackTrace.h
@@ -1,12 +1,31 @@
-#ifndef included_StackTrace
-#define included_StackTrace
+#ifndef included_AtomicStackTrace
+#define included_AtomicStackTrace
 
+#include <functional>
+#include <iostream>
 #include <stdio.h>
 #include <stdlib.h>
-#include <iostream>
 #include <vector>
+#include <thread>
+#include <memory>
+#include <set>
 
 
+// Check for and include MPI
+// clang-format off
+#if defined(USE_MPI) || defined(USE_EXT_MPI)
+    #include "mpi.h"
+#elif defined(__has_include)
+    #if __has_include("mpi.h")
+        #include "mpi.h"
+    #else
+        typedef int MPI_Comm;
+    #endif
+#else
+    typedef int MPI_Comm;
+#endif
+// clang-format on
+
 
 namespace StackTrace {
 
@@ -19,29 +38,179 @@ struct stack_info {
     std::string filename;
     int line;
     //! Default constructor
-    stack_info(): address(NULL), address2(NULL), line(0) {}
+    stack_info() : address( nullptr ), address2( nullptr ), line( 0 ) {}
+    //! Operator==
+    bool operator==( const stack_info& rhs ) const;
+    //! Operator!=
+    bool operator!=( const stack_info& rhs ) const;
     //! Print the stack info
     std::string print() const;
+    //! Compute the number of bytes needed to store the object
+    size_t size() const;
+    //! Pack the data to a byte array, returning a pointer to the end of the data
+    char* pack( char* ptr ) const;
+    //! Unpack the data from a byte array, returning a pointer to the end of the data
+    const char* unpack( const char* ptr );
+    //! Pack a vector of data to a memory block
+    static std::vector<char> packArray( const std::vector<stack_info>& data );
+    //! Unpack a vector of data from a memory block
+    static std::vector<stack_info> unpackArray( const char* data );
 };
 
 
-//! Function to return the current call stack
+struct multi_stack_info {
+    int N;
+    stack_info stack;
+    std::vector<multi_stack_info> children;
+    //! Default constructor
+    multi_stack_info() : N( 0 ) {}
+    //! Add the given stack to the multistack
+    void add( size_t N, const stack_info *stack );
+    //! Print the stack info
+    std::vector<std::string> print( const std::string& prefix=std::string() ) const;
+};
+
+
+/*!
+ * @brief  Get the current call stack
+ * @details  This function returns the current call stack for the current thread
+ * @return      Returns vector containing the stack
+ */
 std::vector<stack_info> getCallStack();
 
 
+/*!
+ * @brief  Get the current call stack for a thread
+ * @details  This function returns the current call stack for the given thread
+ * @param[in] id    The thread id of the stack we want to return
+ * @return          Returns vector containing the stack
+ */
+std::vector<stack_info> getCallStack( std::thread::native_handle_type id );
+
+
+/*!
+ * @brief  Get the current call stack for all threads
+ * @details  This function returns the current call stack for all threads
+ *    in the current process.
+ *    Note: This functionality may not be available on all platforms
+ * @return          Returns a multi_stack_info containing the stacks
+ */
+multi_stack_info getAllCallStacks( );
+
+
+/*!
+ * @brief  Get the current call stack for all threads/processes
+ * @details  This function returns the current call stack for all threads
+ *    of all processes in the communicator passed to globalCallStackInitialize().
+ *    The user must call globalCallStackInitialize() before calling this
+ *    routine, and globalCallStackFinalize() before exiting.
+ *    Note: This functionality may not be available on all platforms
+ * @return          Returns a multi_stack_info containing the stacks
+ */
+multi_stack_info getGlobalCallStacks( );
+
+
+//! Function to return the current call stack for the current thread
+std::vector<void *> backtrace();
+
+//! Function to return the current call stack for the given thread
+std::vector<void *> backtrace( std::thread::native_handle_type id );
+
+//! Function to return the current call stack for all threads
+std::vector<std::vector<void *>> backtraceAll();
+
+
 //! Function to return the stack info for a given address
-stack_info getStackInfo( void* address );
+stack_info getStackInfo( void *address );
+
+
+//! Function to return the stack info for a given address
+std::vector<stack_info> getStackInfo( const std::vector<void *> &address );
+
+
+//! Function to return the signal name
+std::string signalName( int signal );
 
 
 /*!
  * Return the symbols from the current executable (not availible for all platforms)
  * @return      Returns 0 if sucessful
  */
-int getSymbols( std::vector<void*>& address, std::vector<char>& type, std::vector<std::string>& obj );
+int getSymbols(
+    std::vector<void *> &address, std::vector<char> &type, std::vector<std::string> &obj );
+
+
+/*!
+ * Return the name of the executable
+ * @return      Returns the name of the executable (usually the full path)
+ */
+std::string getExecutable();
+
+
+/*!
+ * Return the search path for the symbols
+ * @return      Returns the search path for the symbols
+ */
+std::string getSymPaths();
+
+
+//! Terminate type
+enum class terminateType { signal, exception };
+
+/*!
+ * Set the error handlers
+ * @param[in] abort   Function to terminate the program: abort(msg,type)
+ */
+void setErrorHandlers( std::function<void( std::string, terminateType )> abort );
+
+
+/*!
+ * Register handler as the signal handler for each signal in signals
+ * @param[in] signals,handler   Signals to catch; function called as handler(signal)
+ */
+void setSignals( const std::vector<int>& signals, void (*handler) (int) );
+
+
+//! Clear a signal set by setSignals
+void clearSignal( int signal );
+
+
+//! Clear all signals set by setSignals
+void clearSignals( );
+
+
+//! Return a list of all signals that can be caught
+std::vector<int> allSignalsToCatch( );
+
+//! Return a default list of signals to catch
+std::vector<int> defaultSignalsToCatch( );
+
+
+//! Get a list of the active threads
+std::set<std::thread::native_handle_type> activeThreads( );
+
+//! Get a handle to this thread
+std::thread::native_handle_type thisThread( );
+
+
+//! Initialize globalCallStack functionality
+void globalCallStackInitialize( MPI_Comm comm );
+
+//! Clean up globalCallStack functionality
+void globalCallStackFinalize( );
+
+
+/*!
+ * @brief  Call system command
+ * @details  This function calls a system command, waits for the program
+ *   to execute, captures and returns the output and exit code.
+ * @param[in] cmd           Command to execute
+ * @param[out] exit_code    Exit code returned from child process
+ * @return                  Returns string containing the output
+ */
+std::string exec( const std::string& cmd, int& exit_code );
 
 
 } // namespace StackTrace
 
-
 #endif
-
diff --git a/tests/lbpm_color_simulator.h b/tests/lbpm_color_simulator.h
index 1a7a0294..1aa00123 100644
--- a/tests/lbpm_color_simulator.h
+++ b/tests/lbpm_color_simulator.h
@@ -36,12 +36,11 @@ public:
         std::shared_ptr<double> cDistEven_, std::shared_ptr<double>cDistOdd_, int N_ ):
         filename(filename_), cDen(cDen_), cDistEven(cDistEven_), cDistOdd(cDistOdd_), N(N_) {}
     virtual void run() {
-        ThreadPool::WorkItem::d_state = 1;  // Change state to in progress
         PROFILE_START("Save Checkpoint",1);
         WriteCheckpoint(filename,cDen.get(),cDistEven.get(),cDistOdd.get(),N);
         PROFILE_STOP("Save Checkpoint",1);
-        ThreadPool::WorkItem::d_state = 2;  // Change state to finished
     };
+    virtual bool has_result() const { return false; }
 private:
     WriteRestartWorkItem();
     const char* filename;
@@ -67,7 +66,6 @@ public:
         }
     ~BlobIdentificationWorkItem1() { MPI_Comm_free(&newcomm); }
     virtual void run() {
-        ThreadPool::WorkItem::d_state = 1;  // Change state to in progress
         // Compute the global blob id and compare to the previous version
         PROFILE_START("Identify blobs",1);
         double vF = 0.0;
@@ -75,8 +73,8 @@ public:
         IntArray& ids = new_index->second;
         new_index->first = ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,rank_info,*phase,dist,vF,vS,ids,newcomm);
         PROFILE_STOP("Identify blobs",1);
-        ThreadPool::WorkItem::d_state = 2;  // Change state to finished
     }
+    virtual bool has_result() const { return false; }
 private:
     BlobIdentificationWorkItem1();
     int timestep;
@@ -101,7 +99,6 @@ public:
         }
     ~BlobIdentificationWorkItem2() { MPI_Comm_free(&newcomm); }
     virtual void run() {
-        ThreadPool::WorkItem::d_state = 1;  // Change state to in progress
         // Compute the global blob id and compare to the previous version
         PROFILE_START("Identify blobs maps",1);
         const IntArray& ids = new_index->second;
@@ -123,8 +120,8 @@ public:
             writeIDMap(map,timestep,id_map_filename);
         }
         PROFILE_STOP("Identify blobs maps",1);
-        ThreadPool::WorkItem::d_state = 2;  // Change state to finished
     }
+    virtual bool has_result() const { return false; }
 private:
     BlobIdentificationWorkItem2();
     int timestep;
@@ -150,7 +147,6 @@ public:
         }
     ~WriteVisWorkItem() { MPI_Comm_free(&newcomm); }
     virtual void run() {
-        ThreadPool::WorkItem::d_state = 1;  // Change state to in progress
         PROFILE_START("Save Vis",1);
         ASSERT(visData[0].vars[0]->name=="phase");
         ASSERT(visData[0].vars[1]->name=="Pressure");
@@ -166,8 +162,8 @@ public:
         fillData.copy(Averages.Label_NWP,BlobData);
         IO::writeData( timestep, visData, newcomm );
         PROFILE_STOP("Save Vis",1);
-        ThreadPool::WorkItem::d_state = 2;  // Change state to finished
     };
+    virtual bool has_result() const { return false; }
 private:
     WriteVisWorkItem();
     int timestep;
@@ -189,7 +185,6 @@ public:
         blob_ids(ids), id_list(id_list_), beta(beta_) { }
     ~AnalysisWorkItem() { }
     virtual void run() {
-        ThreadPool::WorkItem::d_state = 1;  // Change state to in progress
         Averages.NumberComponents_NWP = blob_ids->first;
         Averages.Label_NWP = blob_ids->second;
         Averages.Label_NWP_map = *id_list;
@@ -215,8 +210,8 @@ public:
             Averages.PrintComponents(timestep);
             PROFILE_STOP("Compute dist",1);
         }
-        ThreadPool::WorkItem::d_state = 2;  // Change state to finished
     }
+    virtual bool has_result() const { return false; }
 private:
     AnalysisWorkItem();
     AnalysisType type;
diff --git a/threadpool/atomic_helpers.cpp b/threadpool/atomic_helpers.cpp
index 9834998c..1cac8e83 100644
--- a/threadpool/atomic_helpers.cpp
+++ b/threadpool/atomic_helpers.cpp
@@ -2,25 +2,27 @@
 #include <stdexcept>
 
 #ifdef USE_PTHREAD_ATOMIC_LOCK
-    // Print a warning if we defaulted to use pthreads for atomic operations
-    // This can decrease the performance of atomic operations
-    // We print the message here so it is only printed once
-    #warning using pthreads for atomic operations, this may affect performance
+// Print a warning if we defaulted to use pthreads for atomic operations
+// This can decrease the performance of atomic operations
+// We print the message here so it is only printed once
+#warning using pthreads for atomic operations, this may affect performance
 #endif
 
 
 namespace AtomicOperations {
 
 #ifdef USE_PTHREAD_ATOMIC_LOCK
-    pthread_mutex_t atomic_pthread_lock;
-    static pthread_mutexattr_t threadpool_global_attr;
-    static int create_atomic_pthread_lock( ) {
-        pthread_mutexattr_init(&threadpool_global_attr);
-        int error = pthread_mutex_init(&atomic_pthread_lock,&threadpool_global_attr);
-        if ( error != 0 ) { throw std::logic_error("Error initializing mutex:"); }
-        return error;
-    }
-    int atomic_pthread_lock_initialized = create_atomic_pthread_lock();
+pthread_mutex_t atomic_pthread_lock;
+static pthread_mutexattr_t threadpool_global_attr;
+static int create_atomic_pthread_lock()
+{
+    pthread_mutexattr_init( &threadpool_global_attr );
+    int error = pthread_mutex_init( &atomic_pthread_lock, &threadpool_global_attr );
+    if ( error != 0 )
+        throw std::logic_error( "Error initializing mutex:" );
+    return error;
+}
+int atomic_pthread_lock_initialized = create_atomic_pthread_lock();
 #endif
 
 } // AtomicOperations namespace
diff --git a/threadpool/atomic_helpers.h b/threadpool/atomic_helpers.h
index f89d5264..5e8c4cfb 100644
--- a/threadpool/atomic_helpers.h
+++ b/threadpool/atomic_helpers.h
@@ -2,47 +2,49 @@
 // but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 #ifndef included_ThreadPoolAtomicHelpers
 #define included_ThreadPoolAtomicHelpers
+#include <stdint.h>
 #include <stdio.h>
 #include <typeinfo>
-#include <stdint.h>
+#include <stdexcept>
 
-// Choose the OS 
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-    // Using windows
-    #define USE_WINDOWS
-    #define NOMINMAX
-    #include <stdlib.h>
-    #include <windows.h>
-    #include <process.h>
-#elif defined(__APPLE__)
-    // Using MAC
-    #define USE_MAC
-    #include <libkern/OSAtomic.h>
-#elif defined(__linux) || defined(__unix) || defined(__posix)
-    // Using Linux
-    #define USE_LINUX
-    #include <unistd.h>
-    #if !defined(__GNUC__)
-        #define USE_PTHREAD_ATOMIC_LOCK
-        #include "pthread.h"
-    #endif
-#else
-    #error Unknown OS
+// Choose the OS
+#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
+// Using windows
+#define USE_WINDOWS
+#define NOMINMAX
+#include <process.h>
+#include <stdlib.h>
+#include <windows.h>
+#elif defined( __APPLE__ )
+// Using MAC
+#define USE_MAC
+#include <libkern/OSAtomic.h>
+#elif defined( __linux ) || defined( __unix ) || defined( __posix )
+// Using Linux
+#define USE_LINUX
+#include <unistd.h>
+#if !defined( __GNUC__ )
+#define USE_PTHREAD_ATOMIC_LOCK
+#include "pthread.h"
 #endif
+#else
+#error Unknown OS
+#endif
+
 
 
 
 /** \namespace atomic
  * \brief Functions for atomic operations
  * \details This class provides wrapper routines to access simple atomic operations.
- *    Since atomic operations are system dependent, these functions are necessary 
+ *    Since atomic operations are system dependent, these functions are necessary
  *    to provide a platform independent interface.  We also provide some typedef
  *    variables to wrap OS dependencies.  Currently we have 32 and 64 bit integers:
- *    int32_atomic and int64_atomic.  In all cases the operations use the barrier 
- *    versions provided by the compiler/OS if availible.  In most cases, these builtins 
- *    are considered a full barrier. That is, no memory operand will be moved across 
- *    the operation, either forward or backward. Further, instructions will be issued 
- *    as necessary to prevent the processor from speculating loads across the operation 
+ *    int32_atomic and int64_atomic.  In all cases the operations use the barrier
+ *    versions provided by the compiler/OS if available.  In most cases, these builtins
+ *    are considered a full barrier. That is, no memory operand will be moved across
+ *    the operation, either forward or backward. Further, instructions will be issued
+ *    as necessary to prevent the processor from speculating loads across the operation
  *    and from queuing stores after the operation.
  *    Note: for all functions the variable being modified must be volatile to prevent
  *    compiler optimization that may cache the value.
@@ -51,280 +53,456 @@ namespace AtomicOperations {
 
 
 // Define int32_atomic, int64_atomic
-#include <stdint.h>
-#if defined(USE_WINDOWS)
-    typedef long int32_atomic;
-    typedef __int64 int64_atomic;
-    #define NO_INST_ATTR
-#elif defined(USE_MAC)
-    typedef int32_t int32_atomic;
-    typedef int64_t int64_atomic;
-    #define NO_INST_ATTR
-#elif defined(__GNUC__)
-    typedef int int32_atomic;
-    typedef long int int64_atomic;
-    #define NO_INST_ATTR __attribute__((no_instrument_function)) 
-#elif defined(USE_PTHREAD_ATOMIC_LOCK)
-    typedef int int32_atomic;
-    typedef long int int64_atomic;
-    #define NO_INST_ATTR
+#if defined( USE_WINDOWS )
+typedef long int32_atomic;
+typedef __int64 int64_atomic;
+#define NO_INST_ATTR_ATOMIC
+#elif defined( USE_MAC )
+typedef int32_t int32_atomic;
+typedef int64_t int64_atomic;
+#define NO_INST_ATTR_ATOMIC
+#elif defined( __GNUC__ )
+typedef int int32_atomic;
+typedef long int int64_atomic;
+#define NO_INST_ATTR_ATOMIC __attribute__( ( no_instrument_function ) )
+#elif defined( USE_PTHREAD_ATOMIC_LOCK )
+typedef int int32_atomic;
+typedef long int int64_atomic;
+#define NO_INST_ATTR_ATOMIC
 #else
-    #error Unknown OS
+#error Unknown OS
 #endif
 
 
 /**
- * \brief Increment returning the new value
- * \details Increment x and return the new value
- * \param[in] x     The pointer to the value to increment    
+ * \brief Get the value
+ * \details Read the data in x
+ * \param[in] x     The pointer to the value to get
  */
-inline int32_atomic atomic_increment( int32_atomic volatile *x ) NO_INST_ATTR;
+inline int32_atomic atomic_get( const int32_atomic volatile *x );
+
+
+/**
+ * \brief Get the value
+ * \details Read the data in x
+ * \param[in] x     The pointer to the value to get
+ */
+inline int64_atomic atomic_get( const int64_atomic volatile *x );
+
+/**
+ * \brief Set the value
+ * \details Set the data in x to y (*x=y)
+ * \param[in] x     The pointer to the value to set
+ * \param[in] y     The value to set
+ */
+inline void atomic_set( int32_atomic volatile *x, int32_atomic y );
+
+
+/**
+ * \brief Set the value
+ * \details Set the data in x to y (*x=y)
+ * \param[in] x     The pointer to the value to set
+ * \param[in] y     The value to set
+ */
+inline void atomic_set( int64_atomic volatile *x, int64_atomic y );
+
 
 /**
  * \brief Increment returning the new value
  * \details Increment x and return the new value
- * \param[in] x     The pointer to the value to increment    
+ * \param[in] x     The pointer to the value to increment
  */
-inline int64_atomic atomic_increment( int64_atomic volatile *x ) NO_INST_ATTR;
+inline int32_atomic atomic_increment( int32_atomic volatile *x ) NO_INST_ATTR_ATOMIC;
+
+/**
+ * \brief Increment returning the new value
+ * \details Increment x and return the new value
+ * \param[in] x     The pointer to the value to increment
+ */
+inline int64_atomic atomic_increment( int64_atomic volatile *x ) NO_INST_ATTR_ATOMIC;
 
 /**
  * \brief Decrement returning the new value
  * \details Decrement x and return the new value
- * \param[in] x     The pointer to the value to decrement    
+ * \param[in] x     The pointer to the value to decrement
  */
-inline int32_atomic atomic_decrement( int32_atomic volatile *x ) NO_INST_ATTR;
+inline int32_atomic atomic_decrement( int32_atomic volatile *x ) NO_INST_ATTR_ATOMIC;
 
 /**
  * \brief Decrement returning the new value
  * \details Decrement x and return the new value
- * \param[in] x     The pointer to the value to decrement    
+ * \param[in] x     The pointer to the value to decrement
  */
-inline int64_atomic atomic_decrement( int64_atomic volatile *x ) NO_INST_ATTR;
+inline int64_atomic atomic_decrement( int64_atomic volatile *x ) NO_INST_ATTR_ATOMIC;
 
 /**
  * \brief Add returning the new value
  * \details Add y to x and return the new value
- * \param[in] x     The pointer to the value to add to    
+ * \param[in] x     The pointer to the value to add to
  * \param[in] y     The value to add
  */
-inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y ) NO_INST_ATTR;
+inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y ) NO_INST_ATTR_ATOMIC;
 
 /**
  * \brief Add returning the new value
  * \details Add y to x and return the new value
- * \param[in] x     The pointer to the value to add to    
+ * \param[in] x     The pointer to the value to add to
  * \param[in] y     The value to add
  */
-inline int64_atomic atomic_add( int64_atomic volatile *x, int32_atomic y ) NO_INST_ATTR;
+inline int64_atomic atomic_add( int64_atomic volatile *x, int64_atomic y ) NO_INST_ATTR_ATOMIC;
 
 /**
  * \brief Compare the given value and swap
- * \details Compare the existing value and swap if it matches. 
- *    This function returns the previous value.
- *    To return a bool indicating if the swap was performed,
- *    use "bool t = atomic_compare_and_swap(v,x,y)==x".
+ * \details Compare the existing value and swap if it matches.
+ * \return Returns true if the swap was performed
  * \param[in] v     The pointer to the value to check and swap
  * \param[in] x     The value to compare
  * \param[in] y     The value to swap iff *v==x
  */
-inline int32_atomic atomic_compare_and_swap( int32_atomic volatile *v, int32_atomic x, int32_atomic y );
+inline bool atomic_compare_and_swap( int32_atomic volatile *v, int32_atomic x, int32_atomic y );
 
 /**
  * \brief Compare the given value and swap
- * \details Compare the existing value and swap if it matches. 
- *    This function returns the previous value.
- *    To return a bool indicating if the swap was performed,
- *    use "bool t = atomic_compare_and_swap(v,x,y)==x".
+ * \details Compare the existing value and swap if it matches.
+ * \return Returns true if the swap was performed
  * \param[in] v     The pointer to the value to check and swap
  * \param[in] x     The value to compare
  * \param[in] y     The value to swap iff *v==x
  */
-inline int64_atomic atomic_compare_and_swap( int64_atomic volatile *v, int64_atomic x, int64_atomic y );
+inline bool atomic_compare_and_swap( int64_atomic volatile *v, int64_atomic x, int64_atomic y );
 
 /**
  * \brief Compare the given value and swap
- * \details Compare the existing value and swap if it matches. 
- *    This function returns the previous value.
- *    To return a bool indicating if the swap was performed,
- *    use "bool t = atomic_compare_and_swap(v,x,y)==x".
+ * \details Compare the existing value and swap if it matches.
+ * \return Returns true if the swap was performed
  * \param[in] v     The pointer to the value to check and swap
  * \param[in] x     The value to compare
  * \param[in] y     The value to swap iff *v==x
  */
-inline void* atomic_compare_and_swap( void* volatile *v, void* x, void* y );
+inline bool atomic_compare_and_swap( void *volatile *v, void *x, void *y );
 
+/**
+ * \brief Fetch the current value and "and" with given value
+ * \details Perform *v = (*v) & x,
+ *    returning the previous value
+ * \return Returns the previous value before the "and" operation
+ * \param[in] v     The pointer to the value to update
+ * \param[in] x     The value to "and" with *v
+ */
+inline int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x );
+
+/**
+ * \brief Fetch the current value and "and" with given value
+ * \details Perform *v = (*v) & x,
+ *    returning the previous value
+ * \return Returns the previous value before the "and" operation
+ * \param[in] v     The pointer to the value to update
+ * \param[in] x     The value to "and" with *v
+ */
+inline int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x );
+
+/**
+ * \brief Fetch the current value and "or" with given value
+ * \details Perform *v = (*v) | x,
+ *    returning the previous value
+ * \return Returns the previous value before the "or" operation
+ * \param[in] v     The pointer to the value to update
+ * \param[in] x     The value to "or" with *v
+ */
+inline int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x );
+
+/**
+ * \brief Fetch the current value and "or" with given value
+ * \details Perform *v = (*v) | x,
+ *    returning the previous value
+ * \return Returns the previous value before the "or" operation
+ * \param[in] v     The pointer to the value to update
+ * \param[in] x     The value to "or" with *v
+ */
+inline int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x );
+
+
+
+/**
+ * \brief Class to store a pool of objects
+ * \details This class stores a pool of objects that can be added/removed in a thread-safe way
+ */
+template<class TYPE,int N_MAX>
+class pool
+{
+  public:
+    pool( )
+    {
+        d_data = new volatile TYPE*[N_MAX];
+        for (int i=0; i<N_MAX; i++)
+            d_data[i] = new TYPE;
+    }
+    ~pool( )
+    {
+        // Slots whose object is currently checked out hold nullptr; delete ignores those
+        for (int i=0; i<N_MAX; i++)
+            delete d_data[i];
+        delete [] d_data;
+    }
+    inline TYPE* get()
+    {
+        int i=0;
+        while ( true ) {   // cycles over the slots until one yields a non-null object
+            TYPE* tmp = const_cast<TYPE*>( d_data[i] );
+            bool swapped = atomic_compare_and_swap( (void* volatile*) &d_data[i], tmp, nullptr );
+            if ( swapped && ( tmp != nullptr ) )
+                return tmp;
+            i = (i+1)%N_MAX;
+        }
+    }
+    inline void put( TYPE* ptr )
+    {
+        int i = 0;   // cycles over the slots until ptr is swapped into an empty (nullptr) one
+        while ( !atomic_compare_and_swap( (void* volatile*) &d_data[i], nullptr, ptr ) )
+            i = (i+1)%N_MAX;
+    }
+  private:
+    volatile TYPE **d_data;
+    pool( const pool &rhs ) = delete;
+    pool &operator=( const pool &rhs ) = delete;
+};
 
 
 // Define increment/decrement/add operators for int32, int64
-#if defined(USE_WINDOWS)
-    inline int32_atomic atomic_increment( int32_atomic volatile *x ) {
-        return InterlockedIncrement(x);
-    }
-    inline int64_atomic atomic_increment( int64_atomic volatile *x ) {
-        return InterlockedIncrement64(x);
-    }
-    inline int32_atomic atomic_decrement( int32_atomic volatile *x ) {
-        return InterlockedDecrement(x);
-    }
-    inline int64_atomic atomic_decrement( int64_atomic volatile *x ) {
-        return InterlockedDecrement64(x);
-    }
-    inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y ) {
-        return InterlockedExchangeAdd(x,y)+y;
-    }
-    inline int64_atomic atomic_add( int64_atomic volatile *x, int64_atomic y ) {
-        return InterlockedExchangeAdd64(x,y)+y;
-    }
-    inline int32_atomic atomic_compare_and_swap( int32_atomic volatile *v, int32_atomic x, int32_atomic y ) {
-        return InterlockedCompareExchange(v,x,y);
-    }
-    inline int64_atomic atomic_compare_and_swap( int64_atomic volatile *v, int64_atomic x, int64_atomic y ) {
-        return InterlockedCompareExchange64(v,x,y);
-    }
-    inline void* atomic_compare_and_swap( void* volatile *v, void* x, void* y ) {
-        return InterlockedCompareExchangePointer(v,x,y);
-    }
-#elif defined(USE_MAC)
-    inline int32_atomic atomic_increment( int32_atomic volatile *x ) {
-        return OSAtomicIncrement32Barrier(x);
-    }
-    inline int64_atomic atomic_increment( int64_atomic volatile *x ) {
-        return OSAtomicIncrement64Barrier(x);
-    }
-    inline int32_atomic atomic_decrement( int32_atomic volatile *x ) {
-        return OSAtomicDecrement32Barrier(x);
-    }
-    inline int64_atomic atomic_decrement( int64_atomic volatile *x ) {
-        return OSAtomicDecrement64Barrier(x);
-    }
-    inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y ) {
-       return OSAtomicAdd32Barrier(y,x);
-    }
-    inline int64_atomic atomic_add( int64_atomic volatile *x, int64_atomic y ) {
-       return OSAtomicAdd64Barrier(y,x);
-    }
-    inline int32_atomic atomic_compare_and_swap( int32_atomic volatile *v, int32_atomic x, int32_atomic y ) {
-        return OSAtomicCompareAndSwap32Barrier(x,y,v) ? y:x;
-    }
-    inline int64_atomic atomic_compare_and_swap( int64_atomic volatile *v, int64_atomic x, int64_atomic y ) {
-        return OSAtomicCompareAndSwap64Barrier(x,y,v) ? y:x;
-    }
-    inline void* atomic_compare_and_swap( void* volatile *v, void* x, void* y ) {
-        return OSAtomicCompareAndSwapPtrBarrier(x,y,v) ? y:x;
-    }
-#elif defined(__GNUC__)
-    int32_atomic atomic_increment( int32_atomic volatile *x ) {
-        return __sync_add_and_fetch(x,1);
-    }
-    int64_atomic atomic_increment( int64_atomic volatile *x ) {
-        return __sync_add_and_fetch(x,1);
-    }
-    int32_atomic atomic_decrement( int32_atomic volatile *x ) {
-        return __sync_sub_and_fetch(x,1);
-    }
-    int64_atomic atomic_decrement( int64_atomic volatile *x ) {
-        return __sync_sub_and_fetch(x,1);
-    }
-    inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y ) {
-        return __sync_add_and_fetch(x,y);
-    }
-    inline int64_atomic atomic_add( int64_atomic volatile *x, int64_atomic y ) {
-        return __sync_add_and_fetch(x,y);
-    }
-    inline int32_atomic atomic_compare_and_swap( int32_atomic volatile *v, int32_atomic x, int32_atomic y ) {
-        return __sync_val_compare_and_swap(v,x,y);
-    }
-    inline int64_atomic atomic_compare_and_swap( int64_atomic volatile *v, int64_atomic x, int64_atomic y ) {
-        return __sync_val_compare_and_swap(v,x,y);
-    }
-    inline void* atomic_compare_and_swap( void* volatile *v, void* x, void* y ) {
-        return __sync_val_compare_and_swap(v,x,y);
-    }
-#elif defined(USE_PTHREAD_ATOMIC_LOCK)
-    extern pthread_mutex_t atomic_pthread_lock;
-    inline int32_atomic atomic_increment( int32_atomic volatile *x ) {
-        pthread_mutex_lock(&atomic_pthread_lock);
-        int32_atomic y = ++(*x);
-        pthread_mutex_unlock(&atomic_pthread_lock);
-        return y;
-    }
-    inline int64_atomic atomic_increment( int64_atomic volatile *x ) {
-        pthread_mutex_lock(&atomic_pthread_lock);
-        int64_atomic y = ++(*x);
-        pthread_mutex_unlock(&atomic_pthread_lock);
-        return y;
-    }
-    inline int32_atomic atomic_decrement( int32_atomic volatile *x ) {
-        pthread_mutex_lock(&atomic_pthread_lock);
-        int32_atomic y = --(*x);
-        pthread_mutex_unlock(&atomic_pthread_lock);
-        return y;
-    }
-    inline int64_atomic atomic_decrement( int64_atomic volatile *x ) {
-        pthread_mutex_lock(&atomic_pthread_lock);
-        int64_atomic y = --(*x);
-        pthread_mutex_unlock(&atomic_pthread_lock);
-        return y;
-    }
-    inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y ) {
-        pthread_mutex_lock(&atomic_pthread_lock);
-        *x += y;
-        int32_atomic z = *x;
-        pthread_mutex_unlock(&atomic_pthread_lock);
-        return z;
-    }
-    inline int64_atomic atomic_add( int64_atomic volatile *x, int64_atomic y ) {
-        pthread_mutex_lock(&atomic_pthread_lock);
-        *x += y;
-        int64_atomic z = *x;
-        pthread_mutex_unlock(&atomic_pthread_lock);
-        return z;
-    }
-    inline int32_atomic atomic_compare_and_swap( int32_atomic volatile *v, int32_atomic x, int32_atomic y ) {
-        pthread_mutex_lock(&atomic_pthread_lock);
-        *v = (*v==x) ? y:x;
-        int32_atomic z = *v;
-        pthread_mutex_unlock(&atomic_pthread_lock);
-        return z;
-    }
-    inline int64_atomic atomic_compare_and_swap( int64_atomic volatile *v, int64_atomic x, int64_atomic y ) {
-        pthread_mutex_lock(&atomic_pthread_lock);
-        *v = (*v==x) ? y:x;
-        int64_atomic z = *v;
-        pthread_mutex_unlock(&atomic_pthread_lock);
-        return z;
-    }
-    inline void* atomic_compare_and_swap( void* volatile *v, void* x, void* y ) {
-        pthread_mutex_lock(&atomic_pthread_lock);
-        *v = (*v==x) ? y:x;
-        void* z = *v;
-        pthread_mutex_unlock(&atomic_pthread_lock);
-        return z;
-    }
+#if defined( USE_WINDOWS )
+// Atomically increment *x (32-bit); returns the new value
+inline int32_atomic
+atomic_increment( int32_atomic volatile *x )
+{ return InterlockedIncrement( x ); }
+// Atomically increment *x (64-bit); returns the new value
+inline int64_atomic
+atomic_increment( int64_atomic volatile *x )
+{ return InterlockedIncrement64( x ); }
+// Atomically decrement *x (32-bit); returns the new value
+inline int32_atomic
+atomic_decrement( int32_atomic volatile *x )
+{ return InterlockedDecrement( x ); }
+// Atomically decrement *x (64-bit); returns the new value
+inline int64_atomic
+atomic_decrement( int64_atomic volatile *x )
+{ return InterlockedDecrement64( x ); }
+// Atomically add y to *x (32-bit); returns InterlockedExchangeAdd's result plus y
+inline int32_atomic
+atomic_add( int32_atomic volatile *x, int32_atomic y )
+{ return y + InterlockedExchangeAdd( x, y ); }
+// Atomically add y to *x (64-bit); returns InterlockedExchangeAdd64's result plus y
+inline int64_atomic
+atomic_add( int64_atomic volatile *x, int64_atomic y )
+{ return y + InterlockedExchangeAdd64( x, y ); }
+// Store y into *v if *v == x (32-bit); true when InterlockedCompareExchange reports x
+inline bool
+atomic_compare_and_swap( int32_atomic volatile *v, int32_atomic x, int32_atomic y )
+{ return x == InterlockedCompareExchange( v, y, x ); }
+// Store y into *v if *v == x (64-bit); true when InterlockedCompareExchange64 reports x
+inline bool
+atomic_compare_and_swap( int64_atomic volatile *v, int64_atomic x, int64_atomic y )
+{ return x == InterlockedCompareExchange64( v, y, x ); }
+// Store y into *v iff *v == x; returns true if the swap happened.
+// Interlocked argument order is (destination, exchange, comparand), as in the integer overloads.
+inline bool atomic_compare_and_swap( void *volatile *v, void *x, void *y )
+{ return InterlockedCompareExchangePointer( v, y, x ) == x; }
+#elif defined( USE_MAC )
+// Atomically increment *x (32-bit); returns the new value
+inline int32_atomic
+atomic_increment( int32_atomic volatile *x )
+{ return OSAtomicIncrement32Barrier( x ); }
+// Atomically increment *x (64-bit); returns the new value
+inline int64_atomic
+atomic_increment( int64_atomic volatile *x )
+{ return OSAtomicIncrement64Barrier( x ); }
+// Atomically decrement *x (32-bit); returns the new value
+inline int32_atomic
+atomic_decrement( int32_atomic volatile *x )
+{ return OSAtomicDecrement32Barrier( x ); }
+// Atomically decrement *x (64-bit); returns the new value
+inline int64_atomic
+atomic_decrement( int64_atomic volatile *x )
+{ return OSAtomicDecrement64Barrier( x ); }
+int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x ) { return OSAtomicOr32OrigBarrier( x, (volatile uint32_t *) v ); } // barrier variant, like the other wrappers in this branch
+int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x ) { return OSAtomicAnd32OrigBarrier( x, (volatile uint32_t *) v ); } // returns the value held before the "and"
+int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x ) { int64_atomic old = *v; while ( !OSAtomicCompareAndSwap64Barrier( old, old | x, v ) ) { old = *v; } return old; } // 64-bit "or" emulated with a compare-and-swap loop
+int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x ) { int64_atomic old = *v; while ( !OSAtomicCompareAndSwap64Barrier( old, old & x, v ) ) { old = *v; } return old; } // 64-bit "and" emulated with a compare-and-swap loop
+// Atomically add y to *x (32-bit); returns the result of OSAtomicAdd32Barrier
+inline int32_atomic
+atomic_add( int32_atomic volatile *x, int32_atomic y )
+{ return OSAtomicAdd32Barrier( y, x ); }
+// Atomically add y to *x (64-bit); returns the result of OSAtomicAdd64Barrier
+inline int64_atomic
+atomic_add( int64_atomic volatile *x, int64_atomic y )
+{ return OSAtomicAdd64Barrier( y, x ); }
+// Store y into *v if *v == x (32-bit); returns the success flag of the OS call
+inline bool
+atomic_compare_and_swap( int32_atomic volatile *v, int32_atomic x, int32_atomic y )
+{ return OSAtomicCompareAndSwap32Barrier( x, y, v ); }
+// Store y into *v if *v == x (64-bit); returns the success flag of the OS call
+inline bool
+atomic_compare_and_swap( int64_atomic volatile *v, int64_atomic x, int64_atomic y )
+{ return OSAtomicCompareAndSwap64Barrier( x, y, v ); }
+// Store y into *v if *v == x (pointer); returns the success flag of the OS call
+inline bool
+atomic_compare_and_swap( void *volatile *v, void *x, void *y )
+{ return OSAtomicCompareAndSwapPtrBarrier( x, y, v ); }
+#elif defined( __GNUC__ )
+// GNU-compatible compilers: each operation is a single __sync_* builtin.
+// add/sub forms return the updated value, fetch_and_* the previous value, compare_and_swap a success flag.
+int32_atomic atomic_increment( int32_atomic volatile *x )
+{ return __sync_add_and_fetch( x, 1 ); }
+int64_atomic atomic_increment( int64_atomic volatile *x )
+{ return __sync_add_and_fetch( x, 1 ); }
+int32_atomic atomic_decrement( int32_atomic volatile *x )
+{ return __sync_sub_and_fetch( x, 1 ); }
+int64_atomic atomic_decrement( int64_atomic volatile *x )
+{ return __sync_sub_and_fetch( x, 1 ); }
+int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x )
+{ return __sync_fetch_and_or( v, x ); }
+int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x )
+{ return __sync_fetch_and_or( v, x ); }
+int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x )
+{ return __sync_fetch_and_and( v, x ); }
+int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x )
+{ return __sync_fetch_and_and( v, x ); }
+inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y )
+{ return __sync_add_and_fetch( x, y ); }
+inline int64_atomic atomic_add( int64_atomic volatile *x, int64_atomic y )
+{ return __sync_add_and_fetch( x, y ); }
+inline bool atomic_compare_and_swap( int32_atomic volatile *v, int32_atomic x, int32_atomic y )
+{ return __sync_bool_compare_and_swap( v, x, y ); }
+inline bool atomic_compare_and_swap( int64_atomic volatile *v, int64_atomic x, int64_atomic y )
+{ return __sync_bool_compare_and_swap( v, x, y ); }
+inline bool atomic_compare_and_swap( void *volatile *v, void *x, void *y )
+{ return __sync_bool_compare_and_swap( v, x, y ); }
+#elif defined( USE_PTHREAD_ATOMIC_LOCK )
+extern pthread_mutex_t atomic_pthread_lock;   // the one mutex used by every operation in this branch
+inline int32_atomic atomic_increment( int32_atomic volatile *x )
+{   // Increment *x while holding the lock; returns the new value
+    pthread_mutex_lock( &atomic_pthread_lock );
+    int32_atomic updated = ( *x += 1 );
+    pthread_mutex_unlock( &atomic_pthread_lock );
+    return updated;
+}
+inline int64_atomic atomic_increment( int64_atomic volatile *x )
+{   // Increment *x while holding the lock; returns the new value
+    pthread_mutex_lock( &atomic_pthread_lock );
+    int64_atomic updated = ( *x += 1 );
+    pthread_mutex_unlock( &atomic_pthread_lock );
+    return updated;
+}
+inline int32_atomic atomic_decrement( int32_atomic volatile *x )
+{   // Decrement *x while holding the lock; returns the new value
+    pthread_mutex_lock( &atomic_pthread_lock );
+    int32_atomic updated = ( *x -= 1 );
+    pthread_mutex_unlock( &atomic_pthread_lock );
+    return updated;
+}
+inline int64_atomic atomic_decrement( int64_atomic volatile *x )
+{   // Decrement *x while holding the lock; returns the new value
+    pthread_mutex_lock( &atomic_pthread_lock );
+    int64_atomic updated = ( *x -= 1 );
+    pthread_mutex_unlock( &atomic_pthread_lock );
+    return updated;
+}
+// Add y to *x while holding the lock; returns the new value (32-bit)
+inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y )
+{
+    pthread_mutex_lock( &atomic_pthread_lock );
+    int32_atomic total = ( *x += y );
+    pthread_mutex_unlock( &atomic_pthread_lock );
+    return total;
+}
+// Add y to *x while holding the lock; returns the new value (64-bit)
+inline int64_atomic atomic_add( int64_atomic volatile *x, int64_atomic y )
+{
+    pthread_mutex_lock( &atomic_pthread_lock );
+    int64_atomic total = ( *x += y );
+    pthread_mutex_unlock( &atomic_pthread_lock );
+    return total;
+}
+inline bool atomic_compare_and_swap( int32_atomic volatile *v, int32_atomic x, int32_atomic y )
+{
+    pthread_mutex_lock( &atomic_pthread_lock );
+    bool test = *v == x;         // swap only when the current value matches x
+    if ( test ) { *v = y; }      // on a mismatch *v is left untouched (it used to be overwritten with x)
+    pthread_mutex_unlock( &atomic_pthread_lock );
+    return test;                 // true iff y was stored
+}
+inline bool atomic_compare_and_swap( int64_atomic volatile *v, int64_atomic x, int64_atomic y )
+{
+    pthread_mutex_lock( &atomic_pthread_lock );
+    bool test = *v == x;         // swap only when the current value matches x
+    if ( test ) { *v = y; }      // on a mismatch *v is left untouched (it used to be overwritten with x)
+    pthread_mutex_unlock( &atomic_pthread_lock );
+    return test;                 // true iff y was stored
+}
+inline bool atomic_compare_and_swap( void *volatile *v, void *x, void *y )
+{
+    pthread_mutex_lock( &atomic_pthread_lock );
+    bool test = *v == x;         // swap only when the current pointer matches x
+    if ( test ) { *v = y; }      // on a mismatch *v is left untouched (it used to be overwritten with x)
+    pthread_mutex_unlock( &atomic_pthread_lock );
+    return test;                 // true iff y was stored
+}
 #else
-    #error Unknown OS
+#error Unknown OS
 #endif
 
 
+// Read *x by atomically adding zero to it (32-bit)
+inline int32_atomic
+atomic_get( const int32_atomic volatile *x )
+{ return atomic_add( const_cast<int32_atomic volatile *>( x ), 0 ); }
+// Read *x by atomically adding zero to it (64-bit)
+inline int64_atomic
+atomic_get( const int64_atomic volatile *x )
+{ return atomic_add( const_cast<int64_atomic volatile *>( x ), 0 ); }
+// Store y into *x, repeating the compare-and-swap until it succeeds (32-bit)
+inline void atomic_set( int32_atomic volatile *x, int32_atomic y )
+{
+    for ( int32_atomic seen = *x; !atomic_compare_and_swap( x, seen, y ); seen = *x ) {}
+}
+// Store y into *x, repeating the compare-and-swap until it succeeds (64-bit)
+inline void atomic_set( int64_atomic volatile *x, int64_atomic y )
+{
+    for ( int64_atomic seen = *x; !atomic_compare_and_swap( x, seen, y ); seen = *x ) {}
+}
+inline void atomic_swap( int32_atomic volatile *x, int32_atomic *y )
+{   // Exchange the values: *x is set to *y and *y receives what *x held before (32-bit)
+    int32_atomic previous = *x;
+    for ( ; !atomic_compare_and_swap( x, previous, *y ); previous = *x ) {}
+    *y = previous;
+}
+inline void atomic_swap( int64_atomic volatile *x, int64_atomic *y )
+{   // Exchange the values: *x is set to *y and *y receives what *x held before (64-bit)
+    int64_atomic previous = *x;
+    for ( ; !atomic_compare_and_swap( x, previous, *y ); previous = *x ) {}
+    *y = previous;
+}
+
+
+
 // Define an atomic counter
 struct counter_t {
-    public:
-        // Constructor
-        inline counter_t(): count(0) {}
-        // Destructor
-        inline ~counter_t() {}                     // Destructor
-        // Increment returning the new value
-        inline int increment() { return atomic_increment(&count);}
-        // Decrement returning the new value
-        inline int decrement() { return atomic_decrement(&count);}
-        // Set the current value of the count
-        inline void setCount(int val) { count = val;}
-        // Get the current value of the count
-        inline int getCount() const { return count;}
-    private:
-        counter_t( const counter_t& );
-        counter_t& operator=( const counter_t& );
-        volatile int32_atomic count;
+public:
+    // Create a counter that starts at zero
+    counter_t() : count( 0 ) {}
+    ~counter_t() {} // nothing to clean up
+    // Add one to the count and return the result of atomic_increment
+    int increment() { return atomic_increment( &count ); }
+    // Subtract one from the count and return the result of atomic_decrement
+    int decrement() { return atomic_decrement( &count ); }
+    // Overwrite the count (plain assignment, no atomic helper is used)
+    void setCount( int val ) { count = val; }
+    // Read the count (plain load, no atomic helper is used)
+    int getCount() const { return count; }
+private:
+    // Copy operations are declared private
+    counter_t( const counter_t & );
+    counter_t &operator=( const counter_t & );
+    volatile int32_atomic count;
 };
 
 
@@ -332,4 +510,3 @@ struct counter_t {
 
 
 #endif
-
diff --git a/threadpool/atomic_list.h b/threadpool/atomic_list.h
new file mode 100644
index 00000000..d3c73f2e
--- /dev/null
+++ b/threadpool/atomic_list.h
@@ -0,0 +1,185 @@
+#ifndef included_AtomicModelAtomicList
+#define included_AtomicModelAtomicList
+
+#include <functional>
+#include <csignal>
+#include <atomic>
+
+#include "threadpool/atomic_helpers.h"
+
+
+
+/** \class AtomicList
+ *
+ * \brief Maintain a sorted list of entries
+ * \details This class implements a basic sorted list that is thread-safe; entries are
+ *    guarded with atomic operations.  They are stored smallest to largest according to the compare operator
+ */
+template< class TYPE, int MAX_SIZE, class COMPARE = std::less<TYPE> >
+class AtomicList final
+{
+public:
+    //! Construct an empty list; default_value is the item returned when nothing is removed
+    AtomicList( const TYPE& default_value=TYPE(), const COMPARE& comp=COMPARE() );
+
+    /*!
+     * \brief   Remove an item from the list
+     * \details Find and remove first entry that meets the given criteria
+     * @return          Return the item that matches the criteria, or the default item if no item matches
+     * @param compare   Predicate which returns true if the given value meets the
+     *                  selection criteria.  The signature should be equivalent to:
+     *                      bool cmp( const TYPE& value, ... );
+     * @param args      Additional arguments handed to compare after the value,
+     *                  i.e. the predicate is called as compare( value, args... )
+     */
+    template<class Compare, class ... Args>
+    inline TYPE remove( Compare compare, Args... args );
+
+    //! Remove the first item from the list (the default item is returned if the list is empty)
+    inline TYPE remove_first( );
+
+    /*!
+     * \brief   Insert an item
+     * \details Insert an item into the list.  The position is chosen with the
+     *          COMPARE object given to the constructor: the item is stored in
+     *          front of the first entry y for which compare( x, y ) is true,
+     *          or at the end of the list if there is no such entry.
+     *          The signature of the comparison function should be equivalent to:
+     *              bool cmp(const TYPE &a, const TYPE &b);
+     * @param x         Item to insert
+     */
+    inline void insert( TYPE x );
+
+    /*!
+     * \brief   Return the size of the list
+     * \details Return the number of items in the list
+     */
+    inline int size( ) const { return AtomicOperations::atomic_get(&d_N); }
+
+    /*!
+     * \brief   Check if the list is empty
+     * \details Return true if the list is empty
+     */
+    inline bool empty( ) const { return AtomicOperations::atomic_get(&d_N)==0; }
+
+    /*!
+     * \brief   Return the capacity of the list
+     * \details Return the maximum number of items the list can hold
+     */
+    inline int capacity( ) const { return MAX_SIZE; }
+
+    /*!
+     * \brief   Check the list
+     * \details Perform a series of checks to verify the list is in a stable state.
+     *    Note: This function is only partially thread-safe: it will block all other
+     *    operations on the list, but check may fail if we caught a thread modifying the list.
+     *    It is intended for debugging purposes only!
+     * @return          This function returns true if the list is in a good working state
+     */
+    inline bool check( );
+
+
+    //! Return the total number of inserts since object creation
+    inline int64_t N_insert() const { return AtomicOperations::atomic_get(&d_N_insert); }
+
+
+    //! Return the total number of removals since object creation
+    inline int64_t N_remove() const { return AtomicOperations::atomic_get(&d_N_remove); }
+
+private:
+    // Data members (d_next[0] is the head entry; entry i>0 belongs to d_objects[i-1])
+    COMPARE d_compare;
+    volatile TYPE d_default;
+    volatile TYPE d_objects[MAX_SIZE];
+    volatile AtomicOperations::int32_atomic d_N;
+    volatile AtomicOperations::int32_atomic d_next[MAX_SIZE+1];
+    volatile AtomicOperations::int32_atomic d_unused;
+    volatile AtomicOperations::int64_atomic d_N_insert;
+    volatile AtomicOperations::int64_atomic d_N_remove;
+
+private:
+    inline int lock( int i )    // take the value of d_next[i], leaving 0 behind; spins while it reads 0
+    {
+        if ( i == -1 )
+            return -1;
+        int tmp = 0;
+        while ( tmp == 0 )
+            tmp = AtomicOperations::atomic_fetch_and_and( &d_next[i], 0 );
+        return tmp;
+    }
+    inline void unlock( int i, int value )  // "or" value into d_next[i] (which holds 0 after lock(i)); no-op for i == -1
+    {
+        if ( i != -1 )
+            AtomicOperations::atomic_fetch_and_or( &d_next[i], value );
+    }
+    inline int get_unused( )    // take the first index off the unused chain that starts at d_unused
+    {
+        int i = 0;
+        while ( i == 0 )
+            i = AtomicOperations::atomic_fetch_and_and( &d_unused, 0 );
+        AtomicOperations::atomic_fetch_and_or( &d_unused, -(d_next[i]+4)+1 );
+        d_next[i] = -3;
+        return i;
+    }
+    inline void put_unused( int i )     // push index i onto the front of the unused chain
+    {
+        int j = 0;
+        while ( j == 0 )
+            AtomicOperations::atomic_swap( &d_unused, &j );
+        d_next[i] = -3-j;
+        AtomicOperations::atomic_fetch_and_or( &d_unused, i );
+    }
+
+
+private:
+    AtomicList( const AtomicList& );
+    AtomicList& operator=( const AtomicList& );
+};
+
+
+/** \class MemoryPool
+ *
+ * \brief Pool allocator
+ * \details This class implements a basic fast pool allocator that is thread-safe.
+ */
+template< class TYPE, class INT_TYPE=int >
+class MemoryPool final
+{
+public:
+    //! Construct a pool with storage for size objects of TYPE
+    explicit MemoryPool( size_t size );
+
+    //! destructor
+    ~MemoryPool( );
+
+    /*!
+     * \brief   Allocate an object
+     * \details Allocates a new object from the pool
+     * @return          Return the new pointer, or nullptr if there is no more room in the pool
+     */
+    inline TYPE* allocate( );
+
+    /*!
+     * \brief   Free an object
+     * \details Destroy the object and hand its storage back to the pool
+     * @param ptr       The pointer to free
+     */
+    inline void free( TYPE* ptr );
+
+private:
+    // Data members
+    volatile TYPE *d_objects;
+    volatile AtomicOperations::int32_atomic d_next;
+
+private:
+    MemoryPool( const MemoryPool& );
+    MemoryPool& operator=( const MemoryPool& );
+};
+
+
+
+
+#include "threadpool/atomic_list.hpp"
+
+#endif
+
diff --git a/threadpool/atomic_list.hpp b/threadpool/atomic_list.hpp
new file mode 100644
index 00000000..877d953f
--- /dev/null
+++ b/threadpool/atomic_list.hpp
@@ -0,0 +1,242 @@
+#ifndef included_AtomicList_hpp
+#define included_AtomicList_hpp
+
+
+#include <stdexcept>
+#include <iostream>
+#include <thread>
+
+
+
+/******************************************************************
+* Constructor                                                     *
+******************************************************************/
+template<class TYPE,int MAX_SIZE,class COMPARE>
+AtomicList<TYPE,MAX_SIZE,COMPARE>::AtomicList( const TYPE& default_value, const COMPARE& comp )
+    : d_compare(comp), d_default(default_value)
+{
+    d_N = 0;
+    d_next[0] = -1;     // head entry holds the end-of-list marker: the list starts empty
+    d_unused = 1;       // the chain of unused entries starts at index 1
+    d_N_insert = 0;
+    d_N_remove = 0;
+    // Chain entries 1..MAX_SIZE as unused and reset the object stored for each of them
+    for (int slot=1; slot<=MAX_SIZE; slot++) {
+        d_next[slot] = -4-slot;
+        d_objects[slot-1] = d_default;
+    }
+}
+
+
+/******************************************************************
+* Remove an item                                                  *
+******************************************************************/
+template<class TYPE,int MAX_SIZE,class COMPARE>
+template<class Compare, class ... Args>
+inline TYPE AtomicList<TYPE,MAX_SIZE,COMPARE>::remove( Compare compare, Args... args )
+{
+    // Acquire temporary ownership of the head entry; next is the index it pointed to
+    int pos = 0;
+    auto next = lock( 0 );
+    while ( true ) {
+        if ( next == -1 ) {
+            // We have no more entries to search: restore the end marker and report "not found"
+            unlock( pos, -1 );
+            pos = -1;
+            break;
+        }
+        if ( next < 0 )
+            throw std::logic_error( "Internal error" );
+        // Acquire ownership of the next item
+        int next2 = lock( next );
+        // Test to see if the object passes compare
+        bool test = compare( const_cast<TYPE&>(d_objects[next-1]), args... );
+        if ( test ) {
+            // We want to return this object: make pos point past it and take the entry out of the chain
+            unlock( next, -3 );
+            unlock( pos, next2 );
+            pos = next;
+            break;
+        }
+        // Release the ownership of pos and move on (next stays owned and becomes the new pos)
+        unlock( pos, next );
+        pos = next;
+        next = next2;
+    }
+    TYPE rtn(d_default);    // returned unchanged when no entry matched
+    if ( pos != -1 ) {
+        std::swap( rtn, const_cast<TYPE&>( d_objects[pos-1] ) );   // the slot is left holding the default value
+        put_unused( pos );
+        AtomicOperations::atomic_decrement( &d_N );
+        AtomicOperations::atomic_increment( &d_N_remove );
+    }
+    return rtn;
+}
+template<class TYPE,int MAX_SIZE,class COMPARE>
+inline TYPE AtomicList<TYPE,MAX_SIZE,COMPARE>::remove_first( )
+{
+    TYPE result(d_default);             // handed back unchanged when the list is empty
+    const auto first = lock( 0 );       // own the head entry; first is the index it pointed to
+    if ( first == -1 ) {
+        unlock( 0, first );             // nothing stored: put the end marker back
+    } else {
+        const int after = lock( first );
+        unlock( first, -3 );
+        unlock( 0, after );             // the head now points past the removed entry
+        std::swap( result, const_cast<TYPE&>( d_objects[first-1] ) );
+        put_unused( first );
+        AtomicOperations::atomic_decrement( &d_N );
+        AtomicOperations::atomic_increment( &d_N_remove );
+    }
+    return result;
+}
+
+
+/******************************************************************
+* Insert an item                                                  *
+******************************************************************/
+template<class TYPE,int MAX_SIZE,class COMPARE>
+inline void AtomicList<TYPE,MAX_SIZE,COMPARE>::insert( TYPE x )
+{
+    int N_used = AtomicOperations::atomic_increment( &d_N );   // reserve room before touching the list
+    if ( N_used > MAX_SIZE ) {
+        AtomicOperations::atomic_decrement( &d_N );
+        throw std::logic_error( "No room in list" );
+    }
+    // Get an index to store the entry
+    auto index = get_unused();
+    if ( index<1 )
+        throw std::logic_error( "Internal error" );
+    // Store the object in d_objects; until it is linked in, the new entry points to the end marker
+    AtomicOperations::atomic_increment( &d_N_insert );
+    d_objects[index-1] = x;
+    d_next[index] = -1;
+    // Find the position to store and update the next entries
+    int pos = 0;
+    auto next = lock( pos );
+    while ( true ) {
+        // next is the index that followed pos (pos is owned by this thread)
+        if ( next == -1 ) {
+            // We have no more entries to search, store here
+            unlock( pos, index );
+            break;
+        }
+        // Test to see if the object is < the value being compared
+        bool test = d_compare.operator()( x, const_cast<TYPE&>(d_objects[next-1]) );
+        if ( test ) {
+            // We want to store this object before next
+            d_next[index] = next;
+            unlock( pos, index );
+            break;
+        }
+        // Take ownership of the next entry, then release the previous one and move on
+        int last = pos;
+        pos = next;
+        next = lock( next );
+        unlock( last, pos );
+    }
+}
+
+
+/******************************************************************
+* Check the internal structures of the list                       *
+* This is mostly thread-safe, but blocks all threads              *
+******************************************************************/
+template<class TYPE,int MAX_SIZE,class COMPARE>
+inline bool AtomicList<TYPE,MAX_SIZE,COMPARE>::check( )
+{
+    // Take ownership of the head entry, then pause for 100 microseconds before inspecting the list
+    auto start = lock( 0 );
+    std::this_thread::sleep_for( std::chrono::microseconds(100) );
+    // Count the stored objects (N1) and classify every entry of d_next
+    bool pass = true;
+    int N1 = 0;
+    int N2 = 0;
+    int N_unused = 0;
+    int N_tail = 0;
+    for (int i=0; i<MAX_SIZE; i++) {
+        if ( d_objects[i] != d_default )
+            N1++;
+    }
+    for (int i=0; i<MAX_SIZE+1; i++) {
+        int next = i==0 ? start:d_next[i];  // the head entry is held by us, so use the value taken from it
+        if ( next > 0 ) {
+            N2++;           // points to another entry
+        } else if ( next < -3 ) {
+            N_unused++;     // part of the unused chain
+        } else if ( next == -1 ) {
+            N_tail++;       // end-of-list marker
+        } else {
+            pass = false;   // 0, -2 or -3: not expected in a list at rest
+        }
+    }
+    pass = pass && N_tail==1 && N1==d_N && N2==d_N && N_unused+d_N==MAX_SIZE;
+    int it = 0;
+    int pos = 0;
+    while ( true ) {    // walk the chain from the head and count the entries
+        int next = pos==0 ? start:d_next[pos];
+        if ( next == -1 )
+            break;
+        pos = next;
+        it++;
+    }
+    pass = pass && it==d_N;
+    // Unlock the list and return the results
+    unlock( 0, start );
+    return pass;
+}
+
+
+
+/******************************************************************
+* MemoryPool                                                      *
+******************************************************************/
+template<class TYPE,class INT_TYPE>
+MemoryPool<TYPE,INT_TYPE>::MemoryPool( size_t size )
+{
+    static_assert( sizeof(TYPE) >= sizeof(int),
+        "sizeof(TYPE) must be >= sizeof(int) to ensure proper operation" );
+    static_assert( sizeof(TYPE) >= sizeof(INT_TYPE),
+        "sizeof(TYPE) must be >= sizeof(INT_TYPE) to ensure proper operation" );
+    d_objects = reinterpret_cast<TYPE*>( malloc(sizeof(TYPE)*size) );
+    d_next = ( size > 0 ) ? 1 : -1;     // 1-based index of the first free slot; -1 (what allocate() tests for) if there is none
+    // Each free slot stores the 0-based index of the next free slot; the last one stores -1.
+    // Writing the terminator inside the loop avoids touching d_objects[size-1] when size == 0.
+    for (size_t i=0; i<size; i++)
+        reinterpret_cast<volatile INT_TYPE&>(d_objects[i]) = ( i+1 < size ) ? INT_TYPE(i+1) : INT_TYPE(-1);
+}
+template<class TYPE,class INT_TYPE>
+MemoryPool<TYPE,INT_TYPE>::~MemoryPool()
+{
+    ::free( const_cast<TYPE*>( d_objects ) ); // global free releases the malloc'd buffer; a bare free() would resolve to MemoryPool::free
+    d_objects = nullptr;
+}
+template<class TYPE,class INT_TYPE>
+inline TYPE* MemoryPool<TYPE,INT_TYPE>::allocate()
+{
+    AtomicOperations::int32_atomic i = 0;
+    while ( i == 0 )    // swap 0 into d_next until a non-zero head index comes back (0 means another thread holds it)
+        AtomicOperations::atomic_swap( &d_next, &i );
+    TYPE *ptr = nullptr;
+    if ( i!=-1 ) {      // -1: no free slot, nullptr is returned
+        INT_TYPE j = reinterpret_cast<volatile INT_TYPE&>(d_objects[i-1]);  // 0-based index of the next free slot
+        ptr = const_cast<TYPE*>( &d_objects[i-1] );
+        new(ptr) TYPE();
+        i = ( j+1 > 0 ) ? j+1 : -1; // last slot taken: publish -1, since publishing 0 would leave d_next looking held forever
+    }
+    AtomicOperations::atomic_fetch_and_or( &d_next, i );    // d_next holds 0 here, so this stores i
+    return ptr;
+}
+template<class TYPE,class INT_TYPE>
+inline void MemoryPool<TYPE,INT_TYPE>::free( TYPE* ptr )
+{
+    ptr->~TYPE();
+    // Swap 0 into d_next until a non-zero head index comes back
+    AtomicOperations::int32_atomic head = 0;
+    do { AtomicOperations::atomic_swap( &d_next, &head ); } while ( head == 0 );
+    // Link the released slot in front of the old head and publish its 1-based index
+    reinterpret_cast<INT_TYPE&>(*ptr) = head-1;
+    AtomicOperations::atomic_fetch_and_or( &d_next, static_cast<AtomicOperations::int32_atomic>( ptr - d_objects + 1 ) );
+}
+
+
+#endif
diff --git a/threadpool/test/CMakeLists.txt b/threadpool/test/CMakeLists.txt
index 6b1462f0..9074933e 100644
--- a/threadpool/test/CMakeLists.txt
+++ b/threadpool/test/CMakeLists.txt
@@ -2,6 +2,7 @@ include ( macros )
 
 # Add thread pool tests
 ADD_LBPM_TEST( test_atomic )
+ADD_LBPM_TEST( test_atomic_list )
 SET_TESTS_PROPERTIES ( test_atomic PROPERTIES FAIL_REGULAR_EXPRESSION ".*FAILED.*" PROCESSORS 64 )
 ADD_LBPM_TEST_THREAD_MPI( test_thread_pool 1 4 )
 ADD_LBPM_TEST_THREAD_MPI( test_thread_pool 2 4 )
diff --git a/threadpool/test/test_atomic.cpp b/threadpool/test/test_atomic.cpp
index f687de27..c3e0c5b0 100644
--- a/threadpool/test/test_atomic.cpp
+++ b/threadpool/test/test_atomic.cpp
@@ -1,66 +1,25 @@
-#include <stdlib.h>
-#include <stdio.h>
+#include "threadpool/atomic_helpers.h"
+#include "common/UnitTest.h"
+#include "common/Utilities.h"
 #include <iostream>
+#include <stdio.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>
-#include "threadpool/atomic_helpers.h"
-#include "common/Utilities.h"
-#include "common/UnitTest.h"
+#include <thread>
+#include <chrono>
+#include <functional>
+#include <atomic>
+
 
 #define perr std::cerr
 #define pout std::cout
 #define printp printf
 
 
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-    // Using windows
-    #define USE_WINDOWS
-    #define NOMINMAX
-    #include <stdlib.h>
-    #include <windows.h>
-    #include <process.h>
-#elif defined(__APPLE__)
-    // Using MAC
-    #define USE_MAC
-    #include <unistd.h>
-    #include <mach/mach_init.h>
-    #include <mach/thread_policy.h>
-#elif defined(__linux) || defined(__unix) || defined(__posix)
-    // Using Linux
-    #define USE_LINUX
-    #include <pthread.h>
-    #include <unistd.h>
-#else
-    #error Unknown OS
-#endif
-
-
-#ifdef USE_WINDOWS
-    #include <windows.h>
-    #define TIME_TYPE LARGE_INTEGER
-    #define get_time(x) QueryPerformanceCounter(x)
-    #define get_diff(start,end,f) (((double)(end.QuadPart-start.QuadPart))/((double)f.QuadPart))
-    #define get_frequency(f) QueryPerformanceFrequency(f)
-    #define sleep(x) Sleep(x*1000)
-#elif defined(USE_LINUX) || defined(USE_MAC)
-    #include <sys/time.h>
-    #define TIME_TYPE timeval
-    #define get_time(x) gettimeofday(x,NULL);
-    #define get_diff(start,end,f) (((double)end.tv_sec-start.tv_sec)+1e-6*((double)end.tv_usec-start.tv_usec))
-    #define get_frequency(f) (*f=timeval())
-#else
-    #error Unknown OS
-#endif
-
-
 // Function to increment/decrement a counter N times
-struct counter_data {
-    AtomicOperations::counter_t *counter;
-    int N;
-};
-void modify_counter( counter_data *data ) {
-    int N = data->N;
-    AtomicOperations::counter_t &counter = *(data->counter);
+static void modify_counter( int N, AtomicOperations::counter_t &counter )
+{
     if ( N > 0 ) {
         for (int i=0; i<N; i++)
             counter.increment();
@@ -71,78 +30,35 @@ void modify_counter( counter_data *data ) {
 }
 
 
-// Define the thread handle type
-#ifdef USE_WINDOWS
-    typedef HANDLE thread_handle;
-#elif defined(USE_LINUX) || defined(USE_MAC)
-    typedef pthread_t* thread_handle;
-#else
-    #error Unknown OS
-#endif
-
-// Create a thread
-#ifdef USE_WINDOWS
-    static thread_handle create_thread( void (*routine)(void*), void* data ) {
-        return (HANDLE)_beginthread( routine, 0, data);
-    }
-#elif defined(USE_LINUX) || defined(USE_MAC)
-    static thread_handle create_thread( void (*routine)(void*), void* data ) {
-        pthread_t *id = new pthread_t;
-        pthread_create( id, NULL, (void*(*)(void*)) routine, data );
-        return id;
-    }
-#else
-    #error Unknown OS
-#endif
-
-// Destroy a thread
-#ifdef USE_WINDOWS
-    static void destroy_thread( thread_handle id ) {
-        WaitForMultipleObjects( 1, &id, 1, 10000 );
-    }
-#elif defined(USE_LINUX) || defined(USE_MAC)
-    static void destroy_thread( thread_handle id ) {
-        pthread_join(*id,NULL);
-        delete id;
-    }
-#else
-    #error Unknown OS
-#endif
-
-
 /******************************************************************
 * The main program                                                *
 ******************************************************************/
 #ifdef USE_WINDOWS
-    int __cdecl main(int, char **) {
-#elif defined(USE_LINUX) || defined(USE_MAC)
-    int main(int, char*[]) {
+int __cdecl main( int, char ** )
+{
+#elif defined( USE_LINUX ) || defined( USE_MAC )
+int main( int, char *[] )
+{
 #else
-    #error Unknown OS
+#error Unknown OS
 #endif
     UnitTest ut;
 
-    int N_threads = 64;     // Number of threads
-    int N_count = 1000000;  // Number of work items
+    int N_threads = 64;      // Number of threads
+    int N_count   = 1000000; // Number of work items
 
-    TIME_TYPE start, end, f;
-    get_frequency(&f);
-
-    // Ensure we are using all processors
-    #ifdef __USE_GNU
-        int N_procs = sysconf( _SC_NPROCESSORS_ONLN );
-        cpu_set_t mask;
-        CPU_ZERO(&mask);
-        for (int i=0; i<N_procs; i++)
-            CPU_SET(i,&mask);
-        sched_setaffinity(getpid(), sizeof(cpu_set_t), &mask );
-    #endif
+// Ensure we are using all processors
+#ifdef __USE_GNU
+    int N_procs = sysconf( _SC_NPROCESSORS_ONLN );
+    cpu_set_t mask;
+    CPU_ZERO( &mask );
+    for ( int i = 0; i < N_procs; i++ )
+        CPU_SET( i, &mask );
+    sched_setaffinity( getpid(), sizeof( cpu_set_t ), &mask );
+#endif
 
     // Create the counter we want to test
     AtomicOperations::counter_t count;
-    counter_data data;
-    data.counter = &count;
-    data.N = 0;
     if ( count.increment() == 1 )
         ut.passes("increment count");
     else
@@ -159,88 +75,78 @@ void modify_counter( counter_data *data ) {
     count.setCount(0);
 
     // Increment the counter in serial
-    data.N = N_count;
-    get_time(&start);
-    modify_counter( &data );
-    get_time(&end);
-    double time_inc_serial = get_diff(start,end,f)/N_count;
-    int val = count.getCount();
+    auto start = std::chrono::high_resolution_clock::now();
+    modify_counter( N_count, count );
+    auto stop = std::chrono::high_resolution_clock::now();
+    double time_inc_serial = std::chrono::duration<double>(stop-start).count() / N_count;
+    int val                = count.getCount();
     if ( val != N_count ) {
         char tmp[100];
-        sprintf(tmp,"Count of %i did not match expected count of %i",val,N_count);
-        ut.failure(tmp);
+        sprintf( tmp, "Count of %i did not match expected count of %i", val, N_count );
+        ut.failure( tmp );
     }
-    printp("Time to increment (serial) = %0.1f ns\n",1e9*time_inc_serial);
+    printp( "Time to increment (serial) = %0.1f ns\n", 1e9 * time_inc_serial );
 
     // Decrement the counter in serial
-    data.N = -N_count;
-    get_time(&start);
-    modify_counter( &data );
-    get_time(&end);
-    double time_dec_serial = get_diff(start,end,f)/N_count;
-    val = count.getCount();
+    start = std::chrono::high_resolution_clock::now();
+    modify_counter( -N_count, count );
+    stop = std::chrono::high_resolution_clock::now();
+    double time_dec_serial = std::chrono::duration<double>(stop-start).count() / N_count;
+    val                    = count.getCount();
     if ( val != 0 ) {
         char tmp[100];
-        sprintf(tmp,"Count of %i did not match expected count of %i",val,0);
-        ut.failure(tmp);
+        sprintf( tmp, "Count of %i did not match expected count of %i", val, 0 );
+        ut.failure( tmp );
     }
-    printp("Time to decrement (serial) = %0.1f ns\n",1e9*time_dec_serial);
+    printp( "Time to decrement (serial) = %0.1f ns\n", 1e9 * time_dec_serial );
 
     // Increment the counter in parallel
-    data.N = N_count;
-    std::vector<thread_handle> thread_ids(N_threads);
-    get_time(&start);
-    for (int i=0; i<N_threads; i++) {
-        thread_ids[i] = create_thread( (void (*)(void*)) modify_counter, (void*) &data );
-    }
-    for (int i=0; i<N_threads; i++) {
-        destroy_thread( thread_ids[i] );
-    }
-    get_time(&end);
-    double time_inc_parallel = get_diff(start,end,f)/(N_count*N_threads);
-    val = count.getCount();
-    if ( val != N_count*N_threads ) {
+    std::vector<std::thread> threads( N_threads );
+    start = std::chrono::high_resolution_clock::now();
+    for ( int i = 0; i < N_threads; i++ )
+        threads[i] = std::thread( modify_counter, N_count, std::ref(count) );
+    for ( int i = 0; i < N_threads; i++ )
+        threads[i].join();
+    stop = std::chrono::high_resolution_clock::now();
+    double time_inc_parallel = std::chrono::duration<double>(stop-start).count() / ( N_count * N_threads );
+    val                      = count.getCount();
+    if ( val != N_count * N_threads ) {
         char tmp[100];
-        sprintf(tmp,"Count of %i did not match expected count of %i",val,N_count*N_threads);
-        ut.failure(tmp);
+        sprintf( tmp, "Count of %i did not match expected count of %i", val, N_count * N_threads );
+        ut.failure( tmp );
     }
-    printp("Time to increment (parallel) = %0.1f ns\n",1e9*time_inc_parallel);
+    printp( "Time to increment (parallel) = %0.1f ns\n", 1e9 * time_inc_parallel );
 
     // Decrement the counter in parallel
-    data.N = -N_count;
-    get_time(&start);
-    for (int i=0; i<N_threads; i++) {
-        thread_ids[i] = create_thread( (void (*)(void*)) modify_counter, (void*) &data );
-    }
-    for (int i=0; i<N_threads; i++) {
-        destroy_thread( thread_ids[i] );
-    }
-    get_time(&end);
-    double time_dec_parallel = get_diff(start,end,f)/(N_count*N_threads);
-    val = count.getCount();
+    start = std::chrono::high_resolution_clock::now();
+    for ( int i = 0; i < N_threads; i++ )
+        threads[i] = std::thread( modify_counter, -N_count, std::ref(count) );
+    for ( int i = 0; i < N_threads; i++ )
+        threads[i].join();
+    stop = std::chrono::high_resolution_clock::now();
+    double time_dec_parallel = std::chrono::duration<double>(stop-start).count() / ( N_count * N_threads );
+    val                      = count.getCount();
     if ( val != 0 ) {
         char tmp[100];
-        sprintf(tmp,"Count of %i did not match expected count of %i",val,0);
-        ut.failure(tmp);
+        sprintf( tmp, "Count of %i did not match expected count of %i", val, 0 );
+        ut.failure( tmp );
     }
-    printp("Time to decrement (parallel) = %0.1f ns\n",1e9*time_dec_parallel);
+    printp( "Time to decrement (parallel) = %0.1f ns\n", 1e9 * time_dec_parallel );
 
     // Check the time to increment/decrement
-    if ( time_inc_serial>100e-9 || time_dec_serial>100e-9 || time_inc_parallel>100e-9 || time_dec_serial>100e-9 ) {
-        #if USE_GCOV
-            ut.expected_failure("Time to increment/decrement count is too expensive");
-        #else
-            ut.failure("Time to increment/decrement count is too expensive");
-        #endif
+    if ( time_inc_serial > 100e-9 || time_dec_serial > 100e-9 || time_inc_parallel > 100e-9 ||
+         time_dec_parallel > 100e-9 ) { // each of the four timings must stay under 100 ns per operation
+#if USE_GCOV
+        ut.expected_failure( "Time to increment/decrement count is too expensive" );
+#else
+        ut.failure( "Time to increment/decrement count is too expensive" );
+#endif
     } else {
-        ut.passes("Time to increment/decrement passed");
+        ut.passes( "Time to increment/decrement passed" );
     }
 
     // Finished
     ut.report();
-    int N_errors = ut.NumFailGlobal();
+    int N_errors = static_cast<int>( ut.NumFailGlobal() );
     return N_errors;
 }
-
-
-
diff --git a/threadpool/test/test_atomic_list.cpp b/threadpool/test/test_atomic_list.cpp
new file mode 100644
index 00000000..7d4aee16
--- /dev/null
+++ b/threadpool/test/test_atomic_list.cpp
@@ -0,0 +1,210 @@
+#include "threadpool/atomic_list.h"
+#include "common/UnitTest.h"
+#include "common/Utilities.h"
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <vector>
+#include <thread>
+#include <chrono>
+#include <functional>
+#include <atomic>
+#include <algorithm>
+
+
+// Stress test: each pass pulls up to five entries off the list and puts back every one that was actually removed
+static void modify_list( AtomicList<int,1024>& list )
+{
+    const int N_count = 50000;
+    for (int i=0; i<N_count; i++) { // NOTE(review): rand() below runs concurrently when main starts several threads on this function - confirm that is safe
+        auto v1 = list.remove_first( );
+        auto v2 = list.remove( [](int) { return true; } );            // predicate accepts any entry
+        auto v3 = list.remove( [](int v) { return v>=(rand()/8); } ); // an entry at or above a random threshold
+        auto v4 = list.remove( [](int v) { return v>=(rand()/4); } );
+        auto v5 = list.remove( [](int v) { return v>=(rand()/2); } );
+        if ( v1 !=-1 ) { list.insert( v1 ); } // -1: nothing was removed (main constructs the list with -1; presumably its "no entry" value)
+        if ( v2 !=-1 ) { list.insert( v2 ); }
+        if ( v3 !=-1 ) { list.insert( v3 ); }
+        if ( v4 !=-1 ) { list.insert( v4 ); }
+        if ( v5 !=-1 ) { list.insert( v5 ); }
+    }
+}
+
+// Check that the list passes its self-check and yields exactly the values of x when drained; the list is refilled with x before returning
+static bool check_list( const std::vector<int>& x, AtomicList<int,1024>& list )
+{
+    bool pass = list.check();
+    pass = pass && (int) x.size() == list.size();
+    if ( pass ) {
+        for (size_t i=0; i<x.size(); i++)
+            pass = pass && x[i] == list.remove( [](int) { return true; } ); // stops removing after the first mismatch
+    }
+    // Restore the list: drain whatever is left (size() shrinks on every removal, so loop on empty()), then re-insert x
+    while ( !list.empty() )
+        list.remove_first();
+    for (size_t i=0; i<x.size(); i++)
+        list.insert( x[i] );
+    return pass;
+}
+
+// Remove every entry from the list
+static inline void clear_list(AtomicList<int,1024>& list )
+{
+    while ( !list.empty() ) // size() shrinks on every removal, so counting up to it would stop halfway
+        list.remove_first();
+}
+
+
+
+/******************************************************************
+* The main program                                                *
+******************************************************************/
+int main( int, char *[] )
+{
+    UnitTest ut;
+
+    int N_threads = 8;      // Number of threads
+
+    // Create the list
+    AtomicList<int,1024> list(-1);
+    if ( list.size()==0 && list.check() )
+        ut.passes( "Initialize" );
+    else
+        ut.failure( "Initialize" );
+
+    // Initialize the list with some empty values
+    for (int i=0; i<80; i++)
+        list.insert( rand() );
+    list.insert( 2 );
+    list.insert( 1 );
+    list.insert( rand() );
+
+    // Try to pull off a couple of values
+    int v1 = list.remove( [](int a) { return a==1; } );    // Find the entry with 1
+    int v2 = list.remove( [](int) { return true; } );      // Get the first entry
+    int v3 = list.remove( [](int) { return false; } );     // Fail to get an entry
+    if ( v1==1 && v2==2 && v3==-1 && list.size()==81 && list.check() )
+        ut.passes( "Basic sanity test" );
+    else
+        ut.failure( "Basic sanity test" );
+
+    // Clear the list
+    while ( list.remove( [](int) { return true; } ) != -1 ) {}
+
+    // Create a list of known values
+    //std::vector<int> data0(512);
+    std::vector<int> data0(5*N_threads);
+    for (size_t i=0; i<data0.size(); i++)
+        data0[i] = rand();
+    auto data = data0;
+    std::sort( data.begin(), data.end() );
+
+    // Test the cost to insert
+    int N_it = 20;
+    for (int i=0; i<list.size(); i++)
+        list.remove( [](int) { return true; } );
+    std::chrono::duration<double> time;
+    std::chrono::time_point<std::chrono::high_resolution_clock> start, stop;
+    time = time.zero();
+    for (int it=0; it<N_it; it++ ) {
+        clear_list( list );
+        start = std::chrono::high_resolution_clock::now();
+        for (size_t i=0; i<data0.size(); i++)
+            list.insert( data0[i] );
+        stop = std::chrono::high_resolution_clock::now();
+        time += ( stop - start );
+    }
+    printf("insert time/item = %0.0f ns\n",1e9*time.count()/(N_it*data0.size()));
+
+    // Test the cost to remove (first)
+    time = time.zero();
+    for (int it=0; it<N_it; it++ ) {
+        check_list( data, list );
+        start = std::chrono::high_resolution_clock::now();
+        for (size_t i=0; i<data0.size(); i++)
+            list.remove_first( );
+        stop = std::chrono::high_resolution_clock::now();
+        time += ( stop - start );
+    }
+    printf("remove (first) time/item = %0.0f ns\n",1e9*time.count()/(N_it*data0.size()));
+
+    // Test the cost to remove (in order)
+    time = time.zero();
+    for (int it=0; it<N_it; it++ ) {
+        check_list( data, list );
+        start = std::chrono::high_resolution_clock::now();
+        for (size_t i=0; i<data0.size(); i++)
+            list.remove( [](int) { return true; } );
+        stop = std::chrono::high_resolution_clock::now();
+        time += ( stop - start );
+    }
+    printf("remove (ordered) time/item = %0.0f ns\n",1e9*time.count()/(N_it*data0.size()));
+
+    // Test the cost to remove (out order)
+    time = time.zero();
+    for (int it=0; it<N_it; it++ ) {
+        check_list( data, list );
+        start = std::chrono::high_resolution_clock::now();
+        for (size_t i=0; i<data0.size(); i++) {
+            int tmp = data0[i];
+            list.remove( [tmp](int v) { return v==tmp; } );
+        }
+        stop = std::chrono::high_resolution_clock::now();
+        time += ( stop - start );
+    }
+    printf("remove (unordered) time/item = %0.0f ns\n",1e9*time.count()/(N_it*data0.size()));
+
+    // Read/write to the list and check the results
+    int64_t N0 = list.N_remove();
+    check_list( data, list );
+    start = std::chrono::high_resolution_clock::now();
+    modify_list( list );
+    stop = std::chrono::high_resolution_clock::now();
+    double time_serial = std::chrono::duration<double>(stop-start).count();
+    int64_t N1 = list.N_remove();
+    bool pass = check_list( data, list );
+    if ( pass )
+        ut.passes( "Serial get/insert" );
+    else
+        ut.failure( "Serial get/insert" );
+    printf("serial time = %0.5f s\n",time_serial);
+    printf("serial time/item = %0.0f ns\n",1e9*time_serial/(N1-N0));
+
+    // Have multiple threads reading/writing to the list simultaneously
+    std::vector<std::thread> threads( N_threads );
+    start = std::chrono::high_resolution_clock::now();
+    for ( int i = 0; i < N_threads; i++ )
+        threads[i] = std::thread( modify_list, std::ref(list) );
+    for ( int i = 0; i < N_threads; i++ )
+        threads[i].join();
+    stop = std::chrono::high_resolution_clock::now();
+    double time_parallel = std::chrono::duration<double>(stop-start).count();
+    int64_t N2 = list.N_remove();
+    pass = check_list( data, list );
+    if ( pass )
+        ut.passes( "Parallel get/insert" );
+    else
+        ut.failure( "Parallel get/insert" );
+    printf("parallel time = %0.5f s\n",time_parallel);
+    printf("parallel time/item = %0.0f ns\n",1e9*time_parallel/(N2-N1));
+
+    // Try to over-fill the list
+    while ( !list.empty() )
+        list.remove_first();
+    for (int i=1; i<=list.capacity(); i++)
+        list.insert( i );
+    try {
+        list.insert( list.capacity()+1 );
+        ut.failure( "List overflow" );
+    } catch (const std::exception& e) {
+        ut.passes( "List overflow" );
+    } catch(...) {
+        ut.failure( "List overflow (unknown exception)" );
+    }
+
+    // Finished
+    ut.report();
+    int N_errors = static_cast<int>( ut.NumFailGlobal() );
+    return N_errors;
+}
diff --git a/threadpool/test/test_thread_pool.cpp b/threadpool/test/test_thread_pool.cpp
old mode 100755
new mode 100644
index ef68b0f4..1fd0ae63
--- a/threadpool/test/test_thread_pool.cpp
+++ b/threadpool/test/test_thread_pool.cpp
@@ -1,270 +1,256 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <iostream>
-#include <string>
-#include <vector>
-#include <algorithm>
-#include <stdexcept>
+#include "ProfilerApp.h"
+#ifdef USE_TIMER
+#include "MemoryApp.h"
+#endif
 #include "threadpool/thread_pool.h"
 #include "common/UnitTest.h"
 #include "common/Utilities.h"
-#include "ProfilerApp.h"
-#include "math.h"
+#include <math.h>
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <vector>
+#include <mutex>
+
+
+#define MAX( x, y ) ( ( x ) > ( y ) ? ( x ) : ( y ) )
 
 
 #define perr std::cerr
 #define pout std::cout
 #define printp printf
 
-#define MAX(x,y) ((x) > (y) ? (x) : (y))
-
-#ifdef USE_WINDOWS
-    #include <windows.h>
-    #define TIME_TYPE LARGE_INTEGER
-    #define get_time(x) QueryPerformanceCounter(x)
-    #define get_diff(start,end,f) (((double)(end.QuadPart-start.QuadPart))/((double)f.QuadPart))
-    #define get_frequency(f) QueryPerformanceFrequency(f)
-    #define sleep(x) Sleep(x*1000)
-    #define sleepMs(x) Sleep(x)
-#elif defined(USE_LINUX) || defined(USE_MAC)
-    #include <sys/time.h>
-    #define TIME_TYPE timeval
-    #define get_time(x) gettimeofday(x,NULL);
-    #define get_diff(start,end,f) (((double)end.tv_sec-start.tv_sec)+1e-6*((double)end.tv_usec-start.tv_usec))
-    #define get_frequency(f) (*f=timeval())
-    #define sleepMs(x) usleep(1000*x)
-#else
-    #error Unknown OS
-#endif
 
 #ifdef USE_MPI
-    #include "mpi.h"
+#include "mpi.h"
 #endif
 
+#define to_ns(x) std::chrono::duration_cast<std::chrono::nanoseconds>(x).count()
+#define to_ms(x) std::chrono::duration_cast<std::chrono::milliseconds>(x).count()
 
-// Wrapper function for mpi barrier
-static inline void barrier() {
-    #ifdef USE_MPI
-        MPI_Barrier( MPI_COMM_WORLD );
-    #endif
+
+// Wrapper functions for mpi
+static inline void barrier()
+{
+#ifdef USE_MPI
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+}
+static inline int getRank()
+{
+    int rank = 0;
+#ifdef USE_MPI
+    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+#endif
+    return rank;
+}
+static inline int getSize() // Number of MPI ranks; 1 when built without MPI
+{
+    int size = 1; // without MPI the program is a single process
+#ifdef USE_MPI
+    MPI_Comm_size( MPI_COMM_WORLD, &size );
+#endif
+    return size;
 }
 
+
 // Function to waste CPU cycles
-void waste_cpu(int N) {
-    if ( N > 10000 ) { PROFILE_START("waste_cpu",2); }
+void waste_cpu( int N )
+{
+    if ( N > 10000 ) {
+        PROFILE_START( "waste_cpu", 2 );
+    }
     double pi = 3.141592653589793;
-    double x = 1.0;
-    N = std::max(10,N);
-    { for (int i=0; i<N; i++) x = sqrt(x*exp(pi/x)); } // style to limit gcov hits
-    if ( fabs(x-2.926064057273157) > 1e-12 ) { abort(); }
-    if ( N > 10000 ) { PROFILE_STOP("waste_cpu",2); }
+    double x  = 1.0;
+    N         = std::max( 10, N );
+    {
+        for ( int i = 0; i < N; i++ )
+            x = sqrt( x * exp( pi / x ) );
+    } // style to limit gcov hits
+    if ( fabs( x - 2.926064057273157 ) > 1e-12 ) {
+        abort();
+    }
+    if ( N > 10000 ) {
+        PROFILE_STOP( "waste_cpu", 2 );
+    }
+}
+
+
+// Sleep for at least N milliseconds
+// Note: a single sleep may return early (e.g. when interrupted), so keep sleeping
+//   for the remaining time until N ms have elapsed on a monotonic clock
+void sleep_ms( int64_t N ) {
+    const auto t1 = std::chrono::steady_clock::now();
+    auto t2 = std::chrono::steady_clock::now();
+    while ( to_ms(t2-t1) < N ) {
+        int64_t N2 = N - to_ms(t2-t1); // time still to sleep, kept 64-bit like N
+        std::this_thread::sleep_for( std::chrono::milliseconds(N2) );
+        t2 = std::chrono::steady_clock::now();
+    }
+}
+void sleep_s( int N ) {
+    sleep_ms(1000*N);
 }
 
 
 // Function to sleep for N seconds then increment a global count
 static volatile int global_sleep_count = 0;
-void sleep_inc(int N) {
-    PROFILE_START("sleep_inc");
-    sleep(N);
+void sleep_inc( int N )
+{
+    PROFILE_START( "sleep_inc" );
+    sleep_s( N );
     ++global_sleep_count;
-    PROFILE_STOP("sleep_inc");
+    PROFILE_STOP( "sleep_inc" );
 }
-void sleep_inc2(double x) {
-    sleepMs(static_cast<int>(round(x*1000)));
+void sleep_inc2( double x )
+{
+    sleep_ms( static_cast<int>( round( x * 1000 ) ) );
     ++global_sleep_count;
 }
-void sleep_msg( double x, std::string msg ) {
-    PROFILE_START(msg);
-    sleepMs(static_cast<int>(round(x*1000)));
-    PROFILE_STOP(msg);
-}
-bool check_inc(int N) {
-    return global_sleep_count==N;
+void sleep_msg( double x, std::string msg )
+{
+    PROFILE_START( msg );
+    sleep_ms( static_cast<int>( round( x * 1000 ) ) );
+    NULL_USE( msg );
+    PROFILE_STOP( msg );
 }
+bool check_inc( int N ) { return global_sleep_count == N; }
 
 
 // Function to return the processor for the given thread
-void print_processor( ThreadPool* tpool ) 
+std::mutex print_processor_mutex;
+
+void print_processor( ThreadPool *tpool )
 {
     int rank = 0;
-    #ifdef USE_MPI
-        MPI_Comm_rank( MPI_COMM_WORLD, &rank );
-    #endif
-    int thread = tpool->getThreadNumber();
+#ifdef USE_MPI
+    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+#endif
+    int thread    = tpool->getThreadNumber();
     int processor = ThreadPool::getCurrentProcessor();
     char tmp[100];
-    sprintf(tmp,"%i:  Thread,proc = %i,%i\n",rank,thread,processor);
+    sprintf( tmp, "%i:  Thread,proc = %i,%i\n", rank, thread, processor );
+    sleep_ms( 10*rank );
+    print_processor_mutex.lock();
     std::cout << tmp;
-    sleepMs(100);
+    print_processor_mutex.unlock();
+    sleep_ms( 100 );
 }
 
 
 // Function to test how a member thread interacts with the thread pool
-int test_member_thread( ThreadPool *tpool ) {
+int test_member_thread( ThreadPool *tpool )
+{
     int N_errors = 0;
     // Member threads are not allowed to wait for the pool to finish
     try {
         tpool->wait_pool_finished();
         N_errors++;
-    } catch (...) {
+    } catch ( ... ) {
     }
     // Member threads are not allowed to change the size of the pool
     try {
         tpool->wait_pool_finished();
         N_errors++;
-    } catch (...) {
-    }
-    return N_errors;
-}
-
-
-// Function to test creating and locking a mutex
-int test_mutex(bool recursive) {
-    int N_errors = 0;
-    Mutex lock(recursive);      // Create a lock
-    Mutex lock2 = lock;         // Copy the lock
-    // Test getting and releasing the lock
-    lock.lock();
-    lock.unlock();
-    lock2.lock();
-    lock2.unlock();
-    bool own1 = lock.ownLock();
-    bool own2 = lock2.ownLock();
-    lock.lock();
-    bool own3 = lock.ownLock();
-    bool own4 = lock2.ownLock();
-    lock.unlock();
-    bool own5 = lock.ownLock();
-    if ( own1 || own2 || !own3 || !own4 || own5 )
-        return 1;
-    if ( recursive ) {
-        // Test the behavior of a recursive lock
-        lock.lock();
-        if ( !lock.tryLock() )
-            return 1;
-        lock.unlock();
-        lock.lock();
-        lock.unlock();
-    } else {
-        // Test the behavior of a non-recursive lock
-        lock.lock();
-        if ( lock.tryLock() )
-            return 1;
-        lock.unlock();
-        try {
-            lock.unlock();
-            N_errors++;
-        } catch (...) {
-        }
-        try {
-            lock.lock();
-            lock.lock();
-            N_errors++;
-        } catch (...) {
-            lock.unlock();
-        }
-        try {
-            lock.lock();
-            lock2.lock();
-            N_errors++;
-            lock.unlock();
-            lock2.unlock();
-        } catch (...) {
-            lock.unlock();
-        }
+    } catch ( ... ) { // NOTE(review): the try above repeats wait_pool_finished(); nothing here attempts to resize the pool - confirm intent
     }
     return N_errors;
 }
 
 
 // Functions to test the templates
-int myfun0() { return 0; }
-int myfun1(int) { return 1; }
-int myfun2(int,float) { return 2; }
-int myfun3(int,float,double) { return 3; }
-int myfun4(int,float,double,char) { return 4; }
-int myfun5(int,float,double,char,std::string) { return 5; }
-int myfun6(int,float,double,char,std::string,int) { return 6; }
-int myfun7(int,float,double,char,std::string,int,int) { return 7; }
+static int myfun0() { return 0; }
+static int myfun1( int ) { return 1; }
+static int myfun2( int, float ) { return 2; }
+static int myfun3( int, float, double ) { return 3; }
+static int myfun4( int, float, double, char ) { return 4; }
+static int myfun5( int, float, double, char, std::string ) { return 5; }
+static int myfun6( int, float, double, char, std::string, int ) { return 6; }
+static int myfun7( int, float, double, char, std::string, int, int ) { return 7; }
 
 
 // Function to test instantiation of functions with different number of arguments
-void vfunarg00() { }
-void vfunarg01(int) { }
-void vfunarg02(int,char) { }
-void vfunarg03(int,char,double) { }
-void vfunarg04(int,char,double,int) { }
-void vfunarg05(int,char,double,int,char) { }
-void vfunarg06(int,char,double,int,char,double) { }
-void vfunarg07(int,char,double,int,char,double,int) { }
-void vfunarg08(int,char,double,int,char,double,int,char) { }
-void vfunarg09(int,char,double,int,char,double,int,char,double) { }
-void vfunarg10(int,char,double,int,char,double,int,char,double,int) { }
-void vfunarg11(int,char,double,int,char,double,int,char,double,int,char) { }
-void vfunarg12(int,char,double,int,char,double,int,char,double,int,char,double) { }
-void vfunarg13(int,char,double,int,char,double,int,char,double,int,char,double,int) { }
-void vfunarg14(int,char,double,int,char,double,int,char,double,int,char,double,int,char) { }
-void vfunarg15(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double) { }
-void vfunarg16(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int) { }
-void vfunarg17(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char) { }
-void vfunarg18(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double) { }
-void vfunarg19(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int) { }
-void vfunarg20(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char) { }
-void vfunarg21(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double) { }
-void vfunarg22(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int) { }
-void vfunarg23(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char) { }
-void vfunarg24(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double) { }
-int funarg00() { return 0; }
-int funarg01(int) { return 1; }
-int funarg02(int,char) { return 2; }
-int funarg03(int,char,double) { return 3; }
-int funarg04(int,char,double,int) { return 4; }
-int funarg05(int,char,double,int,char) { return 5; }
-int funarg06(int,char,double,int,char,double) { return 6; }
-int funarg07(int,char,double,int,char,double,int) { return 7; }
-int funarg08(int,char,double,int,char,double,int,char) { return 8; }
-int funarg09(int,char,double,int,char,double,int,char,double) { return 9; }
-int funarg10(int,char,double,int,char,double,int,char,double,int) { return 10; }
-int funarg11(int,char,double,int,char,double,int,char,double,int,char) { return 11; }
-int funarg12(int,char,double,int,char,double,int,char,double,int,char,double) { return 12; }
-int funarg13(int,char,double,int,char,double,int,char,double,int,char,double,int) { return 13; }
-int funarg14(int,char,double,int,char,double,int,char,double,int,char,double,int,char) { return 14; }
-int funarg15(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double) { return 15; }
-int funarg16(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int) { return 16; }
-int funarg17(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char) { return 17; }
-int funarg18(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double) { return 18; }
-int funarg19(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int) { return 19; }
-int funarg20(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char) { return 20; }
-int funarg21(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double) { return 21; }
-int funarg22(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int) { return 22; }
-int funarg23(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char) { return 23; }
-int funarg24(int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double,int,char,double) { return 24; }
-int test_function_arguements( ThreadPool* tpool ) {
+// clang-format off
+static void vfunarg00() {} // vfunargNN: no-op helpers taking NN unnamed arguments (int, char, double repeating)
+static void vfunarg01( int ) {}
+static void vfunarg02( int, char ) {}
+static void vfunarg03( int, char, double ) {}
+static void vfunarg04( int, char, double, int ) {}
+static void vfunarg05( int, char, double, int, char ) {}
+static void vfunarg06( int, char, double, int, char, double ) {}
+static void vfunarg07( int, char, double, int, char, double, int ) {}
+static void vfunarg08( int, char, double, int, char, double, int, char ) {}
+static void vfunarg09( int, char, double, int, char, double, int, char, double ) {}
+static void vfunarg10( int, char, double, int, char, double, int, char, double, int ) {}
+static void vfunarg11( int, char, double, int, char, double, int, char, double, int, char ) {}
+static void vfunarg12( int, char, double, int, char, double, int, char, double, int, char, double ) {}
+static void vfunarg13( int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
+static void vfunarg14( int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
+static void vfunarg15( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
+static void vfunarg16( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
+static void vfunarg17( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
+static void vfunarg18( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
+static void vfunarg19( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
+static void vfunarg20( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
+static void vfunarg21( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
+static void vfunarg22( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
+static void vfunarg23( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
+static void vfunarg24( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
+static int funarg00() { return 0; } // funargNN: takes NN unnamed arguments (int, char, double repeating) and returns NN
+static int funarg01( int ) { return 1; }
+static int funarg02( int, char ) { return 2; }
+static int funarg03( int, char, double ) { return 3; }
+static int funarg04( int, char, double, int ) { return 4; }
+static int funarg05( int, char, double, int, char ) { return 5; }
+static int funarg06( int, char, double, int, char, double ) { return 6; }
+static int funarg07( int, char, double, int, char, double, int ) { return 7; }
+static int funarg08( int, char, double, int, char, double, int, char ) { return 8; }
+static int funarg09( int, char, double, int, char, double, int, char, double ) { return 9; }
+static int funarg10( int, char, double, int, char, double, int, char, double, int ) { return 10; }
+static int funarg11( int, char, double, int, char, double, int, char, double, int, char ) { return 11; }
+static int funarg12( int, char, double, int, char, double, int, char, double, int, char, double ) { return 12; }
+static int funarg13( int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 13; }
+static int funarg14( int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 14; }
+static int funarg15( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 15; }
+static int funarg16( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 16; }
+static int funarg17( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 17; }
+static int funarg18( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 18; }
+static int funarg19( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 19; }
+static int funarg20( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 20; }
+static int funarg21( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 21; }
+static int funarg22( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 22; }
+static int funarg23( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 23; }
+static int funarg24( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 24; }
+static int test_function_arguements( ThreadPool *tpool )
+{
     int N_errors = 0;
     // Test some basic types of instantiations
-    ThreadPool::thread_id_t id0 = TPOOL_ADD_WORK( tpool, myfun0, (NULL) );
+    ThreadPool::thread_id_t id0 = TPOOL_ADD_WORK( tpool, myfun0, ( nullptr ) );
     ThreadPool::thread_id_t id1 = TPOOL_ADD_WORK( tpool, myfun1, ( (int) 1 ) );
     ThreadPool::thread_id_t id2 = TPOOL_ADD_WORK( tpool, myfun2, ( (int) 1, (float) 2 ) );
     ThreadPool::thread_id_t id3 = TPOOL_ADD_WORK( tpool, myfun3, ( (int) 1, (float) 2, (double) 3 ) );
     ThreadPool::thread_id_t id4 = TPOOL_ADD_WORK( tpool, myfun4, ( (int) 1, (float) 2, (double) 3, (char) 4 ) );
-    ThreadPool::thread_id_t id5 = TPOOL_ADD_WORK( tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string("test") ) );
-    ThreadPool::thread_id_t id52= TPOOL_ADD_WORK( tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string("test") ), -1 );
-    ThreadPool::thread_id_t id6 = TPOOL_ADD_WORK( tpool, myfun6, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string("test"), (int) 1 ) );
-    ThreadPool::thread_id_t id7 = TPOOL_ADD_WORK( tpool, myfun7, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string("test"), (int) 1, (int) 1 ) );
+    ThreadPool::thread_id_t id5 = TPOOL_ADD_WORK( tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ) );
+    ThreadPool::thread_id_t id52= TPOOL_ADD_WORK( tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ), -1 );
+    ThreadPool::thread_id_t id6 = TPOOL_ADD_WORK( tpool, myfun6, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1 ) );
+    ThreadPool::thread_id_t id7 = TPOOL_ADD_WORK( tpool, myfun7, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1, (int) 1 ) );
     tpool->wait_pool_finished();
-    if ( !tpool->isFinished(id0) ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>(id0) != 0 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>(id1) != 1 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>(id2) != 2 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>(id3) != 3 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>(id4) != 4 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>(id5) != 5 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>(id52)!= 5 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>(id6) != 6 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>(id7) != 7 ) { N_errors++; }
+    if ( !tpool->isFinished( id0 ) ) { N_errors++; }
+    if ( tpool->getFunctionRet<int>( id0 ) != 0 ) { N_errors++; }
+    if ( tpool->getFunctionRet<int>( id1 ) != 1 ) { N_errors++; }
+    if ( tpool->getFunctionRet<int>( id2 ) != 2 ) { N_errors++; }
+    if ( tpool->getFunctionRet<int>( id3 ) != 3 ) { N_errors++; }
+    if ( tpool->getFunctionRet<int>( id4 ) != 4 ) { N_errors++; }
+    if ( tpool->getFunctionRet<int>( id5 ) != 5 ) { N_errors++; }
+    if ( tpool->getFunctionRet<int>( id52 ) != 5 ){ N_errors++; }
+    if ( tpool->getFunctionRet<int>( id6 ) != 6 ) { N_errors++; }
+    if ( tpool->getFunctionRet<int>( id7 ) != 7 ) { N_errors++; }
     // Test all the different numbers of arguments allowed
-    TPOOL_ADD_WORK( tpool, vfunarg00, (NULL) );
-    TPOOL_ADD_WORK( tpool, vfunarg01, ( 1) );
+    TPOOL_ADD_WORK( tpool, vfunarg00, ( nullptr ) );
+    TPOOL_ADD_WORK( tpool, vfunarg01, ( 1 ) );
     TPOOL_ADD_WORK( tpool, vfunarg02, ( 1, 'a' ) );
     TPOOL_ADD_WORK( tpool, vfunarg03, ( 1, 'a', 3.0 ) );
     TPOOL_ADD_WORK( tpool, vfunarg04, ( 1, 'a', 3.0, 4 ) );
@@ -288,15 +274,15 @@ int test_function_arguements( ThreadPool* tpool ) {
     TPOOL_ADD_WORK( tpool, vfunarg22, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22 ) );
     TPOOL_ADD_WORK( tpool, vfunarg23, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w' ) );
     TPOOL_ADD_WORK( tpool, vfunarg24, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w', 24.0 ) );
-    std::vector<ThreadPool::thread_id_t> ids(25);
-    ids[0]  = TPOOL_ADD_WORK( tpool, funarg00, (NULL) );
+    std::vector<ThreadPool::thread_id_t> ids( 25 );
+    ids[0]  = TPOOL_ADD_WORK( tpool, funarg00, ( nullptr ) );
     ids[1]  = TPOOL_ADD_WORK( tpool, funarg01, ( 1 ) );
     ids[2]  = TPOOL_ADD_WORK( tpool, funarg02, ( 1, 'a' ) );
     ids[3]  = TPOOL_ADD_WORK( tpool, funarg03, ( 1, 'a', 3.0 ) );
     ids[4]  = TPOOL_ADD_WORK( tpool, funarg04, ( 1, 'a', 3.0, 4 ) );
     ids[5]  = TPOOL_ADD_WORK( tpool, funarg05, ( 1, 'a', 3.0, 4, 'e' ) );
     ids[6]  = TPOOL_ADD_WORK( tpool, funarg06, ( 1, 'a', 3.0, 4, 'e', 6.0 ) );
-    ids[7]  = TPOOL_ADD_WORK( tpool, funarg07, ( 1, 'a', 3.0, 4, 'e', 6.0, 7) );
+    ids[7]  = TPOOL_ADD_WORK( tpool, funarg07, ( 1, 'a', 3.0, 4, 'e', 6.0, 7 ) );
     ids[8]  = TPOOL_ADD_WORK( tpool, funarg08, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h' ) );
     ids[9]  = TPOOL_ADD_WORK( tpool, funarg09, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0 ) );
     ids[10] = TPOOL_ADD_WORK( tpool, funarg10, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10 ) );
@@ -315,70 +301,59 @@ int test_function_arguements( ThreadPool* tpool ) {
     ids[23] = TPOOL_ADD_WORK( tpool, funarg23, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w' ) );
     ids[24] = TPOOL_ADD_WORK( tpool, funarg24, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w', 24.0 ) );
     tpool->wait_all( ids );
-    for (size_t i=0; i<ids.size(); i++) {
-        if ( tpool->getFunctionRet<int>(ids[i]) != static_cast<int>(i) ) 
-            N_errors++; 
+    for ( size_t i = 0; i < ids.size(); i++ ) {
+        if ( tpool->getFunctionRet<int>( ids[i] ) != static_cast<int>( i ) )
+            N_errors++;
     }
     return N_errors;
 }
+// clang-format on
 
 
 /******************************************************************
 * Examples to derive a user work item                             *
 ******************************************************************/
-class UserWorkItemVoid: public ThreadPool::WorkItem {
+class UserWorkItemVoid : public ThreadPool::WorkItem // Example user work item that produces no result
+{
 public:
-    // User defined constructor (does not need to match any intrefaces)
-    UserWorkItemVoid( int dummy )
+    // User defined constructor (does not need to match any interfaces); dummy is only passed to NULL_USE
+    explicit UserWorkItemVoid( int dummy )
     {
         // User initialized variables
-        NULL_USE(dummy);
-        // Set class variables
-        ThreadPool::WorkItem::d_has_result = false;
-        ThreadPool::WorkItem::d_state = 0;
+        NULL_USE( dummy );
     }
     // User defined run (can do anything)
-    void run()
+    virtual void run() override
     {
-        // Set the state (always do this first)
-        ThreadPool::WorkItem::d_state = 1;
         // Perform the tasks
-        printf("Hello work from UserWorkItem (void)");
-        // Set the state (always do this last)
-        ThreadPool::WorkItem::d_state = 2;
+        printf( "Hello work from UserWorkItem (void)" );
     }
+    // Reports whether run() produces a result; always false for this void example
+    virtual bool has_result() const override { return false; }
     // User defined destructor
-    virtual ~UserWorkItemVoid() 
-    {
-    }
+    virtual ~UserWorkItemVoid() {}
 };
-class UserWorkItemInt: public ThreadPool::WorkItemRet<int> {
+class UserWorkItemInt : public ThreadPool::WorkItemRet<int> // Example user work item that stores an int result
+{
 public:
-    // User defined constructor (does not need to match any intrefaces)
-    UserWorkItemInt( int dummy )
+    // User defined constructor (does not need to match any interfaces); dummy is only passed to NULL_USE
+    explicit UserWorkItemInt( int dummy )
     {
         // User initialized variables
-        NULL_USE(dummy);
-        // Set class variables
-        ThreadPool::WorkItem::d_has_result = true;
-        ThreadPool::WorkItem::d_state = 0;
+        NULL_USE( dummy );
     }
     // User defined run (can do anything)
-    void run()
+    virtual void run() override
     {
-        // Set the state (always do this first)
-        ThreadPool::WorkItem::d_state = 1;
         // Perform the tasks
-        printf("Hello work from UserWorkItem (int)");
+        printf( "Hello work from UserWorkItem (int)" );
         // Store the results (it's type will match the template)
         ThreadPool::WorkItemRet<int>::d_result = 1;
-        // Set the state (always do this last)
-        ThreadPool::WorkItem::d_state = 2;
     }
+    // run() stores a value in d_result, so report that a result exists (was d_has_result = true)
+    virtual bool has_result() const override { return true; }
     // User defined destructor
-    virtual ~UserWorkItemInt() 
-    {
-    }
+    virtual ~UserWorkItemInt() {}
 };
 
 
@@ -390,84 +365,147 @@ inline double run_parallel( ThreadPool *tpool, int N_tasks, int N_work )
     // Make sure the thread pool is empty
     tpool->wait_pool_finished();
     // Add the work
-    TIME_TYPE start, end, f;
-    get_frequency(&f);
     std::vector<ThreadPool::thread_id_t> ids;
-    ids.reserve(N_tasks);
-    get_time(&start);
-    for (int i=0; i<N_tasks; i++)
-        ids.push_back( TPOOL_ADD_WORK( tpool, waste_cpu, (N_work) ) );
+    ids.reserve( N_tasks );
+    auto start = std::chrono::high_resolution_clock::now();
+    for ( int i = 0; i < N_tasks; i++ )
+        ids.push_back( TPOOL_ADD_WORK( tpool, waste_cpu, ( N_work ) ) );
     // Wait for the thread pool to finish
     tpool->wait_pool_finished();
     // Compute the time spent running the tasks
-    get_time(&end);
-    return get_diff(start,end,f);
+    auto stop = std::chrono::high_resolution_clock::now();
+    return std::chrono::duration<double>(stop-start).count();
 }
 
 
+// Pass-through helpers returning a thread_id_t by value (volatile and plain variants) for the move tests
+volatile ThreadPool::thread_id_t f1( volatile ThreadPool::thread_id_t a ) { return a; }
+ThreadPool::thread_id_t f2( ThreadPool::thread_id_t a ) { return a; }
+
+
+/******************************************************************
+* Test the basic functionality of the atomics                     *
+******************************************************************/
+int test_atomics()
+{
+    using namespace AtomicOperations;
+    // Seed a 32-bit and a 64-bit atomic with known values
+    volatile int32_atomic val32;
+    volatile int64_atomic val64;
+    val32 = 32;
+    val64 = 64;
+    int failures = 0;
+    // Increment, decrement and add are each checked against the updated value
+    if ( !( atomic_increment( &val32 ) == 33 && atomic_increment( &val64 ) == 65 ) )
+        ++failures;
+    if ( !( atomic_decrement( &val32 ) == 32 && atomic_decrement( &val64 ) == 64 ) )
+        ++failures;
+    if ( !( atomic_add( &val32, 2 ) == 34 && atomic_add( &val64, 4 ) == 68 ) )
+        ++failures;
+    // A swap is expected to fail on a mismatched compare value and to succeed on a match
+    failures += ( atomic_compare_and_swap( &val32, 0, 0 ) || atomic_compare_and_swap( &val64, 0, 0 ) ) ? 1 : 0;
+    failures += ( atomic_compare_and_swap( &val32, 34, 32 ) && atomic_compare_and_swap( &val64, 68, 64 ) ) ? 0 : 1;
+    failures += ( val32 == 32 && val64 == 64 ) ? 0 : 1;
+    return failures;
+}
+
+
+/******************************************************************
+* Test FIFO behavior                                              *
+******************************************************************/
+void test_FIFO( UnitTest& ut, ThreadPool& tpool )
+{
+    const int myRank = getRank();
+    const int nRanks = getSize();
+    for ( int turn = 0; turn < nRanks; ++turn ) {   // Only the rank matching this turn runs the check
+        barrier();
+        if ( turn != myRank )
+            continue;
+        std::vector<ThreadPool::thread_id_t> ids;
+        for ( size_t k = 0; k < 4000; ++k )
+            ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.001 ) ) );
+        bool fifo = true;
+        while ( tpool.N_queued() > 0 ) {
+            int lastStarted = -1;           // Highest index reported as started
+            int firstQueued = ids.size();   // Lowest index not yet reported as started
+            for ( size_t k = 0; k < ids.size(); ++k ) {
+                if ( ids[k].started() )
+                    lastStarted = std::max<int>( lastStarted, k );
+                else
+                    firstQueued = std::min<int>( firstQueued, k );
+            }
+            const int gap = ( lastStarted == -1 ) ? 0 : ( firstQueued - lastStarted - 1 );
+            if ( abs( gap ) > 4 ) {
+                printf("%i %i %i\n",lastStarted,firstQueued,gap);
+                fifo = fifo && abs( firstQueued - lastStarted - 1 ) <= 2;
+            }
+        }
+        ids.clear();
+        tpool.wait_pool_finished();
+        if ( fifo )
+            ut.passes( "Thread pool behaves as FIFO" );
+        else
+            ut.failure( "Thread pool does not behave as FIFO" );
+    }
+}
+
 
 /******************************************************************
 * The main program                                                *
 ******************************************************************/
 #ifdef USE_WINDOWS
-    int __cdecl main(int argc, char **argv) {
-#elif defined(USE_LINUX) || defined(USE_MAC)
-    int main(int argc, char* argv[]) {
+int __cdecl main( int argc, char **argv )
+{
+#elif defined( USE_LINUX ) || defined( USE_MAC )
+int main( int argc, char *argv[] )
+{
 #else
-    #error Unknown OS
+#error Unknown OS
 #endif
 
-    int N_threads = 4;      // Number of threads
-    int N_work = 2000;      // Number of work items
-    int N_it = 10;          // Number of cycles to run
-    int N_problem = 4;      // Problem size
-    PROFILE_ENABLE(3);
+    int N_threads = 4;    // Number of threads
+    int N_work    = 2000; // Number of work items
+    int N_it      = 10;   // Number of cycles to run
+    int N_problem = 5;    // Problem size
+    PROFILE_ENABLE( 3 );
     PROFILE_ENABLE_TRACE();
+    PROFILE_DISABLE_MEMORY();
     UnitTest ut;
 
 
     // Initialize MPI and set the error handlers
-    int rank = 0;
-    int size = 1;
-    #ifdef USE_MPI
-        int provided_thread_support=-1;
-        MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support);
-        MPI_Comm_size( MPI_COMM_WORLD, &size );
-        MPI_Comm_rank( MPI_COMM_WORLD, &rank );
-        Utilities::setErrorHandlers();
-    #endif
-    NULL_USE(size);
-    
-
+#ifdef USE_MPI
+    int provided_thread_support = -1;
+    MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided_thread_support );
+    Utilities::setErrorHandlers();
     // Disable OS specific warnings for all non-root ranks
+#endif
+    int rank = getRank();
+    int size = getSize();
     if ( rank > 0 )
-        ThreadPool::set_OS_warnings(1);
+        ThreadPool::set_OS_warnings( 1 );
+    NULL_USE( size );
+    NULL_USE( argc );
+    NULL_USE( argv );
 
 
+    // Test the atomics
+    if ( test_atomics() == 0 )
+        ut.passes( "Atomics passed" );
+    else
+        ut.failure( "Atomics failed" );
+
     // Initialize the data
-    std::vector<int> data1(N_work,0);
-    std::vector<int> priority(N_work,0);
-    for (int i=0; i<N_work; i++) {
-        data1[i] = N_problem;
-        priority[i] = i%128;
+    std::vector<int> data1( N_work, 0 );
+    std::vector<int> priority( N_work, 0 );
+    for ( int i = 0; i < N_work; i++ ) {
+        data1[i]    = N_problem;
+        priority[i] = i % 128;
     }
-    TIME_TYPE start, end, f, start2, end2;
-    get_frequency(&f);
 
 
     // Print the size of the thread pool class
-    printp("Size of ThreadPool = %i\n",(int)sizeof(ThreadPool));
-
-
-    // Create and test a mutex
-    barrier();
-    printp("Testing mutex\n");
-    int N_errors_mutex = test_mutex(false);
-    N_errors_mutex    += test_mutex(true);
-    if ( N_errors_mutex == 0 )
-        ut.passes("test mutex");
-    else
-        ut.failure("Errors found testing mutex");
+    printp( "Size of ThreadPool = %i\n", (int) sizeof( ThreadPool ) );
 
 
     // Get the number of processors availible
@@ -475,13 +513,13 @@ inline double run_parallel( ThreadPool *tpool, int N_tasks, int N_work )
     int N_procs = 0;
     try {
         N_procs = ThreadPool::getNumberOfProcessors();
-    } catch (...) {
+    } catch ( ... ) {
     }
-    if ( N_procs>0 )
-        ut.passes("getNumberOfProcessors");
+    if ( N_procs > 0 )
+        ut.passes( "getNumberOfProcessors" );
     else
-        ut.failure("getNumberOfProcessors");
-    printp("%i processors availible\n",N_procs);
+        ut.failure( "getNumberOfProcessors" );
+    printp( "%i processors availible\n", N_procs );
 
 
     // Get the processor affinities for the process
@@ -489,55 +527,55 @@ inline double run_parallel( ThreadPool *tpool, int N_tasks, int N_work )
     std::vector<int> cpus;
     try {
         cpus = ThreadPool::getProcessAffinity();
-        printp("%i cpus for current process: ",(int)cpus.size());
-        for (size_t i=0; i<cpus.size(); i++)
-            printp("%i ",cpus[i]);
-        printp("\n");
-    } catch (...) {
+        printp( "%i cpus for current process: ", (int) cpus.size() );
+        for ( size_t i = 0; i < cpus.size(); i++ )
+            printp( "%i ", cpus[i] );
+        printp( "\n" );
+    } catch ( ... ) {
     }
     if ( !cpus.empty() ) {
-        ut.passes("getProcessAffinity");
+        ut.passes( "getProcessAffinity" );
     } else {
-        #ifdef __APPLE__
-            ut.expected_failure("getProcessAffinity");
-        #else
-            ut.failure("getProcessAffinity");
-        #endif
+#ifdef __APPLE__
+        ut.expected_failure( "getProcessAffinity" );
+#else
+        ut.failure( "getProcessAffinity" );
+#endif
     }
 
 
     // Test setting the process affinities
     barrier();
     bool pass = false;
-    if ( !cpus.empty() && N_procs>0 ) {
-        if ( cpus.size()==1 ) {
-            cpus.resize(N_procs);
-            for (int i=0; i<N_procs; i++)
+    if ( !cpus.empty() && N_procs > 0 ) {
+        if ( cpus.size() == 1 ) {
+            cpus.resize( N_procs );
+            for ( int i = 0; i < N_procs; i++ )
                 cpus.push_back( i );
             try {
                 ThreadPool::setProcessAffinity( cpus );
-            } catch (...) {
+            } catch ( ... ) {
             }
-            cpus = ThreadPool::getProcessAffinity();
+            cpus                  = ThreadPool::getProcessAffinity();
             std::vector<int> cpus = ThreadPool::getProcessAffinity();
-            printp("%i cpus for current process (updated): ",(int)cpus.size());
-            for (size_t i=0; i<cpus.size(); i++)
-                printp("%i ",cpus[i]);
-            printp("\n");
+            printp( "%i cpus for current process (updated): ", (int) cpus.size() );
+            for ( size_t i = 0; i < cpus.size(); i++ )
+                printp( "%i ", cpus[i] );
+            printp( "\n" );
             pass = cpus.size() > 1;
         } else {
             std::vector<int> cpus_orig = cpus;
-            std::vector<int> cpus_tmp(1,cpus[0]);
+            std::vector<int> cpus_tmp( 1, cpus[0] );
             try {
                 ThreadPool::setProcessAffinity( cpus_tmp );
-            } catch (...) {
+            } catch ( ... ) {
             }
             cpus = ThreadPool::getProcessAffinity();
             if ( cpus.size() == 1 )
                 pass = true;
             try {
                 ThreadPool::setProcessAffinity( cpus_orig );
-            } catch (...) {
+            } catch ( ... ) {
             }
             cpus = ThreadPool::getProcessAffinity();
             if ( cpus.size() != cpus_orig.size() )
@@ -545,403 +583,491 @@ inline double run_parallel( ThreadPool *tpool, int N_tasks, int N_work )
         }
     }
     if ( pass ) {
-        ut.passes("setProcessAffinity");
+        ut.passes( "setProcessAffinity" );
     } else {
-        #ifdef __APPLE__
-            ut.expected_failure("setProcessAffinity");
-        #else
-            ut.failure("setProcessAffinity");
-        #endif
+#ifdef __APPLE__
+        ut.expected_failure( "setProcessAffinity" );
+#else
+        ut.failure( "setProcessAffinity" );
+#endif
     }
-    int N_procs_used = std::min<int>(N_procs,N_threads);
-    printp("%i processors used\n",N_procs_used);
+    int N_procs_used = std::min<int>( N_procs, N_threads );
+    printp( "%i processors used\n", N_procs_used );
 
 
     // Create the thread pool
     barrier();
-    printp("Creating thread pool\n");
+    printp( "Creating thread pool\n" );
     ThreadPool tpool0;
     ThreadPool tpool;
     ThreadPool::thread_id_t id;
     id = TPOOL_ADD_WORK( &tpool, waste_cpu, ( data1[0] ) );
-    if ( id==ThreadPool::thread_id_t() || !tpool.isValid(id) )
-        ut.failure("Errors with id");
-    tpool.setNumThreads(N_threads);
-    if ( tpool.getNumThreads()==N_threads )
-        ut.passes("Created thread pool");
+    if ( id == ThreadPool::thread_id_t() || !tpool.isValid( id ) )
+        ut.failure( "Errors with id" );
+    tpool.setNumThreads( N_threads );
+    if ( tpool.getNumThreads() == N_threads )
+        ut.passes( "Created thread pool" );
     else
-        ut.failure("Failed to create tpool with desired number of threads");
-
-    // Test creating/destroying a thread pool using new
-    barrier();
-    pass = true;
-    try {
-        ThreadPool *tpool2 = new ThreadPool(MAX_NUM_THREADS-1);
-        if ( tpool2->getNumThreads() != MAX_NUM_THREADS-1 )
-            pass = false;
-        if ( !ThreadPool::is_valid(tpool2) )
-            pass = false;
-        delete tpool2;
-        // Check that tpool2 is invalid
-        // Note: valgrind will report this as an invalid memory read, but we want to keep the test)
-        if ( ThreadPool::is_valid(tpool2) )
-            pass = false;
-    } catch(...) {
-        pass = false;
-    }
-    if ( tpool.getNumThreads()==N_threads )
-        ut.passes("Created/destroyed thread pool with new");
-    else
-        ut.failure("Created/destroyed thread pool with new");
+        ut.failure( "Failed to create tpool with desired number of threads" );
 
 
     // Test setting the thread affinities
     barrier();
-    if ( cpus.size()>1 ) {
-        sleepMs(50);
+    if ( cpus.size() > 1 ) {
+        sleep_ms( 50 );
         // First make sure we can get the thread affinities
-        std::vector<int> procs = ThreadPool::getThreadAffinity( );
+        std::vector<int> procs = ThreadPool::getThreadAffinity();
         if ( procs == cpus ) {
-            ut.passes("getThreadAffinity() matches procs");
+            ut.passes( "getThreadAffinity() matches procs" );
         } else {
             char msg[100];
-            sprintf(msg,"getThreadAffinity() does not match procs (%i,%i)",
-                static_cast<int>(procs.size()), static_cast<int>(cpus.size()));
-            ut.failure(msg);
+            sprintf( msg, "getThreadAffinity() does not match procs (%i,%i)",
+                static_cast<int>( procs.size() ), static_cast<int>( cpus.size() ) );
+            ut.failure( msg );
         }
         pass = true;
-        for (int i=0; i<N_threads; i++) {
+        for ( int i = 0; i < N_threads; i++ ) {
             std::vector<int> procs_thread = tpool.getThreadAffinity( i );
             if ( procs_thread != procs ) {
-                printp("%i: Initial thread affinity: ",rank);
-                for (size_t i=0; i<procs_thread.size(); i++)
-                    printp("%i ",procs_thread[i]);
-                printp("\n");
+                printp( "%i: Initial thread affinity: ", rank );
+                for ( size_t i = 0; i < procs_thread.size(); i++ )
+                    printp( "%i ", procs_thread[i] );
+                printp( "\n" );
                 pass = false;
             }
         }
         if ( pass )
-            ut.passes("getThreadAffinity(thread) matches procs");
+            ut.passes( "getThreadAffinity(thread) matches procs" );
         else
-            ut.failure("getThreadAffinity(thread) does not match procs");
+            ut.failure( "getThreadAffinity(thread) does not match procs" );
         // Try to set the thread affinities
         pass = true;
         if ( !procs.empty() ) {
-            int N_procs_thread = std::max<int>((int)cpus.size()/N_threads,1);
-            for (int i=0; i<N_threads; i++) {
-                std::vector<int> procs_thread(N_procs_thread,-1);
-                for (int j=0; j<N_procs_thread; j++)
-                    procs_thread[j] = procs[(i*N_procs_thread+j)%procs.size()];
+            int N_procs_thread = std::max<int>( (int) cpus.size() / N_threads, 1 );
+            for ( int i = 0; i < N_threads; i++ ) {
+                std::vector<int> procs_thread( N_procs_thread, -1 );
+                for ( int j         = 0; j < N_procs_thread; j++ )
+                    procs_thread[j] = procs[( i * N_procs_thread + j ) % procs.size()];
                 tpool.setThreadAffinity( i, procs_thread );
-                sleepMs(10);    // Give time for OS to update thread affinities
+                sleep_ms( 10 ); // Give time for OS to update thread affinities
                 std::vector<int> procs_thread2 = tpool.getThreadAffinity( i );
                 if ( procs_thread2 != procs_thread ) {
-                    printp("%i: Final thread affinity: ",rank);
-                    for (size_t i=0; i<procs_thread.size(); i++)
-                        printp("%i ",procs_thread[i]);
-                    printp("\n");
+                    printp( "%i: Final thread affinity: ", rank );
+                    for ( size_t i = 0; i < procs_thread.size(); i++ )
+                        printp( "%i ", procs_thread[i] );
+                    printp( "\n" );
                     pass = false;
                 }
             }
         }
         if ( pass )
-            ut.passes("setThreadAffinity passes");
+            ut.passes( "setThreadAffinity passes" );
         else
-            ut.failure("setThreadAffinity failed to change affinity");
+            ut.failure( "setThreadAffinity failed to change affinity" );
     }
 
 
-    // Reset the thread affinities 
+    // Reset the thread affinities
     barrier();
-    tpool.setNumThreads(tpool.getNumThreads(),"none");
-    //tpool.setNumThreads(tpool.getNumThreads(),"independent");
-    for (int i=0; i<N_threads; i++) {
+    tpool.setNumThreads( tpool.getNumThreads(), "none" );
+    // tpool.setNumThreads(tpool.getNumThreads(),"independent");
+    for ( int i = 0; i < N_threads; i++ ) {
         std::vector<int> procs_thread = tpool.getThreadAffinity( i );
-        printp("Thread affinity: ");
-        for (size_t i=0; i<procs_thread.size(); i++)
-            printp("%i ",procs_thread[i]);
-        printp("\n");
+        printp( "Thread affinity: " );
+        for ( size_t i = 0; i < procs_thread.size(); i++ )
+            printp( "%i ", procs_thread[i] );
+        printp( "\n" );
     }
 
     // Print the current processors by thread id
     barrier();
-    print_processor(&tpool);
-    for (int i=0; i<N_threads; i++)
+    ThreadPool::set_OS_warnings( 1 );
+    print_processor( &tpool );
+    for ( int i = 0; i < N_threads; i++ )
         TPOOL_ADD_WORK( &tpool, print_processor, ( &tpool ) );
     tpool.wait_pool_finished();
 
     // Run some basic tests
     barrier();
-    get_time(&start);
-    for (int n=0; n<N_it; n++) {
-        for (int i=0; i<N_work; i++)
-            waste_cpu(data1[i]);
+    auto start = std::chrono::high_resolution_clock::now();
+    for ( int n = 0; n < N_it; n++ ) {
+        for ( int i = 0; i < N_work; i++ )
+            waste_cpu( data1[i] );
     }
-    get_time(&end);
-    double time = get_diff(start,end,f);
-    printp("Time for serial cycle = %0.0f us\n",1e6*time/N_it);
-    printp("Time for serial item = %0.0f ns\n",1e9*time/(N_it*N_work));
+    auto stop = std::chrono::high_resolution_clock::now();
+    double time = std::chrono::duration<double>(stop-start).count();
+    printp( "Time for serial cycle = %0.0f us\n", 1e6 * time / N_it );
+    printp( "Time for serial item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
     id = TPOOL_ADD_WORK( &tpool, waste_cpu, ( data1[0] ) );
-    tpool.wait(id);
+    tpool.wait( id );
     std::vector<ThreadPool::thread_id_t> ids2;
     ids2.push_back( TPOOL_ADD_WORK( &tpool, waste_cpu, ( data1[0] ) ) );
-    tpool.wait(ids2[0]);
+    tpool.wait( ids2[0] );
+
+    // Test the move operator for thread_id
+    ThreadPool::thread_id_t id1          = f1( id );         // move-construct from rvalue temporary
+    ThreadPool::thread_id_t id2          = std::move( id1 ); // move-construct from xvalue
+    volatile ThreadPool::thread_id_t id3 = f2( id );         // move-construct from rvalue temporary
+    volatile ThreadPool::thread_id_t id4 = std::move( id3 ); // move-construct from xvalue
+    id2.reset();
+    id4.reset();
 
     // Test calling functions with different number of arguments
     barrier();
-    printp("Testing arguments:\n");
+    printp( "Testing arguments:\n" );
     int N_errors_args = test_function_arguements( &tpool );
     if ( N_errors_args == 0 )
-        ut.passes("Calling function with default arguments");
+        ut.passes( "Calling function with default arguments" );
     else
-        ut.failure("Error calling function with default arguments");
+        ut.failure( "Error calling function with default arguments" );
 
 
-    // Check that the threads can sleep in parallel (this does not depend on the number of processors)
+    // Check that the threads can sleep in parallel (this does not depend on the number of
+    // processors)
     barrier();
     tpool.wait_pool_finished();
-    get_time(&start);
-    sleep_inc(1);
-    get_time(&end);
-    double sleep_serial = get_diff(start,end,f);
+    start = std::chrono::high_resolution_clock::now();
+    sleep_inc( 1 );
+    stop = std::chrono::high_resolution_clock::now();
+    double sleep_serial = std::chrono::duration<double>(stop-start).count();
     ids2.clear();
-    get_time(&start);
-    for (int i=0; i<N_threads; i++)
-        ids2.push_back( TPOOL_ADD_WORK( &tpool,sleep_inc, (1) ) );
-    tpool.wait_all(N_procs_used,&ids2[0]);
+    start = std::chrono::high_resolution_clock::now();
+    for ( int i = 0; i < N_threads; i++ )
+        ids2.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc, ( 1 ) ) );
+    tpool.wait_all( N_procs_used, &ids2[0] );
+    stop = std::chrono::high_resolution_clock::now();
     ids2.clear();
-    get_time(&end);
-    double sleep_parallel = get_diff(start,end,f);
-    double sleep_speedup = N_procs_used*sleep_serial/sleep_parallel;
-    printf("%i:  Speedup on %i sleeping threads: %0.3f\n",rank,N_procs_used,sleep_speedup);
-    printf("%i:    ts = %0.3f, tp = %0.3f\n",rank,sleep_serial,sleep_parallel);
-    if ( fabs(sleep_serial-1.0)<0.05 && fabs(sleep_parallel-1.0)<0.075 )
-        ut.passes("Passed thread sleep");
-    else 
-        ut.failure("Failed thread sleep");
+    double sleep_parallel = std::chrono::duration<double>(stop-start).count();
+    double sleep_speedup  = N_procs_used * sleep_serial / sleep_parallel;
+    printf( "%i:  Speedup on %i sleeping threads: %0.3f\n", rank, N_procs_used, sleep_speedup );
+    printf( "%i:    ts = %0.3f, tp = %0.3f\n", rank, sleep_serial, sleep_parallel );
+    if ( fabs( sleep_serial - 1.0 ) < 0.05 && fabs( sleep_parallel - 1.0 ) < 0.25 && sleep_speedup>3 )
+        ut.passes( "Passed thread sleep" );
+    else
+        ut.failure( "Failed thread sleep" );
 
 
     // Check that the threads are actually working in parallel
     barrier();
-    if ( N_procs_used>1 ) {
-        #ifdef USE_MPI
-            // Use a non-blocking serialization of the MPI processes 
-            // if we do not have a sufficient number of processors
-            bool serialize_mpi = N_procs < N_threads*size;
-            int buf;
-            MPI_Request request;
-            MPI_Status status;
-            if ( serialize_mpi && rank>0 ) {
-                MPI_Irecv( &buf, 1, MPI_INT, rank-1, 0, MPI_COMM_WORLD, &request );
+    if ( N_procs_used > 1 ) {
+#ifdef USE_MPI
+        // Use a non-blocking serialization of the MPI processes
+        // if we do not have a sufficient number of processors
+        bool serialize_mpi = N_procs < N_threads * size;
+        int buf;
+        MPI_Request request;
+        MPI_Status status;
+        if ( serialize_mpi && rank > 0 ) {
+            MPI_Irecv( &buf, 1, MPI_INT, rank - 1, 0, MPI_COMM_WORLD, &request );
+            int flag = false;
+            while ( !flag ) {
+                MPI_Test( &request, &flag, &status );
+                sleep_s( 1 );
+            }
+        }
+#endif
+        int N = 20000000; // Enough work to keep the processor busy for ~ 1 s
+        // Run in serial
+        start = std::chrono::high_resolution_clock::now();
+        waste_cpu( N );
+        stop = std::chrono::high_resolution_clock::now();
+        double time_serial = std::chrono::duration<double>(stop-start).count();
+        // Run in parallel
+        double time_parallel2 = run_parallel( &tpool, N_procs_used, N / 1000 );
+        double time_parallel  = run_parallel( &tpool, N_procs_used, N );
+        double speedup        = N_procs_used * time_serial / time_parallel;
+        printf( "%i:  Speedup on %i procs: %0.3f\n", rank, N_procs_used, speedup );
+        printf( "%i:    ts = %0.3f, tp = %0.3f, tp2 = %0.3f\n", rank, time_serial, time_parallel,
+            time_parallel2 );
+        if ( speedup > 1.4 ) {
+            ut.passes( "Passed speedup test" );
+        } else {
+#ifdef USE_GCOV
+            ut.expected_failure( "Times do not indicate tests are running in parallel (gcov)" );
+#else
+            ut.failure( "Times do not indicate tests are running in parallel" );
+#endif
+        }
+#ifdef USE_MPI
+        if ( serialize_mpi ) {
+            if ( rank < size - 1 )
+                MPI_Send( &N, 1, MPI_INT, rank + 1, 0, MPI_COMM_WORLD );
+            if ( rank == size - 1 ) {
+                for ( int i = 0; i < size - 1; i++ )
+                    MPI_Send( &N, 1, MPI_INT, i, 1, MPI_COMM_WORLD );
+            } else {
+                MPI_Irecv( &buf, 1, MPI_INT, size - 1, 1, MPI_COMM_WORLD, &request );
                 int flag = false;
+                MPI_Status status;
                 while ( !flag ) {
                     MPI_Test( &request, &flag, &status );
-                    sleep(1);
+                    sleep_s( 1 );
                 }
             }
-        #endif
-        int N = 20000000;    // Enough work to keep the processor busy for ~ 1 s
-        // Run in serial
-        get_time(&start);
-        waste_cpu(N);
-        get_time(&end);
-        double time_serial = get_diff(start,end,f);
-        // Run in parallel
-        double time_parallel2 = run_parallel( &tpool, N_procs_used, N/1000 );
-        double time_parallel  = run_parallel( &tpool, N_procs_used, N );
-        double speedup = N_procs_used*time_serial/time_parallel;
-        printf("%i:  Speedup on %i procs: %0.3f\n",rank,N_procs_used,speedup);
-        printf("%i:    ts = %0.3f, tp = %0.3f, tp2 = %0.3f\n",rank,time_serial,time_parallel,time_parallel2);
-        if ( speedup > 1.4 ) {
-            ut.passes("Passed speedup test");
-        } else {
-            #ifdef USE_GCOV
-                ut.expected_failure("Times do not indicate tests are running in parallel (gcov)");
-            #else
-                ut.failure("Times do not indicate tests are running in parallel");
-            #endif
         }
-        #ifdef USE_MPI
-            if ( serialize_mpi ) {
-                if ( rank<size-1 )
-                    MPI_Send( &N, 1, MPI_INT, rank+1, 0, MPI_COMM_WORLD );
-                if ( rank==size-1 ) {
-                    for (int i=0; i<size-1; i++)
-                        MPI_Send( &N, 1, MPI_INT, i, 1, MPI_COMM_WORLD );
-                } else {
-                    MPI_Irecv( &buf, 1, MPI_INT, size-1, 1, MPI_COMM_WORLD, &request );
-                    int flag = false;
-                    MPI_Status status;
-                    while ( !flag ) {
-                        MPI_Test( &request, &flag, &status );
-                        sleep(1);
-                    }
-                }
-            }
-        #endif
+#endif
     } else {
-        ut.expected_failure("Testing thread performance with less than 1 processor");
+        ut.expected_failure( "Testing thread performance with less than 1 processor" );
     }
 
 
+    // Test first-in-first-out scheduler (also ensures priorities)
+    test_FIFO( ut, tpool );
+
+
     // Test adding a work item with a dependency
     barrier();
     {
         // Test that we sucessfully wait on the work items
         std::vector<ThreadPool::thread_id_t> ids;
-        ids.reserve(5);
-        global_sleep_count = 0;     // Reset the count before this test
-        ThreadPool::thread_id_t id1 = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 1 ) );
-        ThreadPool::thread_id_t id2 = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 2 ) );
-        ThreadPool::WorkItem *wait1 = new WorkItemFull<bool,int>( check_inc, 1 );
-        ThreadPool::WorkItem *wait2 = new WorkItemFull<bool,int>( check_inc, 2 );
-        wait1->add_dependency(id1);
-        wait2->add_dependency(id1);  wait2->add_dependency(id2);
+        ids.reserve( 5 );
+        global_sleep_count = 0; // Reset the count before this test
+        ThreadPool::thread_id_t id0;
+        auto id1 = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 1 ) );
+        auto id2 = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 2 ) );
+        auto *wait1 = new WorkItemFull<bool, int>( check_inc, 1 );
+        auto *wait2 = new WorkItemFull<bool, int>( check_inc, 2 );
+        wait1->add_dependency( id0 );
+        wait1->add_dependency( id1 );
+        wait2->add_dependency( id1 );
+        wait2->add_dependency( id2 );
         ids.clear();
-        ids.push_back( tpool.add_work(wait1) );
-        ids.push_back( tpool.add_work(wait2) );
-        tpool.wait_all(ids.size(),&ids[0]);
-        if ( !tpool.getFunctionRet<bool>(ids[0]) || !tpool.getFunctionRet<bool>(ids[1]) )
-            ut.failure("Failed to wait on required dependency");
+        ids.push_back( tpool.add_work( wait1 ) );
+        ids.push_back( tpool.add_work( wait2 ) );
+        tpool.wait_all( ids.size(), &ids[0] );
+        if ( !tpool.getFunctionRet<bool>( ids[0] ) || !tpool.getFunctionRet<bool>( ids[1] ) )
+            ut.failure( "Failed to wait on required dependency" );
         else
-            ut.passes("Dependencies");
+            ut.passes( "Dependencies" );
         tpool.wait_pool_finished();
+        // Test waiting on more dependencies than in the thread pool (changing priorities)
+        ids.clear();
+        for (size_t i=0; i<20; i++)
+            ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.1 ) ) );
+        auto *wait3 = new WorkItemFull<void,double>( sleep_inc2, 0 );
+        wait3->add_dependencies( ids );
+        id = tpool.add_work( wait3, 50 );
+        tpool.wait( id );
+        bool pass = true;
+        for (size_t i=0; i<ids.size(); i++)
+            pass = pass && ids[i].finished();
+        ids.clear();
+        if ( pass )
+            ut.passes( "Dependencies2" );
+        else
+            ut.failure( "Dependencies2" );
         // Check that we can handle more complex dependencies
         id1 = TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.5 ) );
-        for (int i=0; i<10; i++) {
-            wait1 = new WorkItemFull<bool,int>( check_inc, 1 );
-            wait1->add_dependency(id1);
-            tpool.add_work(wait1);
+        for ( int i = 0; i < 10; i++ ) {
+            wait1 = new WorkItemFull<bool, int>( check_inc, 1 );
+            wait1->add_dependency( id1 );
+            tpool.add_work( wait1 );
         }
         tpool.wait_pool_finished();
         ids.clear();
-        for (int i=0; i<5; i++)
-            ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, (0.5) ) );
-        sleep_inc2(0.002);
-        ThreadPool::WorkItem *work = new WorkItemFull<void,int>( waste_cpu, 100 );
-        work->add_dependencies(ids);
-        id = tpool.add_work(work,10);
-        tpool.wait(id);
+        for ( int i = 0; i < 5; i++ )
+            ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.5 ) ) );
+        sleep_inc2( 0.002 );
+        ThreadPool::WorkItem *work = new WorkItemFull<void, int>( waste_cpu, 100 );
+        work->add_dependencies( ids );
+        id = tpool.add_work( work, 10 );
+        tpool.wait( id );
+    }
+
+    // Test the timing of creating and running a work item
+    barrier();
+    {
+        printp( "Testing timmings (creating/running work item):\n" );
+        std::string timer_name = "Create/Run work item";
+        PROFILE_START( timer_name );
+        int64_t time_create = 0;
+        int64_t time_run    = 0;
+        int64_t time_delete = 0;
+        std::vector<ThreadPool::WorkItem *> work( N_work );
+        start = std::chrono::high_resolution_clock::now();
+        for ( int n = 0; n < N_it; n++ ) {
+            auto t1 = std::chrono::high_resolution_clock::now();
+            for ( int i = 0; i < N_work; i++ )
+                work[i] = ThreadPool::createWork<void, int>( waste_cpu, data1[i] );
+            auto t2 = std::chrono::high_resolution_clock::now();
+            for ( int i = 0; i < N_work; i++ )
+                work[i]->run();
+            auto t3 = std::chrono::high_resolution_clock::now();
+            for ( int i = 0; i < N_work; i++ )
+                delete work[i];
+            auto t4 = std::chrono::high_resolution_clock::now();
+            time_create += to_ns(t2-t1);
+            time_run    += to_ns(t3-t2);
+            time_delete += to_ns(t4-t3);
+            if ( ( n + 1 ) % 100 == 0 )
+                printp( "Cycle %i of %i finished\n", n + 1, N_it );
+        }
+        stop = std::chrono::high_resolution_clock::now();
+        time = std::chrono::duration<double>(stop-start).count();
+        PROFILE_STOP( timer_name );
+        printp( "   time = %0.0f ms\n", 1e3 * time );
+        printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
+        printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
+        printp( "      create = %i ns\n", static_cast<int>( time_create / ( N_it * N_work ) ) );
+        printp( "      run    = %i ns\n", static_cast<int>( time_run    / ( N_it * N_work ) ) );
+        printp( "      delete = %i us\n", static_cast<int>( time_delete / ( N_it * N_work ) ) );
     }
 
     // Test the timing adding a single item
     barrier();
-    for (int it=0; it<2; it++) {
-        ThreadPool *tpool_ptr = NULL;
-        if ( it==0 ) {
-            printp("Testing timmings (adding a single item to empty tpool):\n");
+    for ( int it = 0; it < 2; it++ ) {
+        ThreadPool *tpool_ptr = nullptr;
+        std::string timer_name;
+        if ( it == 0 ) {
+            printp( "Testing timmings (adding a single item to empty tpool):\n" );
+            timer_name = "Add single item to empty pool";
             tpool_ptr = &tpool0;
-        } else if ( it==1 ) {
-            printp("Testing timmings (adding a single item):\n");
+        } else if ( it == 1 ) {
+            printp( "Testing timmings (adding a single item):\n" );
+            timer_name = "Add single item to tpool";
             tpool_ptr = &tpool;
         }
-        std::vector<ThreadPool::thread_id_t> ids(N_work);
-        double time_add = 0.0;
-        double time_wait = 0.0;
-        get_time(&start);
-        for (int n=0; n<N_it; n++) {
-            get_time(&start2);
-            for (int i=0; i<N_work; i++)
+        PROFILE_START( timer_name );
+        std::vector<ThreadPool::thread_id_t> ids( N_work );
+        int64_t time_add  = 0;
+        int64_t time_wait = 0;
+        start = std::chrono::high_resolution_clock::now();
+        for ( int n = 0; n < N_it; n++ ) {
+            auto t1 = std::chrono::high_resolution_clock::now();
+            for ( int i = 0; i < N_work; i++ )
                 ids[i] = TPOOL_ADD_WORK( tpool_ptr, waste_cpu, ( data1[i] ), priority[i] );
-            get_time(&end2);
-            time_add += get_diff(start2,end2,f);
-            get_time(&start2);
-            tpool_ptr->wait_all(N_work,&ids[0]);
-            //tpool_ptr->wait_pool_finished();
-            get_time(&end2);
-            time_wait += get_diff(start2,end2,f);
-            if ( (n+1)%100 == 0 )
-                printp("Cycle %i of %i finished\n",n+1,N_it);
+            auto t2 = std::chrono::high_resolution_clock::now();
+            tpool_ptr->wait_all( N_work, &ids[0] );
+            auto t3 = std::chrono::high_resolution_clock::now();
+            time_add += to_ns(t2-t1);
+            time_wait += to_ns(t3-t2);
+            if ( ( n + 1 ) % 100 == 0 )
+                printp( "Cycle %i of %i finished\n", n + 1, N_it );
         }
-        get_time(&end);
-        time = get_diff(start,end,f);
-        printp("  time = %0.0f ms\n",1e3*time);
-        printp("  time / cycle = %0.0f us\n",1e6*time/N_it);
-        printp("  average time / item = %0.0f ns\n",1e9*time/(N_it*N_work));
-        printp("     create and add = %0.0f ns\n",1e9*time_add/(N_it*N_work));
-        printp("     wait = %0.0f us\n",1e9*time_wait/(N_it*N_work));
+        stop = std::chrono::high_resolution_clock::now();
+        time = std::chrono::duration<double>(stop-start).count();
+        PROFILE_STOP( timer_name );
+        printp( "   time = %0.0f ms\n", 1e3 * time );
+        printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
+        printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
+        printp( "      create and add = %i ns\n", static_cast<int>( time_add / ( N_it * N_work ) ) );
+        printp( "      wait = %i us\n", static_cast<int>( time_wait / ( N_it * N_work ) ) );
     }
 
     // Test the timing pre-creating the work items and adding multiple at a time
     barrier();
-    for (int it=0; it<2; it++) {
-        ThreadPool *tpool_ptr = NULL;
-        if ( it==0 ) {
-            printp("Testing timmings (adding a block of items to empty tpool):\n");
+    for ( int it = 0; it < 2; it++ ) {
+        ThreadPool *tpool_ptr = nullptr;
+        std::string timer_name;
+        if ( it == 0 ) {
+            printp( "Testing timmings (adding a block of items to empty tpool):\n" );
+            timer_name = "Add multiple items to empty pool";
             tpool_ptr = &tpool0;
-        } else if ( it==1 ) {
-            printp("Testing timmings (adding a block of items):\n");
+        } else if ( it == 1 ) {
+            printp( "Testing timmings (adding a block of items):\n" );
+            timer_name = "Add multiple items to tpool";
             tpool_ptr = &tpool;
         }
-        double time_create_work = 0.0;
-        double time_add_work = 0.0;
-        double time_wait_work = 0.0;
-        std::vector<ThreadPool::WorkItem*> work(N_work);
-        get_time(&start);
-        for (int n=0; n<N_it; n++) {
-            get_time(&start2);
-            for (int i=0; i<N_work; i++)
-                work[i] = new WorkItemFull<void,int>( waste_cpu, data1[i] );
-            get_time(&end2);
-            time_create_work += get_diff(start2,end2,f);
-            get_time(&start2);
-            std::vector<ThreadPool::thread_id_t> ids = tpool_ptr->add_work( work, priority );
-            get_time(&end2);
-            time_add_work += get_diff(start2,end2,f);
-            get_time(&start2);
-            tpool_ptr->wait_all(ids);
-            get_time(&end2);
-            time_wait_work += get_diff(start2,end2,f);
-            if ( (n+1)%100 == 0 )
-                printp("Cycle %i of %i finished\n",n+1,N_it);
+        PROFILE_START( timer_name );
+        int64_t time_create_work = 0;
+        int64_t time_add_work    = 0;
+        int64_t time_wait_work   = 0;
+        std::vector<ThreadPool::WorkItem *> work( N_work );
+        start = std::chrono::high_resolution_clock::now();
+        for ( int n = 0; n < N_it; n++ ) {
+            auto t1 = std::chrono::high_resolution_clock::now();
+            for ( int i = 0; i < N_work; i++ )
+                work[i] = ThreadPool::createWork<void, int>( waste_cpu, data1[i] );
+            auto t2 = std::chrono::high_resolution_clock::now();
+            auto ids = tpool_ptr->add_work( work, priority );
+            auto t3 = std::chrono::high_resolution_clock::now();
+            tpool_ptr->wait_all( ids );
+            auto t4 = std::chrono::high_resolution_clock::now();
+            time_create_work += to_ns(t2-t1);
+            time_add_work += to_ns(t3-t2);
+            time_wait_work += to_ns(t4-t3);
+            if ( ( n + 1 ) % 100 == 0 )
+                printp( "Cycle %i of %i finished\n", n + 1, N_it );
         }
-        get_time(&end);
-        time = get_diff(start,end,f);
-        printp("  time = %0.0f ms\n",1e3*time);
-        printp("  time / cycle = %0.0f us\n",1e6*time/N_it);
-        printp("  average time / item = %0.0f ns\n",1e9*time/(N_it*N_work));
-        printp("     create = %0.0f ns\n",1e9*time_create_work/(N_it*N_work));
-        printp("     add = %0.0f ns\n",1e9*time_add_work/(N_it*N_work));
-        printp("     wait = %0.0f ns\n",1e9*time_wait_work/(N_it*N_work));
+        stop = std::chrono::high_resolution_clock::now();
+        time = std::chrono::duration<double>(stop-start).count();
+        PROFILE_STOP( timer_name );
+        printp( "   time = %0.0f ms\n", 1e3 * time );
+        printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
+        printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
+        printp( "      create = %i ns\n", static_cast<int>( time_create_work / ( N_it * N_work ) ) );
+        printp( "      add = %i ns\n",  static_cast<int>( time_add_work / ( N_it * N_work ) ) );
+        printp( "      wait = %i ns\n", static_cast<int>( time_wait_work / ( N_it * N_work ) ) );
     }
 
     // Run a dependency test that tests a simple case that should keep the thread pool busy
     // Note: Checking the results requires looking at the trace data
     tpool.wait_pool_finished();
-    PROFILE_START("Dependency test");
-    for (int i=0; i<10; i++) {
+    PROFILE_START( "Dependency test" );
+    for ( int i = 0; i < 10; i++ ) {
         char msg[3][100];
-        sprintf(msg[0],"Item %i-%i",i,0);
-        sprintf(msg[1],"Item %i-%i",i,1);
-        sprintf(msg[2],"Item %i-%i",i,2);
-        ThreadPool::WorkItem *work  = new WorkItemFull<void,double,std::string>(sleep_msg,0.5,msg[0]);
-        ThreadPool::WorkItem *work1 = new WorkItemFull<void,double,std::string>(sleep_msg,0.1,msg[1]);
-        ThreadPool::WorkItem *work2 = new WorkItemFull<void,double,std::string>(sleep_msg,0.1,msg[2]);
-        ThreadPool::thread_id_t id = tpool.add_work(work);
-        work1->add_dependency(id);
-        work2->add_dependency(id);
-        tpool.add_work(work1);
-        tpool.add_work(work2);
+        sprintf( msg[0], "Item %i-%i", i, 0 );
+        sprintf( msg[1], "Item %i-%i", i, 1 );
+        sprintf( msg[2], "Item %i-%i", i, 2 );
+        ThreadPool::WorkItem *work =
+            new WorkItemFull<void, double, std::string>( sleep_msg, 0.5, msg[0] );
+        ThreadPool::WorkItem *work1 =
+            new WorkItemFull<void, double, std::string>( sleep_msg, 0.1, msg[1] );
+        ThreadPool::WorkItem *work2 =
+            new WorkItemFull<void, double, std::string>( sleep_msg, 0.1, msg[2] );
+        ThreadPool::thread_id_t id = tpool.add_work( work );
+        work1->add_dependency( id );
+        work2->add_dependency( id );
+        tpool.add_work( work1 );
+        tpool.add_work( work2 );
     }
     tpool.wait_pool_finished();
-    PROFILE_STOP("Dependency test");
+    PROFILE_STOP( "Dependency test" );
 
-    tpool.wait_pool_finished();
+    // Close the thread pool
+    tpool.setNumThreads( 0 );
+
+    // Save the profiling results
+    PROFILE_SAVE( "test_thread_pool" );
+    PROFILE_DISABLE();
+
+    // Test creating/destroying a thread pool using new
+    barrier();
+    pass = true;
+    try {
+        ThreadPool *tpool = new ThreadPool( MAX_NUM_THREADS - 1 );
+        if ( tpool->getNumThreads() != MAX_NUM_THREADS - 1 )
+            pass = false;
+        if ( !ThreadPool::is_valid( tpool ) )
+            pass = false;
+        delete tpool;
+        // Check that tpool is invalid
+        // Note: valgrind will report this as an invalid memory read, but we want to keep the test
+        if ( ThreadPool::is_valid( tpool ) )
+            pass = false;
+    } catch ( ... ) {
+        pass = false;
+    }
+    if ( pass )
+        ut.passes( "Created/destroyed thread pool with new" );
+    else
+        ut.failure( "Created/destroyed thread pool with new" );
+
+    // Print the test results
     barrier();
     ut.report();
-    int N_errors = static_cast<int>(ut.NumFailGlobal());
-
+    int N_errors = static_cast<int>( ut.NumFailGlobal() );
 
     // Shudown MPI
-    PROFILE_SAVE("test_thread_pool");
     pout << "Shutting down\n";
     barrier();
-    #ifdef USE_MPI
-        MPI_Finalize( );
-        sleepMs(10);
-    #endif
+#ifdef USE_TIMER
+    if ( rank == 0 )
+        MemoryApp::print( std::cout );
+#endif
+#ifdef USE_MPI
+    MPI_Finalize();
+    sleep_ms( 10 );
+#endif
     return N_errors;
 }
-
-
-
diff --git a/threadpool/thread_pool.cpp b/threadpool/thread_pool.cpp
old mode 100755
new mode 100644
index e05b67b0..c58152f2
--- a/threadpool/thread_pool.cpp
+++ b/threadpool/thread_pool.cpp
@@ -1,128 +1,115 @@
-#define _CRT_NONSTDC_NO_DEPRECATE 
-#include "thread_pool.h"
+#define _CRT_NONSTDC_NO_DEPRECATE
+#include "threadpool/thread_pool.h"
+#include "common/Utilities.h"
+#include "common/StackTrace.h"
+#include "ProfilerApp.h"
+#include <algorithm>
+#include <bitset>
+#include <climits>
 #include <iostream>
+#include <stdexcept>
 #include <stdio.h>
 #include <stdlib.h>
-#include <algorithm>
 #include <typeinfo>
-#include <stdexcept>
-#include <climits>
+#include <thread>
+#include <chrono>
+
 
-#include "ProfilerApp.h"
-#include "common/Utilities.h"
 #define perr std::cerr
 #define pout std::cout
 #define printp printf
 
-#define MONITOR_THREADPOOL_PERFORMANCE 0
 
-#if 0
-    #define PROFILE_THREAD_START(X)   PROFILE_START(X,3)
-    #define PROFILE_THREAD_START2(X)  PROFILE_START2(X,3)
-    #define PROFILE_THREAD_STOP(X)    PROFILE_STOP(X,3)
-    #define PROFILE_THREAD_STOP2(X)   PROFILE_STOP2(X,3)
-#else
-    #define PROFILE_THREAD_START(X)   do {} while(0)
-    #define PROFILE_THREAD_START2(X)  do {} while(0)
-    #define PROFILE_THREAD_STOP(X)    do {} while(0)
-    #define PROFILE_THREAD_STOP2(X)   do {} while(0)
-#endif
-
-
-// Include system dependent headers and define some functions
-#ifdef __WORDSIZE
-    #define ARCH_SIZE __WORDSIZE
-#elif defined(_WIN64)
-    #define ARCH_SIZE 64
-#elif defined(_WIN32) // Note: WIN64 also defines WIN32
-    #define ARCH_SIZE 32
-#endif
-#ifdef USE_WINDOWS
+// OS specific includes / definitions
+// clang-format off
+#if defined( USE_WINDOWS )
+    #include <process.h>
     #include <windows.h>
-    #define get_time(x) QueryPerformanceCounter(x)
-    #define get_frequency(f) QueryPerformanceFrequency(f)
-    #define get_diff(start,end,f) \
-        static_cast<double>(end.QuadPart-start.QuadPart)/static_cast<double>(f.QuadPart)
-    #define TIME_TYPE LARGE_INTEGER
-#elif defined(USE_LINUX)
-    #include <sys/time.h>
-    #include <errno.h>
-    #define Sleep(x) usleep(1000*x)
-    #define get_time(x) gettimeofday(x,NULL);
-    #define get_frequency(f) (*f=timeval())
-    #define get_diff(start,end,f) 1e-6*static_cast<double>( \
-        0xF4240*(static_cast<int64_t>(end.tv_sec)-static_cast<int64_t>(start.tv_sec)) + \
-                (static_cast<int64_t>(end.tv_usec)-static_cast<int64_t>(start.tv_usec)) )
-    #define TIME_TYPE timeval
-#elif defined(USE_MAC)
-    #include <sys/time.h>
-    #include <mach/mach.h>
-    #include <errno.h>
-    #define Sleep(x) usleep(1000*x)
-    #define get_time(x) gettimeofday(x,NULL);
-    #define get_frequency(f) (*f=timeval())
-    #define get_diff(start,end,f) 1e-6*static_cast<double>( \
-        0xF4240*(static_cast<int64_t>(end.tv_sec)-static_cast<int64_t>(start.tv_sec)) + \
-                (static_cast<int64_t>(end.tv_usec)-static_cast<int64_t>(start.tv_usec)) )
-    #define TIME_TYPE timeval
-    #ifndef ARCH_SIZE
-        #ifdef __LP64__
-            #define ARCH_SIZE 64
-        #else
-            #define ARCH_SIZE 32
-        #endif
-    #endif
+    #define NOMINMAX
+    // Disable warning: the inline specifier cannot be used when a friend
+    // declaration refers to a specialization of a function template
+    #pragma warning( disable : 4396 )
+#endif
+#if defined(USE_LINUX) || defined(USE_MAC)
+    #include <pthread.h>
+    #include <unistd.h>
+#endif
+#ifdef USE_MAC
+    // https://developer.apple.com/library/mac/#releasenotes/Performance/RN-AffinityAPI
+    // http://plugins.svn.wordpress.org/wp-xhprof-profiler/trunk/facebook-xhprof/extension/xhprof.c
+    #include <mach/mach_init.h>
+    #include <mach/thread_policy.h>
+    #define cpu_set_t thread_affinity_policy_data_t
+    #define CPU_SET( cpu_id, new_mask ) *new_mask.affinity_tag = ( cpu_id + 1 )
+    #define CPU_ZERO( new_mask ) ( *( new_mask ) ).affinity_tag = THREAD_AFFINITY_TAG_NULL
+    #define sched_setaffinity( pid, size, mask ) \
+        thread_policy_set(                       \
+            mach_thread_self(), THREAD_AFFINITY_POLICY, mask, THREAD_AFFINITY_POLICY_COUNT )
+    #define sched_getaffinity( pid, size, mask ) \
+        thread_policy_get(                       \
+            mach_thread_self(), THREAD_AFFINITY_POLICY, mask, THREAD_AFFINITY_POLICY_COUNT )
+#endif
+// clang-format on
+
+
+// Set some macros
+#if PROFILE_THREADPOOL_PERFORMANCE
+    #define PROFILE_THREADPOOL_START( X )  PROFILE_START( X, 3 )
+    #define PROFILE_THREADPOOL_START2( X ) PROFILE_START2( X, 3 )
+    #define PROFILE_THREADPOOL_STOP( X )   PROFILE_STOP( X, 3 )
+    #define PROFILE_THREADPOOL_STOP2( X )  PROFILE_STOP2( X, 3 )
 #else
-    #error Unknown OS
+    #define PROFILE_THREADPOOL_START( X ) \
+        do {                          \
+        } while ( 0 )
+    #define PROFILE_THREADPOOL_START2( X ) \
+        do {                           \
+        } while ( 0 )
+    #define PROFILE_THREADPOOL_STOP( X ) \
+        do {                         \
+        } while ( 0 )
+    #define PROFILE_THREADPOOL_STOP2( X ) \
+        do {                          \
+        } while ( 0 )
+#endif
+#if MONITOR_THREADPOOL_PERFORMANCE == 1
+    #define accumulate( x, t1, t2 ) AtomicOperations::atomic_add( &x, \
+        std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count() );
 #endif
 
 
-// Check the ARCH_SIZE and set macros
-// Note: ARCH_SIZE must match the number of bits in size_t
-#if ARCH_SIZE == 64
-    // 32-bit macros
-#elif ARCH_SIZE == 32
-    // 64-bit macros
-#else
-    #error Cannot identify 32 vs 64-bit
+#if MONITOR_THREADPOOL_PERFORMANCE == 1
+    static AtomicOperations::int64_atomic total_add_work_time[5] = {0,0,0,0,0};
 #endif
 
 
-#define MAX(a,b) (((a) > (b)) ? (a) : (b))
-#define MIN(a,b) (((a) < (b)) ? (a) : (b))
-
-
-#if MONITOR_THREADPOOL_PERFORMANCE==1
-    static TIME_TYPE frequency;            // Clock frequency (only used for windows)
-    static double total_add_work_time[3] = {0,0,0};
-#endif
-
-
-
 // Helper functions
-template <class T>  void quicksort(std::vector<T> &x);
-static inline bool find_id(const std::vector<ThreadPool::thread_id_t> &x_in, const ThreadPool::thread_id_t &id );
+template <class T>
+void quicksort( int N, T* data );
+template <class T>
+inline void quicksort( std::vector<T> &x ) { quicksort((int)x.size(),x.data()); }
+static inline int find_id( int, const ThreadPool::thread_id_t*, const ThreadPool::thread_id_t& );
 
 
 // Function to generate a random size_t number (excluding 0 and ~0)
-static size_t rand_size_t() 
+static size_t rand_size_t()
 {
     size_t key = 0;
     double tmp = 1;
-    if ( sizeof(size_t)==4 ) {
+    if ( sizeof( size_t ) == 4 ) {
         while ( tmp < 4e9 ) {
-            key ^= rand()*0x9E3779B9;   // 2^32*0.5*(sqrt(5)-1)
+            key ^= rand() * 0x9E3779B9; // 2^32*0.5*(sqrt(5)-1)
             tmp *= RAND_MAX;
         }
-    } else if ( sizeof(size_t)==8 ) {
+    } else if ( sizeof( size_t ) == 8 ) {
         while ( tmp < 1.8e19 ) {
-            key ^= rand()*0x9E3779B97F4A7C15;  // 2^64*0.5*(sqrt(5)-1)
+            key ^= rand() * 0x9E3779B97F4A7C15; // 2^64*0.5*(sqrt(5)-1)
             tmp *= RAND_MAX;
         }
     } else {
-        throw std::logic_error("Unhandled case");
+        throw std::logic_error( "Unhandled case" );
     }
-    if ( key==0 || (~key)==0 )
+    if ( key == 0 || ( ~key ) == 0 )
         key = rand_size_t();
     return key;
 }
@@ -131,317 +118,75 @@ static size_t rand_size_t()
 /******************************************************************
 * Run some basic compile-time checks                              *
 ******************************************************************/
-#if MAX_NUM_THREADS%64 != 0
-    // We use a bit array for d_active and d_cancel
-    #error MAX_NUM_THREADS must be a multiple of 64
+#if MAX_NUM_THREADS % 64 != 0
+// We use a bit array for d_active and d_cancel
+#error MAX_NUM_THREADS must be a multiple of 64
 #endif
 #if MAX_NUM_THREADS >= 65535
-    // We store N_threads as a short int 
-    #error MAX_NUM_THREADS must < 65535
+// We store N_threads as a short int
+#error MAX_NUM_THREADS must < 65535
 #endif
 #if MAX_QUEUED >= 65535
-    // We store the indicies to the queue list as short ints
-    #error MAX_QUEUED must < 65535
+// We store the indices to the queue list as short ints
+#error MAX_QUEUED must < 65535
 #endif
 
 
-/******************************************************************
-* Convert a string to binary                                      *
-******************************************************************/
-template<class T>
-static inline std::string convert_binary(T x) {
-    char buffer[65];
-    T mask = ((size_t)1)<<(8*sizeof(T)-1);
-    for (size_t i=0; i<8*sizeof(T); i++) {
-        if ( ( x & mask ) == 0 )
-            buffer[i] = '0';
-        else
-            buffer[i] = '1';
-        mask >>= 1;
-    }
-    buffer[8*sizeof(T)] = 0;
-    return std::string(buffer);
-}
-
-
 /******************************************************************
 * Get/Set a bit                                                   *
+* Note: these functions are thread-safe                           *
 ******************************************************************/
-static inline void set_bit( volatile ThreadPool::uint64* x, size_t index, bool val )
+static inline void set_bit( volatile AtomicOperations::int64_atomic *x, size_t index )
 {
-    ThreadPool::uint64 mask = 0x01;
-    mask <<= index%64;
-    if ( val )
-        x[index/64] |= mask;
-    else
-        x[index/64] &= ~mask;
-}
-static inline bool get_bit( const volatile ThreadPool::uint64* x, size_t index )
-{
-    ThreadPool::uint64 mask = 0x01;
-    mask <<= index%64;
-    return (x[index/64]&mask)!=0;
-}
-
-
-
-/******************************************************************
-* Some mutex helper functions                                     *
-******************************************************************/
-#if defined(USE_LINUX) || defined(USE_MAC)
-    // Store a set of global attributes for the thread pool
-    static pthread_mutexattr_t threadpool_global_attr;
-    static int initialize_threadpool_global_attr() {
-        pthread_mutexattr_init(&threadpool_global_attr);
-        #ifdef __USE_UNIX98
-            pthread_mutexattr_settype( &threadpool_global_attr, PTHREAD_MUTEX_ERRORCHECK );
-        #endif
-        return 1;
-    }
-    static int threadpool_global_attr_dummy = 0;
-    static inline void throw_pthread_error( std::string msg, int value ) {
-        std::string code;
-        if ( value==0 ) {
-            code = "SUCCESS";
-        } else if ( value==EINVAL ) {
-            code = "EINVAL";
-        } else if ( value==EBUSY ) {
-            code = "EBUSY";
-        } else if ( value==EAGAIN ) {
-            code = "EAGAIN";
-        } else if ( value==EDEADLK ) {
-            code = "EDEADLK";
-        } else if ( value==EPERM ) {
-            code = "EPERM";
-        } else {
-            char tmp[100];
-            sprintf(tmp,"Unknown (%i)",value);
-            code = std::string(tmp);
-        }
-        throw std::logic_error(msg+code);
-    }
-#endif
-#ifdef USE_WINDOWS
-    static inline void lock_mutex( CRITICAL_SECTION *lock ) {
-        EnterCriticalSection(lock); 
-    }
-    static inline void unlock_mutex( CRITICAL_SECTION *lock ) {
-        LeaveCriticalSection(lock);
-    }
-    static CRITICAL_SECTION* create_mutex( ) {
-        CRITICAL_SECTION *lock = new CRITICAL_SECTION;
-        if (!InitializeCriticalSectionAndSpinCount(lock,0x00000400) ) 
-            throw std::exception();
-        return lock;
-    }
-    static void destroy_mutex( CRITICAL_SECTION *lock ) {
-        DeleteCriticalSection(lock);
-        delete lock;
-    }
-#elif defined(USE_LINUX) || defined(USE_MAC)
-    static inline void lock_mutex( pthread_mutex_t *lock ) {
-        int retval = pthread_mutex_lock(lock);
-        if ( retval != 0 )
-            throw_pthread_error("Error locking mutex: ",retval);
-    }
-    static inline void unlock_mutex( pthread_mutex_t *lock ) {
-        int retval = pthread_mutex_unlock(lock);
-        if ( retval != 0 )
-            throw_pthread_error("Error unlocking mutex: ",retval);
-    }
-    static pthread_mutex_t* create_mutex( ) {
-        pthread_mutex_t* lock = NULL;
-        #if defined(USE_LINUX) || defined(USE_MAC)
-            if (threadpool_global_attr_dummy!=1)
-                threadpool_global_attr_dummy = initialize_threadpool_global_attr();
-        #endif
-        // We are creating a new mutex
-        lock = new pthread_mutex_t;
-        int error = pthread_mutex_init(lock,&threadpool_global_attr);
-        if ( error != 0 )
-            throw_pthread_error("Error initializing mutex: ",error);
-        return lock;
-    }
-    static void destroy_mutex( pthread_mutex_t* lock ) {
-        pthread_mutex_destroy(lock);
-        delete lock;
-    }
-#else
-    #error Unknown OS
-#endif
-
-
-/******************************************************************
-* Mutex class                                                     *
-******************************************************************/
-Mutex::Mutex()
-{
-    d_lock = create_mutex();
-    d_recursive = false;
-    d_count = new int;
-    d_lock_count = new int;
-    d_thread = new size_t;
-    *d_count = 1;
-    *d_lock_count = 0;
-    *d_thread = 0;
-}
-Mutex::Mutex(bool recursive)
-{
-    d_lock = create_mutex();
-    d_recursive = recursive;
-    d_count = new int;
-    d_lock_count = new int;
-    d_thread = new size_t;
-    *d_count = 1;
-    *d_lock_count = 0;
-    *d_thread = 0;
-}
-Mutex::Mutex(const Mutex& rhs)
-{
-    rhs.lock();
-    d_lock = rhs.d_lock;
-    d_count = rhs.d_count;
-    d_recursive = rhs.d_recursive;
-    d_lock_count = rhs.d_lock_count;
-    d_thread = rhs.d_thread;
-    ++(*d_count);
-    rhs.unlock();
-}
-Mutex& Mutex::operator=(const Mutex& rhs)
-{
-    if (this == &rhs) // protect against invalid self-assignment
-        return *this;
-    rhs.lock();
-    this->d_lock = rhs.d_lock;
-    this->d_count = rhs.d_count;
-    this->d_recursive = rhs.d_recursive;
-    this->d_lock_count = rhs.d_lock_count;
-    this->d_thread = rhs.d_thread;
-    ++(*this->d_count);
-    rhs.unlock();
-    return *this;
-}
-Mutex::~Mutex()
-{
-    lock();
-    bool destroy = (*d_count)==1;
-    (*d_count)--;
-    unlock();
-    if ( destroy ) {
-        delete d_count;
-        delete d_lock_count;
-        delete d_thread;
-        destroy_mutex(d_lock);
+    uint64_t mask = 0x01;
+    mask <<= index % 64;
+    size_t i = index / 64;
+    bool test = false;
+    while ( !test ) {
+        AtomicOperations::int64_atomic y = x[i];
+        test = AtomicOperations::atomic_compare_and_swap( &x[i], y, (y|mask) );
     }
 }
-void Mutex::lock() const
+static inline void unset_bit( volatile AtomicOperations::int64_atomic *x, size_t index )
 {
-    // Check if we already own the lock
-    size_t id = ThreadPool::getThreadId();
-    if ( *d_lock_count>0 && *d_thread==id ) {
-        if ( !d_recursive )
-            throw std::logic_error("Lock is already locked and non-recursive");
-        // Increment the lock count and return
-        ++(*d_lock_count);
-        return;
-    }
-    // Acquire the lock
-    lock_mutex(d_lock);
-    if ( *d_lock_count != 0 )   // If we are getting the lock, the count must be 0
-        throw std::logic_error("Internal error");
-    *d_lock_count = 1;  // Change lock count after acquiring mutex
-    *d_thread = id;
-}
-bool Mutex::tryLock() const
-{
-    // Check if we already own the lock
-    size_t id = ThreadPool::getThreadId();
-    if ( *d_lock_count>0 && *d_thread==id ) {
-        if ( !d_recursive )
-            return false;
-        // Increment the lock count and return
-        ++(*d_lock_count);
-        return true;
-    }
-    // Try and acquire the lock
-    #ifdef USE_WINDOWS
-        bool success = TryEnterCriticalSection(d_lock)!=0;
-    #elif defined(USE_LINUX) || defined(USE_MAC)
-        bool success = pthread_mutex_trylock(const_cast<pthread_mutex_t*>(d_lock))==0;
-    #else
-        #error Unknown OS
-    #endif
-    if ( success ) {
-        if ( *d_lock_count != 0 )   // If we are getting the lock, the count must be 0
-            throw std::logic_error("Internal error");
-        *d_lock_count = 1;  // Chage lock count after acquiring mutex
-        *d_thread = id;
-    }
-    return success;
-}
-void Mutex::unlock() const
-{
-    // Check if we already own the lock
-    size_t id = ThreadPool::getThreadId();
-    if ( *d_lock_count <= 0 )
-        throw std::logic_error("Trying to release a lock that has not been locked");
-    if ( *d_thread != id )
-        throw std::logic_error("Thread that does not own lock is attempting to release");
-    // Release the lock
-    --(*d_lock_count);  // Change lock count before releasing mutex
-    if ( *d_lock_count == 0 ) {
-        *d_thread = 0;
-        unlock_mutex(d_lock);
+    uint64_t mask = 0x01;
+    mask <<= index % 64;
+    mask = ~mask;
+    size_t i = index / 64;
+    bool test = false;
+    while ( !test ) {
+        AtomicOperations::int64_atomic y = x[i];
+        test = AtomicOperations::atomic_compare_and_swap( &x[i], y, (y&mask) );
     }
 }
-bool Mutex::ownLock() const
+static inline bool get_bit( const volatile AtomicOperations::int64_atomic *x, size_t index )
 {
-    size_t id = ThreadPool::getThreadId();
-    if ( *d_lock_count>0 && *d_thread==id )
-        return true;
-    return false;
+    uint64_t mask = 0x01;
+    mask <<= index % 64;
+    AtomicOperations::int64_atomic y = x[index / 64];   // This is thread-safe since we only care about a single bit
+    return ( y & mask ) != 0;
 }
 
 
-/******************************************************************
-* Functions to deal with the signaling                            *
-******************************************************************/
-#ifdef USE_WINDOWS
-    static inline bool SIGNAL_EVENT(HANDLE event) {
-        SetEvent(event);
-         return false;
-    }
-#elif defined(USE_LINUX) || defined(USE_MAC)
-    static inline bool SIGNAL_EVENT(pthread_cond_t *event) {
-        int retval = pthread_cond_signal(event);
-        if ( retval == -1 ) {
-            perr << "Error signaling event\n";
-            return true;
-        }
-        return false;
-    }
-#else
-    #error Not programmed
-#endif
-
-
 /******************************************************************
 * Simple function to check if the parity is odd (true) or even    *
 ******************************************************************/
-static inline bool is_odd8(size_t x) {  // This only works for 64-bit integers
-    x ^= (x >> 1);
-    x ^= (x >> 2);
-    x ^= (x >> 4);
-    x ^= (x >> 8);
-    x ^= (x >> 16);
-    x ^= (x >> 32);
-    return (x & 0x01) > 0;
+static inline bool is_odd8( size_t x )
+{ // This only works for 64-bit integers
+    x ^= ( x >> 1 );
+    x ^= ( x >> 2 );
+    x ^= ( x >> 4 );
+    x ^= ( x >> 8 );
+    x ^= ( x >> 16 );
+    x ^= ( x >> 32 );
+    return ( x & 0x01 ) > 0;
 }
-template<class int_type>
-static inline int count_bits(int_type x) {
+template <class int_type>
+static inline int count_bits( int_type x )
+{
     int count = 0;
-    for (size_t i=0; i<8*sizeof(int_type); i++) {
-        if ( (x>>i)&0x01 )
+    for ( size_t i = 0; i < 8 * sizeof( int_type ); i++ ) {
+        if ( ( x >> i ) & 0x01 )
             ++count;
     }
     return count;
@@ -452,35 +197,38 @@ static inline int count_bits(int_type x) {
 * Set the bahvior of OS warnings                                  *
 ******************************************************************/
 static int global_OS_behavior = 0;
+std::mutex OS_warning_mutex;
 void ThreadPool::set_OS_warnings( int behavior )
 {
-    ASSERT(behavior>=0&&behavior<=2);
+    ASSERT( behavior >= 0 && behavior <= 2 );
     global_OS_behavior = behavior;
 }
-static void OS_warning( const std::string& message )
+static void OS_warning( const std::string &message )
 {
-    if ( global_OS_behavior==0 ) {
+    OS_warning_mutex.lock();
+    if ( global_OS_behavior == 0 ) {
         pout << "Warning: " << message << std::endl;
-    } else if ( global_OS_behavior==2 ) {
+    } else if ( global_OS_behavior == 2 ) {
         perr << "Error: " << message << std::endl;
     }
+    OS_warning_mutex.unlock();
 }
 
 
 /******************************************************************
-* Function to return the number of prcessors availible            *
+* Function to return the number of processors available           *
 ******************************************************************/
 int ThreadPool::getNumberOfProcessors()
 {
-    #if defined(USE_LINUX) || defined(USE_MAC)
-        return sysconf( _SC_NPROCESSORS_ONLN );
-    #elif defined(USE_WINDOWS)
-        SYSTEM_INFO sysinfo;
-        GetSystemInfo( &sysinfo );
-        return static_cast<int>(sysinfo.dwNumberOfProcessors);
-    #else
-        #error Unknown OS
-    #endif
+#if defined( USE_LINUX ) || defined( USE_MAC )
+    return sysconf( _SC_NPROCESSORS_ONLN );
+#elif defined( USE_WINDOWS )
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo( &sysinfo );
+    return static_cast<int>( sysinfo.dwNumberOfProcessors );
+#else
+#error Unknown OS
+#endif
 }
 
 
@@ -489,16 +237,16 @@ int ThreadPool::getNumberOfProcessors()
 ******************************************************************/
 int ThreadPool::getCurrentProcessor()
 {
-    #if defined(USE_LINUX) 
-        return sched_getcpu()+1;
-    #elif defined(USE_MAC)
-        OS_warning("MAC does not support getCurrentProcessor");
-        return 0;
-    #elif defined(USE_WINDOWS)
-        return GetCurrentProcessorNumber()+1;
-    #else
-        #error Unknown OS
-    #endif
+#if defined( USE_LINUX )
+    return sched_getcpu() + 1;
+#elif defined( USE_MAC )
+    OS_warning( "MAC does not support getCurrentProcessor" );
+    return 0;
+#elif defined( USE_WINDOWS )
+    return GetCurrentProcessorNumber() + 1;
+#else
+#error Unknown OS
+#endif
 }
 
 
@@ -508,71 +256,71 @@ int ThreadPool::getCurrentProcessor()
 std::vector<int> ThreadPool::getProcessAffinity()
 {
     std::vector<int> procs;
-    #ifdef USE_LINUX
-        #ifdef _GNU_SOURCE
-            cpu_set_t mask;
-            int error = sched_getaffinity(getpid(), sizeof(cpu_set_t), &mask );
-            if ( error!=0 )
-                throw std::logic_error("Error getting process affinity");
-            for (int i=0; i<(int)sizeof(cpu_set_t)*CHAR_BIT; i++) {
-                if ( CPU_ISSET(i,&mask) )
-                    procs.push_back(i);
-            }
-        #else
-            #warning sched_getaffinity is not supported for this compiler/OS
-            OS_warning("sched_getaffinity is not supported for this compiler/OS");
-            procs.clear();
-        #endif
-    #elif defined(USE_MAC)
-        // MAC does not support getting or setting the affinity
-        OS_warning("MAC does not support getting the process affinity");
-        procs.clear();
-    #elif defined(USE_WINDOWS)
-        HANDLE hProc = GetCurrentProcess();
-        size_t procMask;
-        size_t sysMask;
-        PDWORD_PTR procMaskPtr = reinterpret_cast<PDWORD_PTR>(&procMask);
-        PDWORD_PTR sysMaskPtr  = reinterpret_cast<PDWORD_PTR>(&sysMask);
-        GetProcessAffinityMask(hProc,procMaskPtr,sysMaskPtr);
-        for (int i=0; i<(int)sizeof(size_t)*CHAR_BIT; i++) {
-            if ( (procMask&0x1) != 0 )
-                procs.push_back(i);
-            procMask >>= 1;
-        }
-    #else
-        #error Unknown OS
-    #endif
+#ifdef USE_LINUX
+#ifdef _GNU_SOURCE
+    cpu_set_t mask;
+    int error = sched_getaffinity( getpid(), sizeof( cpu_set_t ), &mask );
+    if ( error != 0 )
+        throw std::logic_error( "Error getting process affinity" );
+    for ( int i = 0; i < (int) sizeof( cpu_set_t ) * CHAR_BIT; i++ ) {
+        if ( CPU_ISSET( i, &mask ) )
+            procs.push_back( i );
+    }
+#else
+#warning sched_getaffinity is not supported for this compiler/OS
+    OS_warning( "sched_getaffinity is not supported for this compiler/OS" );
+    procs.clear();
+#endif
+#elif defined( USE_MAC )
+    // MAC does not support getting or setting the affinity
+    OS_warning( "MAC does not support getting the process affinity" );
+    procs.clear();
+#elif defined( USE_WINDOWS )
+    HANDLE hProc = GetCurrentProcess();
+    size_t procMask;
+    size_t sysMask;
+    PDWORD_PTR procMaskPtr = reinterpret_cast<PDWORD_PTR>( &procMask );
+    PDWORD_PTR sysMaskPtr  = reinterpret_cast<PDWORD_PTR>( &sysMask );
+    GetProcessAffinityMask( hProc, procMaskPtr, sysMaskPtr );
+    for ( int i = 0; i < (int) sizeof( size_t ) * CHAR_BIT; i++ ) {
+        if ( ( procMask & 0x1 ) != 0 )
+            procs.push_back( i );
+        procMask >>= 1;
+    }
+#else
+#error Unknown OS
+#endif
     return procs;
 }
 void ThreadPool::setProcessAffinity( std::vector<int> procs )
 {
-    #ifdef USE_LINUX
-        #ifdef _GNU_SOURCE
-            cpu_set_t mask;
-            CPU_ZERO(&mask);
-            for (size_t i=0; i<procs.size(); i++)
-                CPU_SET(procs[i],&mask);
-            int error = sched_setaffinity(getpid(), sizeof(cpu_set_t), &mask );
-            if ( error!=0 )
-                throw std::logic_error("Error setting process affinity");
-        #else
-            #warning sched_setaffinity is not supported for this compiler/OS
-            OS_warning("sched_setaffinity is not supported for this compiler/OS");
-            procs.clear();
-        #endif
-    #elif defined(USE_MAC)
-        // MAC does not support getting or setting the affinity
-        OS_warning("Warning: MAC does not support setting the process affinity");
-        procs.clear();
-    #elif defined(USE_WINDOWS)
-        DWORD mask = 0;
-        for (size_t i=0; i<procs.size(); i++)
-            mask |= ((DWORD)1) << procs[i];
-        HANDLE hProc = GetCurrentProcess();
-        SetProcessAffinityMask( hProc, mask );
-    #else
-        #error Unknown OS
-    #endif
+#ifdef USE_LINUX
+#ifdef _GNU_SOURCE
+    cpu_set_t mask;
+    CPU_ZERO( &mask );
+    for ( size_t i = 0; i < procs.size(); i++ )
+        CPU_SET( procs[i], &mask );
+    int error = sched_setaffinity( getpid(), sizeof( cpu_set_t ), &mask );
+    if ( error != 0 )
+        throw std::logic_error( "Error setting process affinity" );
+#else
+#warning sched_setaffinity is not supported for this compiler/OS
+    OS_warning( "sched_setaffinity is not supported for this compiler/OS" );
+    procs.clear();
+#endif
+#elif defined( USE_MAC )
+    // MAC does not support getting or setting the affinity
+    OS_warning( "MAC does not support setting the process affinity" );
+    procs.clear();
+#elif defined( USE_WINDOWS )
+    DWORD_PTR mask = 0; // pointer-sized: SetProcessAffinityMask takes a DWORD_PTR (64 bits on Win64)
+    for ( size_t i = 0; i < procs.size(); i++ )
+        mask |= static_cast<DWORD_PTR>( 1 ) << procs[i];
+    HANDLE hProc = GetCurrentProcess();
+    SetProcessAffinityMask( hProc, mask );
+#else
+#error Unknown OS
+#endif
 }
 
 
@@ -580,98 +328,96 @@ void ThreadPool::setProcessAffinity( std::vector<int> procs )
 * Function to get the thread affinities                           *
 ******************************************************************/
 #ifdef USE_WINDOWS
-    DWORD GetThreadAffinityMask(HANDLE thread)
-    {
-        DWORD mask = 1;
-        DWORD old = 0;
-        // try every CPU one by one until one works or none are left
-        while(mask)
-        {
-            old = SetThreadAffinityMask(thread, mask);
-            if(old)
-            {   // this one worked
-                SetThreadAffinityMask(thread, old); // restore original
-                return old;
-            }
-            else
-            {
-                if(GetLastError() != ERROR_INVALID_PARAMETER)
-                    return 0; // fatal error, might as well throw an exception
-            }
-            mask <<= 1;
+DWORD GetThreadAffinityMask( HANDLE thread )
+{
+    DWORD mask = 1;
+    DWORD old  = 0;
+    // try every CPU one by one until one works or none are left
+    while ( mask ) {
+        old = static_cast<DWORD>( SetThreadAffinityMask( thread, mask ) );
+        if ( old ) {                              // this one worked
+            SetThreadAffinityMask( thread, old ); // restore original
+            return old;
+        } else {
+            if ( GetLastError() != ERROR_INVALID_PARAMETER )
+                return 0; // fatal error, might as well throw an exception
         }
-
-        return 0;
+        mask <<= 1;
     }
+
+    return 0;
+}
 #endif
 std::vector<int> ThreadPool::getThreadAffinity()
 {
     std::vector<int> procs;
-    #ifdef USE_LINUX
-        #ifdef _GNU_SOURCE
-            cpu_set_t mask;
-            int error = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask );
-            if ( error!=0 )
-                throw std::logic_error("Error getting thread affinity");
-            for (int i=0; i<(int)sizeof(cpu_set_t)*CHAR_BIT; i++) {
-                if ( CPU_ISSET(i,&mask) )
-                    procs.push_back(i);
-            }
-        #else
-            #warning pthread_getaffinity_np is not supported
-            OS_warning("pthread does not support pthread_getaffinity_np");
-            procs.clear();
-        #endif
-    #elif defined(USE_MAC)
-        // MAC does not support getting or setting the affinity
-        OS_warning("MAC does not support getting the thread affinity");
-        procs.clear();
-    #elif defined(USE_WINDOWS)
-        size_t procMask = GetThreadAffinityMask(GetCurrentThread());
-        for (int i=0; i<(int)sizeof(size_t)*CHAR_BIT; i++) {
-            if ( (procMask&0x1) != 0 )
-                procs.push_back(i);
-            procMask >>= 1;
-        }
-    #else
-        #error Unknown OS
-    #endif
+#ifdef USE_LINUX
+#ifdef _GNU_SOURCE
+    cpu_set_t mask;
+    int error = pthread_getaffinity_np( pthread_self(), sizeof( cpu_set_t ), &mask );
+    if ( error != 0 )
+        throw std::logic_error( "Error getting thread affinity" );
+    for ( int i = 0; i < (int) sizeof( cpu_set_t ) * CHAR_BIT; i++ ) {
+        if ( CPU_ISSET( i, &mask ) )
+            procs.push_back( i );
+    }
+#else
+#warning pthread_getaffinity_np is not supported
+    OS_warning( "pthread does not support pthread_getaffinity_np" );
+    procs.clear();
+#endif
+#elif defined( USE_MAC )
+    // MAC does not support getting or setting the affinity
+    OS_warning( "MAC does not support getting the thread affinity" );
+    procs.clear();
+#elif defined( USE_WINDOWS )
+    size_t procMask = GetThreadAffinityMask( GetCurrentThread() );
+    for ( int i = 0; i < (int) sizeof( size_t ) * CHAR_BIT; i++ ) {
+        if ( ( procMask & 0x1 ) != 0 )
+            procs.push_back( i );
+        procMask >>= 1;
+    }
+#else
+#error Unknown OS
+#endif
     return procs;
 }
 std::vector<int> ThreadPool::getThreadAffinity( int thread ) const
 {
     if ( thread >= getNumThreads() )
-        std::logic_error("Invalid thread number");
+        throw std::logic_error( "Invalid thread number" );
     std::vector<int> procs;
-    #ifdef USE_LINUX
-        #ifdef _GNU_SOURCE
-            cpu_set_t mask;
-            int error = pthread_getaffinity_np(d_hThread[thread], sizeof(cpu_set_t), &mask );
-            if ( error!=0 )
-                throw std::logic_error("Error getting thread affinity");
-            for (int i=0; i<(int)sizeof(cpu_set_t)*CHAR_BIT; i++) {
-                if ( CPU_ISSET(i,&mask) )
-                    procs.push_back(i);
-            }
-        #else
-            #warning pthread_getaffinity_np is not supported
-            OS_warning("pthread does not support pthread_getaffinity_np");
-            procs.clear();
-        #endif
-    #elif defined(USE_MAC)
-        // MAC does not support getting or setting the affinity
-        OS_warning("MAC does not support getting the thread affinity");
-        procs.clear();
-    #elif defined(USE_WINDOWS)
-        size_t procMask = GetThreadAffinityMask(d_hThread[thread]);
-        for (int i=0; i<(int)sizeof(size_t)*CHAR_BIT; i++) {
-            if ( (procMask&0x1) != 0 )
-                procs.push_back(i);
-            procMask >>= 1;
-        }
-    #else
-        #error Unknown OS
-    #endif
+    auto handle = const_cast<std::thread&>( d_thread[thread] ).native_handle();
+#ifdef USE_LINUX
+#ifdef _GNU_SOURCE
+    cpu_set_t mask;
+    int error = pthread_getaffinity_np( handle, sizeof( cpu_set_t ), &mask );
+    if ( error != 0 )
+        throw std::logic_error( "Error getting thread affinity" );
+    for ( int i = 0; i < (int) sizeof( cpu_set_t ) * CHAR_BIT; i++ ) {
+        if ( CPU_ISSET( i, &mask ) )
+            procs.push_back( i );
+    }
+#else
+#warning pthread_getaffinity_np is not supported
+    OS_warning( "pthread does not support pthread_getaffinity_np" );
+    procs.clear();
+#endif
+#elif defined( USE_MAC )
+    // MAC does not support getting or setting the affinity
+    NULL_USE( handle );
+    OS_warning( "MAC does not support getting the thread affinity" );
+    procs.clear();
+#elif defined( USE_WINDOWS )
+    size_t procMask = GetThreadAffinityMask( handle );
+    for ( int i = 0; i < (int) sizeof( size_t ) * CHAR_BIT; i++ ) {
+        if ( ( procMask & 0x1 ) != 0 )
+            procs.push_back( i );
+        procMask >>= 1;
+    }
+#else
+#error Unknown OS
+#endif
     return procs;
 }
 
@@ -681,124 +427,125 @@ std::vector<int> ThreadPool::getThreadAffinity( int thread ) const
 ******************************************************************/
 void ThreadPool::setThreadAffinity( std::vector<int> procs )
 {
-    #ifdef USE_LINUX
-        #ifdef _GNU_SOURCE
-            cpu_set_t mask;
-            CPU_ZERO(&mask);
-            for (size_t i=0; i<procs.size(); i++)
-                CPU_SET(procs[i],&mask);
-            int error = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask );
-            if ( error!=0 )
-                throw std::logic_error("Error setting thread affinity");
-        #else
-            #warning pthread_getaffinity_np is not supported
-            OS_warning("pthread does not support pthread_setaffinity_np");
-            procs.clear();
-        #endif
-    #elif defined(USE_MAC)
-        // MAC does not support getting or setting the affinity
-        NULL_USE(procs);
-        OS_warning("MAC does not support setting the thread affinity");
-    #elif defined(USE_WINDOWS)
-        DWORD mask = 0;
-        for (size_t i=0; i<procs.size(); i++)
-            mask |= ((DWORD)1) << procs[i];
-        SetThreadAffinityMask( GetCurrentThread(), mask );
-    #else
-        #error Unknown OS
-    #endif
+#ifdef USE_LINUX
+#ifdef _GNU_SOURCE
+    cpu_set_t mask;
+    CPU_ZERO( &mask );
+    for ( size_t i = 0; i < procs.size(); i++ )
+        CPU_SET( procs[i], &mask );
+    int error = pthread_setaffinity_np( pthread_self(), sizeof( cpu_set_t ), &mask );
+    if ( error != 0 )
+        throw std::logic_error( "Error setting thread affinity" );
+#else
+#warning pthread_setaffinity_np is not supported
+    OS_warning( "pthread does not support pthread_setaffinity_np" );
+    procs.clear();
+#endif
+#elif defined( USE_MAC )
+    // MAC does not support getting or setting the affinity
+    NULL_USE( procs );
+    OS_warning( "MAC does not support setting the thread affinity" );
+#elif defined( USE_WINDOWS )
+    DWORD_PTR mask = 0; // pointer-sized: SetThreadAffinityMask takes a DWORD_PTR (64 bits on Win64)
+    for ( size_t i = 0; i < procs.size(); i++ )
+        mask |= static_cast<DWORD_PTR>( 1 ) << procs[i];
+    SetThreadAffinityMask( GetCurrentThread(), mask );
+#else
+#error Unknown OS
+#endif
 }
 void ThreadPool::setThreadAffinity( int thread, std::vector<int> procs ) const
 {
     if ( thread >= getNumThreads() )
-        std::logic_error("Invalid thread number");
-    #ifdef USE_LINUX
-        #ifdef __USE_GNU
-            cpu_set_t mask;
-            CPU_ZERO(&mask);
-            for (size_t i=0; i<procs.size(); i++)
-                CPU_SET(procs[i],&mask);
-            int error = pthread_setaffinity_np(d_hThread[thread], sizeof(cpu_set_t), &mask );
-            if ( error!=0 )
-                throw std::logic_error("Error setting thread affinity");
-        #else
-            #warning pthread_getaffinity_np is not supported
-            OS_warning("pthread does not support pthread_setaffinity_np");
-            procs.clear();
-        #endif
-    #elif defined(USE_MAC)
-        // MAC does not support getting or setting the affinity
-        NULL_USE(procs);
-        OS_warning("MAC does not support getting the process affinity");
-    #elif defined(USE_WINDOWS)
-        DWORD mask = 0;
-        for (size_t i=0; i<procs.size(); i++)
-            mask |= ((DWORD)1) << procs[i];
-        SetThreadAffinityMask( d_hThread[thread], mask );
-    #else
-        #error Unknown OS
-    #endif
+        throw std::logic_error( "Invalid thread number" );
+    auto handle = const_cast<std::thread&>( d_thread[thread] ).native_handle();
+#ifdef USE_LINUX
+#ifdef _GNU_SOURCE
+    cpu_set_t mask;
+    CPU_ZERO( &mask );
+    for ( size_t i = 0; i < procs.size(); i++ )
+        CPU_SET( procs[i], &mask );
+    int error = pthread_setaffinity_np( handle, sizeof( cpu_set_t ), &mask );
+    if ( error != 0 )
+        throw std::logic_error( "Error setting thread affinity" );
+#else
+#warning pthread_setaffinity_np is not supported
+    OS_warning( "pthread does not support pthread_setaffinity_np" );
+    procs.clear();
+#endif
+#elif defined( USE_MAC )
+    // MAC does not support getting or setting the affinity
+    NULL_USE( handle );
+    NULL_USE( procs );
+    OS_warning( "MAC does not support setting the thread affinity" );
+#elif defined( USE_WINDOWS )
+    DWORD_PTR mask = 0; // pointer-sized: SetThreadAffinityMask takes a DWORD_PTR (64 bits on Win64)
+    for ( size_t i = 0; i < procs.size(); i++ )
+        mask |= static_cast<DWORD_PTR>( 1 ) << procs[i];
+    SetThreadAffinityMask( handle, mask );
+#else
+#error Unknown OS
+#endif
 }
 
 
 /******************************************************************
 * Function to perform some basic checks before we start           *
 ******************************************************************/
-void ThreadPool::check_startup(size_t size0) 
+void ThreadPool::check_startup( size_t size0 ) // size0 is compared against sizeof( ThreadPool ); throws std::logic_error if any self-check below fails
 {
     // Check the size of the class to make sure that we don't have any
     // byte alignment problems between a library implimentation and a calling pacakge
-    size_t size1 = sizeof(ThreadPool);
-    size_t size2 = ((size_t)&d_NULL_HEAD)-((size_t)this)+sizeof(size_t);
-    size_t size3 = ((size_t)&d_NULL_TAIL)-((size_t)this)+sizeof(size_t);
-    if ( size0!=size1 || size1<size2 || size1<size3 )
-        throw std::logic_error("Internal data format problem");
-    // Check the size of variables 
-    if ( sizeof(ThreadPool::uint64)!=8 )
-        throw std::logic_error("uint64 is not 64 bits");
-    if ( sizeof(AtomicOperations::int32_atomic)!=4 )
-        throw std::logic_error("AtomicOperations::int32_atomic is not 32 bits");
-    if ( sizeof(AtomicOperations::int64_atomic)!=8 )
-        throw std::logic_error("AtomicOperations::int32_atomic is not 64 bits");
+    size_t size1 = sizeof( ThreadPool );
+    size_t size2 = ( (size_t) &d_NULL_HEAD ) - ( ( size_t ) this ) + sizeof( size_t ); // byte offset of d_NULL_HEAD within the object, plus sizeof( size_t )
+    size_t size3 = ( (size_t) &d_NULL_TAIL ) - ( ( size_t ) this ) + sizeof( size_t ); // byte offset of d_NULL_TAIL within the object, plus sizeof( size_t )
+    if ( size0 != size1 || size1 < size2 || size1 < size3 )
+        throw std::logic_error( "Internal data format problem" );
+    // Check the size of variables
+    if ( sizeof( AtomicOperations::int32_atomic ) != 4 )
+        throw std::logic_error( "AtomicOperations::int32_atomic is not 32 bits" );
+    if ( sizeof( AtomicOperations::int64_atomic ) != 8 )
+        throw std::logic_error( "AtomicOperations::int64_atomic is not 64 bits" );
     // Check getting/setting a bit
-    uint64 x[2] = {0x0,0x7};
-    set_bit(x,2,true);
-    set_bit(x,66,false);
-    if ( x[0]!=4 || x[1]!=3 || !get_bit(x,2) || get_bit(x,66) )
-        throw std::logic_error("Getting/setting a bit failed");
+    atomic_64 x[2] = { 0x0, 0x7 };
+    set_bit( x, 2 ); // expected to turn x[0] from 0 into 4
+    unset_bit( x, 66 ); // expected to turn x[1] from 7 into 3, i.e. index 66 addresses bit 2 of the second word
+    if ( x[0] != 4 || x[1] != 3 || !get_bit( x, 2 ) || get_bit( x, 66 ) )
+        throw std::logic_error( "Getting/setting a bit failed" );
     // Check the thread id
     bool pass = true;
     ThreadPool::thread_id_t id;
-    if ( id.getPriority()!=-128 )
+    if ( id.getPriority() != -128 ) // a default-constructed id is expected to report priority -128
         pass = false;
-    id.reset(3,564,NULL);
-    if ( id.getPriority()!=3 || id.getLocalID()!=564 )
+    id.reset( 3, 564, nullptr );
+    if ( id.getPriority() != 3 || id.getLocalID() != 564 ) // reset() must round-trip the priority and local id
         pass = false;
-    if ( count_bits(0x0)!=0 || count_bits(0x03)!=2 )
+    if ( count_bits( 0x0 ) != 0 || count_bits( 0x03 ) != 2 )
         pass = false;
-    if ( count_bits(~((size_t)0)) != 8*sizeof(size_t) )
+    if ( count_bits( ~( (size_t) 0 ) ) != 8 * sizeof( size_t ) ) // all-ones must count every bit of a size_t
         pass = false;
-    if ( sizeof(size_t)==8 ) {
-        if ( is_odd8(0x0) || !is_odd8(0x02) || is_odd8(0x03) )
+    if ( sizeof( size_t ) == 8 ) {
+        if ( is_odd8( 0x0 ) || !is_odd8( 0x02 ) || is_odd8( 0x03 ) ) // is_odd8 is checked as bit-count parity: 0 and 2 set bits are even, 1 is odd
             pass = false;
-        if ( is_odd8(~((size_t)0)) || !is_odd8(MAXID64))
+        if ( is_odd8( ~( (size_t) 0 ) ) || !is_odd8( thread_id_t::maxThreadID ) )
             pass = false;
-        for (size_t i=0; i<1024; i++) {
-            if ( (count_bits(MAXID64-i)%2==1) != is_odd8(MAXID64-i) ) {
-                printp("%i %i %s\n",count_bits(MAXID64-i),is_odd8(MAXID64-i)?1:0,
-                    convert_binary<unsigned long long int>(MAXID64-i).c_str());
+        for ( size_t i = 0; i < 1024; i++ ) { // cross-check is_odd8 against count_bits for the 1024 ids just below maxThreadID
+            if ( ( count_bits( thread_id_t::maxThreadID - i ) % 2 == 1 ) != is_odd8( thread_id_t::maxThreadID - i ) ) {
+                printp( "%i %i %s\n", count_bits( thread_id_t::maxThreadID - i ), is_odd8( thread_id_t::maxThreadID - i ) ? 1 : 0,
+                    std::bitset<64>( thread_id_t::maxThreadID - i ).to_string().c_str() );
                 pass = false;
             }
         }
     }
-    initialize_id();
-    advance_id(); advance_id(); 
+    d_id_assign = thread_id_t::maxThreadID; // NOTE: this overwrites the member d_id_assign; initialize() sets it to the same starting value
+    AtomicOperations::atomic_decrement( &d_id_assign ); // Advance the id
+    AtomicOperations::atomic_decrement( &d_id_assign ); // Advance the id
     ThreadPool::thread_id_t id2;
-    id2.reset(3,d_id_assign,NULL);
-    if ( isValid(id) || !isValid(id2) )
+    id2.reset( 3, d_id_assign, nullptr );
+    if ( isValid( id ) || !isValid( id2 ) ) // the hand-made id (564) must be rejected, the freshly assigned one accepted
         pass = false;
     if ( !pass ) {
-        throw std::logic_error("Thread pool failed to initialize");
+        throw std::logic_error( "Thread pool failed to initialize" );
     }
 }
 
@@ -806,112 +553,67 @@ void ThreadPool::check_startup(size_t size0)
 /******************************************************************
 * Function to initialize the thread pool                          *
 ******************************************************************/
-void ThreadPool::initialize( const int N, const char* affinity, int N_procs, const int* procs ) 
+void ThreadPool::initialize( const int N, const char *affinity, int N_procs, const int *procs ) // resets the pool's bookkeeping members, then forwards all four arguments unchanged to setNumThreads()
 {
-    // Get the clock frequency
-    #if MONITOR_THREADPOOL_PERFORMANCE==1
-        get_frequency( &frequency );
-    #endif
     // Initialize the header/tail
     d_NULL_HEAD = rand_size_t();
     d_NULL_TAIL = d_NULL_HEAD;
-    for (int i=0; i<MAX_NUM_THREADS; i++)
-        d_hThread[i] = 0;
     // Initialize the variables to NULL values
-    d_id_assign = 0;
+    d_id_assign    = 0; // placeholder only; overwritten with thread_id_t::maxThreadID further down
     d_signal_empty = false;
     d_signal_count = 0;
-    d_N_threads = 0;
-    d_num_active = 0;
-    d_queue_size = 0;
-    d_N_wait = 0;
-    for (int i=0; i<MAX_NUM_THREADS; i++)
-        d_ThreadId[i] = ~((size_t)0);
-    memset((void*)d_active,0,MAX_NUM_THREADS/8);
-    memset((void*)d_cancel,0,MAX_NUM_THREADS/8);
-    for (int i=0; i<MAX_QUEUED; i++) {
-        d_queue_ids[i].reset();
-        d_queue_list[i].reset();
-        d_queue_list[i].position = i;
-        d_queue_list[i].prev = i-1;
-        d_queue_list[i].next = i+1;
-    }
-    d_queue_head = -1;
-    d_queue_free = 0;
-    for (int i=0; i<MAX_WAIT; i++)
-        d_wait[i] = NULL;
-    d_wait_finished = 0;
-    d_lock_queue = 0;
-    for (int i=0; i<MAX_NUM_THREADS; i++)
-        d_hThread[i] = 0;
-    #if defined(USE_LINUX) || defined(USE_MAC)
-        d_queue_not_empty = 0;
-    #endif
+    d_N_threads    = 0; // must be 0 before setNumThreads() computes how many threads to add
+    d_num_active   = 0;
+    d_N_added      = 0;
+    d_N_started    = 0; // incremented by tpool_thread() just before a work item runs
+    d_N_finished   = 0; // incremented by tpool_thread() after a work item has run
+    memset( (void *) d_active, 0, MAX_NUM_THREADS / 8 ); // clears MAX_NUM_THREADS / 8 bytes; presumably one bit per thread — confirm against the member declaration
+    memset( (void *) d_cancel, 0, MAX_NUM_THREADS / 8 ); // same size as d_active; setNumThreads() scans this as MAX_NUM_THREADS / 64 words
+    d_wait_last = nullptr; // the destructor calls delete on this pointer
+    for ( int i     = 0; i < MAX_WAIT; i++ )
+        d_wait[i]   = nullptr; // tpool_thread() skips null slots when notifying waiters
     // Initialize the id
-    initialize_id();
-    // Create the mutex lock and signal variables
-    d_lock_queue = create_mutex();
-    #ifdef USE_WINDOWS
-        d_wait_finished = CreateEvent(NULL,FALSE,FALSE,NULL);
-    #elif defined(USE_LINUX) || defined(USE_MAC)
-        d_queue_not_empty = new pthread_cond_t;
-        d_wait_finished = new pthread_cond_t;
-        int error = pthread_cond_init(d_queue_not_empty,NULL);
-        if ( error == -1 )
-            perr << "Error creating d_queue_not_empty\n";
-        error = pthread_cond_init(d_wait_finished,NULL);
-        if ( error == -1 )
-            perr << "Error creating d_wait_finished\n";
-    #else
-        #error Not programmed
-    #endif
+    d_id_assign = thread_id_t::maxThreadID; // check_startup() advances the id with atomic_decrement, so ids appear to count down from this value
     // Create the threads
-    setNumThreads(N,affinity,N_procs,procs);
+    setNumThreads( N, affinity, N_procs, procs );
 }
 
 
-
 /******************************************************************
 * This is the de-constructor                                      *
 ******************************************************************/
-ThreadPool::~ThreadPool() {
-    if ( !is_valid(this) )
-        throw std::logic_error("Thread pool is not valid");
+ThreadPool::~ThreadPool() // shuts down all worker threads, then invalidates the object so is_valid() rejects it afterwards
+{
+    if ( !is_valid( this ) )
+        throw std::logic_error( "Thread pool is not valid" ); // NOTE(review): destructors are implicitly noexcept since C++11, so this throw would call std::terminate unless the declaration says noexcept(false) — confirm intent
     // Destroy the threads
-    setNumThreads(0);
+    setNumThreads( 0 ); // takes the "delete all threads" path, which first calls wait_pool_finished() and then joins each thread
     // Delete all remaining data
-    destroy_mutex(d_lock_queue);
-    #ifdef USE_WINDOWS
-        CloseHandle(d_wait_finished);
-    #elif defined(USE_LINUX) || defined(USE_MAC)
-        pthread_cond_destroy(d_wait_finished);
-        pthread_cond_destroy(d_queue_not_empty);
-        delete d_queue_not_empty;     d_queue_not_empty=NULL;
-        delete d_wait_finished;       d_wait_finished=NULL;
-    #else
-        #error Not programmed
-    #endif
     d_N_threads = -1;
     d_NULL_HEAD = 0;
     d_NULL_TAIL = 0;
-    // Print the performance metrics
-    #if MONITOR_THREADPOOL_PERFORMANCE==1
-        printp("ThreadPool Performance:\n");
-        printp("add_work: %e %e %e\n",total_add_work_time[0],total_add_work_time[1],total_add_work_time[2]);
-    #endif
+    delete d_wait_last; // initialize() sets this to nullptr; where it is later assigned is not visible in this function
+// Print the performance metrics (five add_work timers, each divided by 1000 and labelled "us")
+#if MONITOR_THREADPOOL_PERFORMANCE == 1
+    printp( "ThreadPool Performance:\n" );
+    printp( "add_work:  %lu us,  %lu us,  %lu us,  %lu us,  %lu us\n",
+        total_add_work_time[0]/1000, total_add_work_time[1]/1000,
+        total_add_work_time[2]/1000, total_add_work_time[3]/1000,
+        total_add_work_time[4]/1000 );
+#endif
 }
 
 
 /******************************************************************
 * Check if the pointer points to a valid thread pool object       *
 ******************************************************************/
-bool ThreadPool::is_valid( const ThreadPool* tpool )
+bool ThreadPool::is_valid( const ThreadPool *tpool ) // returns true only if tpool is non-null and passes the thread-count and sentinel checks below
 {
-    if ( tpool == NULL )
+    if ( tpool == nullptr )
         return false;
-    if ( tpool->d_N_threads<0 || tpool->d_N_threads>MAX_NUM_THREADS )
+    if ( tpool->d_N_threads < 0 || tpool->d_N_threads > MAX_NUM_THREADS ) // the destructor sets d_N_threads to -1, so a destroyed pool fails here
         return false;
-    if ( tpool->d_NULL_HEAD==0 || tpool->d_NULL_HEAD!=tpool->d_NULL_TAIL )
+    if ( tpool->d_NULL_HEAD == 0 || tpool->d_NULL_HEAD != tpool->d_NULL_TAIL ) // initialize() sets head and tail to the same value; the destructor zeroes both
         return false;
     return true;
 }
@@ -920,134 +622,87 @@ bool ThreadPool::is_valid( const ThreadPool* tpool )
 /******************************************************************
 * This function creates the threads in the thread pool            *
 ******************************************************************/
-void ThreadPool::setNumThreads( int num_worker_threads, 
-    const char* affinity2, int N_procs, const int* procs ) 
+void ThreadPool::setNumThreads( // grows or shrinks the pool to num_worker_threads (clamped to [0, MAX_NUM_THREADS]) and then applies the requested affinity model
+    int num_worker_threads, const char *affinity2, int N_procs, const int *procs ) // affinity2: "none" or "independent"; procs[0..N_procs) optionally replaces the detected cpu list
 {
     // Check if we are a member thread
     if ( isMemberThread() )
-        throw std::logic_error("Member threads are not allowed to change the number of threads in the pool");
+        throw std::logic_error(
+            "Member threads are not allowed to change the number of threads in the pool" );
     // Determing the number of threads we need to create or destroy
     if ( num_worker_threads > MAX_NUM_THREADS ) {
-        printp("Warning: Maximum Number of Threads is %i\n",MAX_NUM_THREADS);
-        printp("         Only that number will be created\n");
+        printp( "Warning: Maximum Number of Threads is %i\n", MAX_NUM_THREADS );
+        printp( "         Only that number will be created\n" );
         num_worker_threads = MAX_NUM_THREADS;
     } else if ( num_worker_threads < 0 ) {
-        printp("Error: cannot have a negitive number of threads\n");
-        printp("       Setting the number of threads to 0\n");
+        printp( "Error: cannot have a negitive number of threads\n" );
+        printp( "       Setting the number of threads to 0\n" );
         num_worker_threads = 0;
-    } 
-    int d_N_threads_diff = num_worker_threads-d_N_threads;
+    }
+    int d_N_threads_diff = num_worker_threads - d_N_threads; // > 0: create threads, < 0: destroy threads, 0: only the affinity step applies
     if ( d_N_threads_diff > 0 ) {
-        // Create new threads
-        lock_mutex(d_lock_queue);
         // Check that no threads are in the process of being deleted
-        for (int i=0; i<MAX_NUM_THREADS/64; i++) {
+        for ( int i = 0; i < MAX_NUM_THREADS / 64; i++ ) { // d_cancel is scanned one 64-bit word at a time
             if ( d_cancel[i] != 0 )
-                throw std::logic_error("Threads are being created and destroyed at the same time");
+                throw std::logic_error(
+                    "Threads are being created and destroyed at the same time" );
         }
-        // Create the thread attributes (linux only)
-        #if defined(USE_LINUX) || defined(USE_MAC)
-            pthread_attr_t attr;
-            pthread_attr_init(&attr);
-            //int ptmp;
-            //pthread_attr_setstacksize(&attr,2097152);     // Default stack size is 8MB
-            //pthread_attr_setschedpolicy(&attr,1);
-            //pthread_attr_getschedpolicy(&attr,&ptmp);
-            //pout << "getschedpolicy = " << ptmp << std::endl;
-        #endif
+// Create the thread attributes (linux only). NOTE(review): attr is never passed to the std::thread created below; it is only initialised and destroyed, so it looks vestigial
+#if defined( USE_LINUX ) || defined( USE_MAC )
+        pthread_attr_t attr;
+        pthread_attr_init( &attr );
+// int ptmp;
+// pthread_attr_setstacksize(&attr,2097152);     // Default stack size is 8MB
+// pthread_attr_setschedpolicy(&attr,1);
+// pthread_attr_getschedpolicy(&attr,&ptmp);
+// pout << "getschedpolicy = " << ptmp << std::endl;
+#endif
         // Create the threads
-        void **tmp = new void*[2*d_N_threads_diff];
-        int j = d_N_threads;
-        for (int i=0; i<d_N_threads_diff; i++) {
+        auto tmp = new void *[2 * d_N_threads_diff]; // NOTE(review): tmp is filled and freed below but never read; std::thread receives this and j directly, so it looks vestigial (and would leak if std::thread throws)
+        int j    = d_N_threads; // index of the next thread slot to fill
+        for ( int i = 0; i < d_N_threads_diff; i++ ) {
             d_N_threads++;
-            tmp[0+2*i] = this;
-            tmp[1+2*i] = reinterpret_cast<void*>(static_cast<size_t>(j));
-            bool error = false;
-            set_bit(d_cancel,j,true);
-            #ifdef USE_WINDOWS
-                d_hThread[j] = (HANDLE)_beginthread( create_new_thread, 0, (void *) &tmp[2*i]);
-                error = d_hThread==(HANDLE)(-1);
-            #elif defined(USE_LINUX) || defined(USE_MAC)
-                int rtn = pthread_create( &d_hThread[j], &attr, (void *(*)(void*)) create_new_thread, (void *) &tmp[2*i]);
-                error = rtn!=0;
-            #else
-                #error Not programmed
-            #endif
-            if ( error ) {
-                pout << "Warning: Only able to create " << i << " threads\n";
-                break;
-            }
+            tmp[0 + 2 * i] = this;
+            tmp[1 + 2 * i] = reinterpret_cast<void *>( static_cast<size_t>( j ) );
+            set_bit( d_cancel, j ); // used as a "still starting" flag here: tpool_thread() clears this bit once the thread is running
+            d_thread[j] = std::thread( create_new_thread, this, j );
             j++;
         }
         // Wait for all of the threads to finish initialization
         while ( 1 ) {
-            unlock_mutex(d_lock_queue);
-            Sleep(25);
-            lock_mutex(d_lock_queue);
+            std::this_thread::sleep_for( std::chrono::milliseconds(25) ); // poll interval; the loop exits once every d_cancel word is zero
             bool wait = false;
-            for (int i=0; i<MAX_NUM_THREADS/64; i++) {
+            for ( int i = 0; i < MAX_NUM_THREADS / 64; i++ ) {
                 if ( d_cancel[i] != 0 )
                     wait = true;
             }
-            if ( !wait ) 
+            if ( !wait )
                 break;
         }
-        // Delete the thread attributes (linux only)
-        #if defined(USE_LINUX) || defined(USE_MAC)
-            pthread_attr_destroy(&attr);
-        #endif
-        // Release the lock
-        unlock_mutex(d_lock_queue);
-        Sleep(25);
-        delete [] tmp;
+// Delete the thread attributes (linux only)
+#if defined( USE_LINUX ) || defined( USE_MAC )
+        pthread_attr_destroy( &attr );
+#endif
+        std::this_thread::sleep_for( std::chrono::milliseconds(25) );
+        delete[] tmp;
     } else if ( d_N_threads_diff < 0 ) {
         // Reduce the number of threads
-        if ( num_worker_threads==0 ) {
+        if ( num_worker_threads == 0 ) {
             // Special case if we want to delete all of the threads
             wait_pool_finished();
         }
-        // Lock the mutex for the deletion of existing threads
-        lock_mutex(d_lock_queue);
         // Tell the threads to shutdown
-        for (int i=0; i>d_N_threads_diff; i--)
-            set_bit(d_cancel,d_N_threads-1+i,true);
-        #ifdef USE_WINDOWS
-            // Release the lock
-            unlock_mutex(d_lock_queue);
-            // Wake all threads to process the shutdown (Doesn't require blocking)
-            for (int i=0; i<d_N_threads; i++) {
-                ResumeThread(d_hThread[i]);
-            }
-        #elif defined(USE_LINUX) || defined(USE_MAC)
-            // Wake all threads to process the shutdown
-            int error = pthread_cond_broadcast(d_queue_not_empty);
-            if ( error != 0 )
-                perr << "Error in signaling thread";
-            // Release the lock
-            unlock_mutex(d_lock_queue);
-        #else
-            #error Not programmed
-        #endif
-        Sleep(25);
-        // Wait for all of the threads to close
-        #ifdef USE_WINDOWS
-            int j = d_N_threads+d_N_threads_diff;
-            WaitForMultipleObjects( -d_N_threads_diff, &d_hThread[j], 1, 10000 );
-        #elif defined(USE_LINUX) || defined(USE_MAC)
-            for (int i=0; i>d_N_threads_diff; i--) {
-                int rtn = pthread_join(d_hThread[d_N_threads-1+i],NULL);
-                if ( rtn != 0 ) {
-                    perr << "error\n";
-                    perr << "Error joining threads";
-                }
-            }
-        #else
-            #error Not programmed
-        #endif
-        for (int i=0; i>d_N_threads_diff; i--) {
-            set_bit(d_cancel,d_N_threads-1+i,false);
-            d_hThread[d_N_threads-1+i] = 0;
-            d_ThreadId[d_N_threads-1+i] = ~((size_t)0);
+        for ( int i = 0; i > d_N_threads_diff; i-- ) // i runs 0, -1, ..., so the highest-numbered threads are the ones cancelled
+            set_bit( d_cancel, d_N_threads - 1 + i ); // each worker polls its own d_cancel bit at the end of every loop iteration
+        // Wake all threads to process the shutdown
+        d_wait_work.notify_all();
+        std::this_thread::sleep_for( std::chrono::milliseconds(25) );
+        // Wait for the threads to close
+        for ( int i = 0; i > d_N_threads_diff; i-- ) {
+            d_thread[d_N_threads - 1 + i].join();
+            d_thread[d_N_threads - 1 + i] = std::thread(); // reset the slot to a non-joinable, default-constructed thread
+            unset_bit( d_cancel, d_N_threads - 1 + i ); // clear the flag so a later grow does not see a pending cancel
+            d_threadId[d_N_threads - 1 + i] = std::thread::id();
         }
         d_N_threads += d_N_threads_diff;
     }
@@ -1055,694 +710,432 @@ void ThreadPool::setNumThreads( int num_worker_threads,
         return;
     // Get the default thread affinity to use
     std::vector<int> cpus;
-    int tmp = global_OS_behavior;
+    int tmp            = global_OS_behavior; // saved so it can be restored on every exit path below
     global_OS_behavior = 1;
-    OS_warning("Dummy message (should not print)");
+    OS_warning( "Dummy message (should not print)" );
     try {
         cpus = ThreadPool::getProcessAffinity();
-    } catch(...) {
+    } catch ( ... ) {
         pout << "Warning: Unable to get default cpus for thread affinities\n";
     }
-    if ( !cpus.empty() && N_procs>0 ) {
-        cpus.resize(N_procs);
-        for (int i=0; i<N_procs; i++)
+    if ( !cpus.empty() && N_procs > 0 ) { // a caller-supplied list is only honoured when the process affinity could be read
+        cpus.resize( N_procs );
+        for ( int i = 0; i < N_procs; i++ ) // NOTE(review): procs is dereferenced without a null check when N_procs > 0
             cpus[i] = procs[i];
     }
     // Set the affinity model and the associated thread affinities
     // Note: not all OS's support setting the thread affinities
-    std::vector<std::vector<int> > t_procs(d_N_threads);
-    std::string affinity(affinity2);
+    std::vector<std::vector<int>> t_procs( d_N_threads ); // t_procs[i] is the cpu list that will be requested for thread i
+    std::string affinity( affinity2 ); // NOTE(review): constructing std::string from a null pointer is undefined behavior; affinity2 must be non-null here
     if ( cpus.empty() ) {
         // We do not have a list of cpus to use, do nothing (OS not supported)
-    } else if ( affinity=="none" ) {
+    } else if ( affinity == "none" ) {
         // We are using the default thread affinities (all threads get all procs of the program)
-        for (int i=0; i<d_N_threads; i++)
+        for ( int i    = 0; i < d_N_threads; i++ )
             t_procs[i] = cpus;
-    } else if ( affinity=="independent" ) {
+    } else if ( affinity == "independent" ) {
         // We want to use an independent set of processors for each thread
         if ( (int) cpus.size() == d_N_threads ) {
             // The number of cpus matches the number of threads
-            for (int i=0; i<d_N_threads; i++)
-                t_procs[i] = std::vector<int>(1,cpus[i]);
+            for ( int i    = 0; i < d_N_threads; i++ )
+                t_procs[i] = std::vector<int>( 1, cpus[i] ); // exactly one cpu per thread
         } else if ( (int) cpus.size() > d_N_threads ) {
             // There are more cpus than threads, threads will use more the one processor
-            int N_procs_thread = (cpus.size()+d_N_threads-1)/d_N_threads;
-            size_t k = 0;
-            for (int i=0; i<d_N_threads; i++) {
-                for (int j=0; j<N_procs_thread && k<cpus.size(); j++) {
+            int N_procs_thread = static_cast<int>( cpus.size() + d_N_threads - 1 ) / d_N_threads; // ceil( cpus.size() / d_N_threads )
+            size_t k           = 0; // next unassigned entry of cpus
+            for ( int i = 0; i < d_N_threads; i++ ) {
+                for ( int j = 0; j < N_procs_thread && k < cpus.size(); j++ ) { // the k bound means later threads may receive fewer cpus
                     t_procs[i].push_back( cpus[k] );
                     k++;
                 }
             }
         } else {
             // There are fewer cpus than threads, threads will share a processor
-            int N_threads_proc = (cpus.size()+d_N_threads-1)/cpus.size();
-            for (int i=0; i<d_N_threads; i++)
-                t_procs[i].push_back( cpus[i/N_threads_proc] );
+            int N_threads_proc =
+                static_cast<int>( ( cpus.size() + d_N_threads - 1 ) / cpus.size() ); // ceil( d_N_threads / cpus.size() )
+            for ( int i = 0; i < d_N_threads; i++ )
+                t_procs[i].push_back( cpus[i / N_threads_proc] ); // consecutive threads share the same cpu
         }
     } else {
         global_OS_behavior = tmp;
-        throw std::logic_error("Unknown affinity model");
+        throw std::logic_error( "Unknown affinity model" );
     }
     try {
-        for (int i=0; i<d_N_threads; i++) {
+        for ( int i = 0; i < d_N_threads; i++ ) { // apply each list, then read it back to verify it took effect
             ThreadPool::setThreadAffinity( i, t_procs[i] );
             std::vector<int> cpus2 = getThreadAffinity( i );
             if ( cpus2 != t_procs[i] )
                 pout << "Warning: error setting affinities (failed to set)\n";
         }
-    } catch (...) {
+    } catch ( ... ) {
         pout << "Warning: error setting affinities (exception)\n";
     }
     global_OS_behavior = tmp;
 }
 
 
-/******************************************************************
-* Get an item in the work queue that is ready to be processed     *
-******************************************************************/
-int ThreadPool::getThreadNumber() const
-{
-    size_t id = getThreadId();
-    int index = 0;
-    for (int i=0; i<d_N_threads; i++) {
-        if ( d_ThreadId[i]==id )
-            index = i+1;
-    }
-    return index;
-}
-
-
-/******************************************************************
-* Get an item in the work queue that is ready to be processed     *
-******************************************************************/
-short int ThreadPool::get_work_item( )
-{
-    const thread_id_t *ids = const_cast<const thread_id_t*>(d_queue_ids);
-    const queue_list_struct *list = const_cast<const queue_list_struct*>(d_queue_list);
-    short int index = d_queue_head;
-    short int index2 = check_dependecies(list,ids,index);
-    while ( index2==-1 && index!=-1 ) {
-        index = d_queue_list[index].next;
-        index2 = index==-1 ? -1:check_dependecies(list,ids,index);
-    }
-    return index2;
-}
-inline short int ThreadPool::check_dependecies( const ThreadPool::queue_list_struct *list, 
-    const thread_id_t *queue, short int index )
-{
-    if ( index==-1 )
-        return -1;
-    WorkItem* work = reinterpret_cast<WorkItem*>(queue[index].d_work);
-    // Loop through the dependencies, removing any that have finished,
-    // and search for any that have not started (keeping the one with the fewest dependencies)
-    size_t N_active = 0;
-    thread_id_t* ids = work->d_ids;
-    short int index2 = index;
-    int N_dependencies = static_cast<int>(work->d_N_ids);
-    for (int i=N_dependencies-1; i>=0; i--) {
-        WorkItem* work2 = reinterpret_cast<WorkItem*>(ids[i].d_work);
-        char state = work2->d_state;
-        if ( state==0 ) {
-            // We found a new potential item to process
-            index2 = work2->d_tpool_index;
-            index2 = check_dependecies(list,queue,index2);
-            if ( index2 != -1 )
-                break;
-        } else if ( state==1 || state==-1 ) {
-            // We found an item that is processing
-            N_active++;
-        } else if ( state==2 ) {
-            // The item has finished
-            ids[i].reset();
-            std::swap(ids[i],ids[work->d_N_ids-1]);
-            work->d_N_ids--;
-            continue;
-        }
-    }
-    if ( N_active>0 ) {
-        // Some dependencies are working, choose a different work item
-        index2 = -1;
-    }
-    return index2;
-}
-
-
 /******************************************************************
 * This is the function that controls the individual thread and    *
 * allows it to do work.                                           *
+* Note: this function is lock free                                *
 ******************************************************************/
-void ThreadPool::tpool_thread(int thread_id) 
+void ThreadPool::tpool_thread( int thread_id )
 {
-    if ( getThreadId()==0 )
-        throw std::logic_error("Invalid thread id");
-    bool shutdown = false;
-    bool printInfo = false;
-    d_ThreadId[thread_id] = getThreadId();
-    // Acquire mutex 
-    lock_mutex(d_lock_queue);
-    if ( get_bit(d_active,thread_id) )
-        throw std::logic_error("Thread cannot already be active");
-    d_num_active++;
-    set_bit(d_active,thread_id,true);
-    set_bit(d_cancel,thread_id,false);
+    bool shutdown         = false;
+    bool printInfo        = false;
+    d_threadId[thread_id] = std::this_thread::get_id();
+    if ( get_bit( d_active, thread_id ) )
+        throw std::logic_error( "Thread cannot already be active" );
+    AtomicOperations::atomic_increment( &d_num_active );
+    set_bit( d_active, thread_id );
+    unset_bit( d_cancel, thread_id );
     if ( printInfo ) {
         // Print the pid
-        printp("pid = %i\n",(int)getpid());
+        printp( "pid = %i\n", (int) getpid() );
         // Print the processor affinities for the process
         try {
             std::vector<int> cpus = ThreadPool::getProcessAffinity();
-            printp("%i cpus for current thread: ",(int)cpus.size());
-            for (size_t i=0; i<cpus.size(); i++)
-                printp("%i ",cpus[i]);
-            printp("\n");
-        } catch(...) {
-            printp("Unable to get process affinity\n");
+            printp( "%i cpus for current thread: ", (int) cpus.size() );
+            for ( size_t i = 0; i < cpus.size(); i++ )
+                printp( "%i ", cpus[i] );
+            printp( "\n" );
+        } catch ( ... ) {
+            printp( "Unable to get process affinity\n" );
         }
     }
     // Check for shutdown
+    PROFILE_THREADPOOL_START( "thread active" );
     shutdown = false;
-    //pout << "Thread initialized\n";
-    PROFILE_THREAD_START("thread active");
     while ( !shutdown ) {
         // Check if there is work to do
-        if ( d_queue_size>0 ) {
+        if ( d_queue_list.size() > 0 ) {
             // Get next work item to process
-            short int work_index = ThreadPool::get_work_item();
-            if ( work_index==-1 ) {
-                unlock_mutex(d_lock_queue);
-                Sleep(0);
-                lock_mutex(d_lock_queue);
+            auto work_id = d_queue_list.remove( []( const thread_id_t& id ) { return id.ready(); } );
+            if ( work_id.isNull() ) {
+                std::this_thread::yield();
                 continue;
             }
-            // Remove the work item from the queue
-            #ifdef D_DEBUG
-                short int cur = d_queue_list[work_index].position;
-            #endif
-            short int next = d_queue_list[work_index].next;
-            short int prev = d_queue_list[work_index].prev;
-            if ( prev==-1 ) {
-                d_queue_head = next;
-            } else {
-                d_queue_list[prev].next = next;
-            }
-            if ( next!=-1 ) {
-                d_queue_list[next].prev = prev;
-            }
-            --d_queue_size;
-            #ifdef D_DEBUG
-                if ( cur!=work_index || ( d_queue_size>0 && d_queue_head==-1 ) )
-                    throw std::logic_error("Internal error with threadpool");
-            #endif
-            thread_id_t work_id = const_cast<thread_id_t&>(d_queue_ids[work_index]);
-            d_queue_ids[work_index].reset();
-            d_queue_list[work_index].reset();
-            d_queue_list[work_index].next = d_queue_free;
-            d_queue_free = work_index;
-            WorkItem* work = reinterpret_cast<WorkItem*>(work_id.d_work);
-            work->d_state = -1;
-            // Release mutex
-            unlock_mutex(d_lock_queue);
-            // Start work here 
-            PROFILE_THREAD_START("thread working");
+            WorkItem *work = work_id.work( );
+            AtomicOperations::atomic_increment( &d_N_started );
+            // Start work here
+            PROFILE_THREADPOOL_START( "thread working" );
+            work->d_state  = 2;
             work->run();
-            if ( work->d_state!=2 ) { throw std::logic_error("Work item is not changing state"); }
-            PROFILE_THREAD_STOP("thread working");
-            // Work finished, acquire mutex and remove it from the active list
-            lock_mutex(d_lock_queue);
+            work->d_state  = 3;
+            PROFILE_THREADPOOL_STOP( "thread working" );
+            AtomicOperations::atomic_increment( &d_N_finished );
             // Check if any threads are waiting on the current work item
-            for (int i=0; i<d_N_wait; i++) {
-                wait_event_struct* wait = const_cast<wait_event_struct*>(d_wait[i]);
-                bool found = false;
-                if ( wait->ids.empty() ) {
-                    // Special case where we just want to wait for any work items to finish
-                    found = true;
-                } else {
-                    found = find_id( wait->ids, work_id );
-                }
-                if ( found ) {
-                    wait_type event = 0;
-                    volatile int* count = &(wait->count);
-                    if ( *count == 1 )
-                        event = const_cast<wait_type>(wait->wait_event);
-                    --(*count);
-                    if ( event != 0 )
-                        SIGNAL_EVENT(event);
-                }
+            // This can be done without blocking
+            for ( int i = 0; i < MAX_WAIT; i++ ) {
+                const wait_ids_struct *wait = const_cast<const wait_ids_struct *>(d_wait[i]);
+                if ( wait != nullptr )
+                    wait->id_finished( work_id );
             }
             // Check the signal count and signal if desired
+            // This can be done without blocking
             if ( d_signal_count > 0 ) {
-                --d_signal_count;
-                if ( d_signal_count == 0 )
-                    SIGNAL_EVENT(d_wait_finished);
+                int count = AtomicOperations::atomic_decrement( &d_signal_count );
+                if ( count == 0 )
+                    d_wait_finished.notify_all();
             }
         } else {
-            int N_active = --d_num_active;
-            set_bit(d_active,thread_id,false);
-            // Alert main thread that a thread finished processing 
-            if ( N_active==0 ) { 
-                if ( d_signal_empty ) {
-                    SIGNAL_EVENT(d_wait_finished);
-                    d_signal_empty = false;
-                }
+            int N_active = AtomicOperations::atomic_decrement( &d_num_active );
+            unset_bit( d_active, thread_id );
+            // Alert main thread that a thread finished processing
+            if ( ( N_active == 0 ) && d_signal_empty ) {
+                d_wait_finished.notify_all();
+                d_signal_empty = false;
             }
             // Wait for work
-            PROFILE_THREAD_STOP2("thread active");
-            #ifdef USE_WINDOWS
-                unlock_mutex(d_lock_queue);
-                SuspendThread(d_hThread[thread_id]);
-                lock_mutex(d_lock_queue);
-            #elif defined(USE_LINUX) || defined(USE_MAC)
-                pthread_cond_wait(d_queue_not_empty,d_lock_queue);
-            #endif
-            PROFILE_THREAD_START2("thread active");
-            ++d_num_active;
-            set_bit(d_active,thread_id,true);
+            PROFILE_THREADPOOL_STOP2( "thread active" );
+            d_wait_work.wait_for(1e-3);
+            PROFILE_THREADPOOL_START2( "thread active" );
+            AtomicOperations::atomic_increment( &d_num_active );
+            set_bit( d_active, thread_id );
         }
         // Check if there is a shutdown requested
-        shutdown = get_bit(d_cancel,thread_id);
+        shutdown = get_bit( d_cancel, thread_id );
     }
-    PROFILE_THREAD_STOP("thread active");
-    d_num_active--;
-    set_bit(d_active,thread_id,false);
-    // Release mutex
-    unlock_mutex(d_lock_queue);
+    PROFILE_THREADPOOL_STOP( "thread active" );
+    AtomicOperations::atomic_decrement( &d_num_active );
+    unset_bit( d_active, thread_id );
     return;
 }
 
 
-
 /******************************************************************
 * This is the function that adds work to the thread pool          *
 * Note: this version uses a last in - first out work scheduling.  *
 ******************************************************************/
-void ThreadPool::add_work( size_t N, ThreadPool::WorkItem* work[], 
-    const int* priority, ThreadPool::thread_id_t* ids ) 
+inline void ThreadPool::add_work( const ThreadPool::thread_id_t& id )  // Queue one already-initialized work id
+{
+    auto work = id.work();
+    work->d_state = 1;  // State 1 is set when the item is queued (2 and 3 bracket run() elsewhere in this file)
+    // Check and change priorities of dependency ids
+    const int priority = id.getPriority();
+    for (int i=0; i<work->d_N_ids; i++) {
+        const auto& id1 = work->d_ids[i];
+        if ( !id1.started() && id1<id ) {  // Dependency has not started and compares less than this id
+            // Remove and add the id back with a higher priority
+            auto id2 = d_queue_list.remove( []( const thread_id_t& a, const thread_id_t& b ) { return a==b; }, id1 );  // NOTE(review): assumes id1 is still queued; confirm what remove() returns otherwise
+            id2.setPriority( std::max(priority,id2.getPriority()) );  // Priority is only ever raised, never lowered
+            d_queue_list.insert( id2 );
+        }
+    }
+    d_queue_list.insert( id );
+    AtomicOperations::atomic_increment( &d_N_added );  // Counter printed by check_wait_time()
+}
+void ThreadPool::add_work(
+    size_t N, ThreadPool::WorkItem *work[], const int *priority, ThreadPool::thread_id_t *ids )
 {
-    #if MONITOR_THREADPOOL_PERFORMANCE
-        TIME_TYPE start_time_local;
-        get_time(&start_time_local);
-    #endif
     // If we have a very long list, break it up into smaller pieces to keep the threads busy
-    const size_t block_size = MAX_QUEUED/4;
+    const size_t block_size = MAX_QUEUED / 8;  // Largest number of items queued by a single pass
     if ( N > block_size ) {
-        size_t N_sets = (N+block_size-1)/block_size;
-        for (size_t i=0; i<N_sets; i++) {
-            size_t index = i*block_size;
-            size_t N2 = std::min<size_t>(block_size,N-index);
-            add_work( N2, &work[index], &priority[index], &ids[index] );
+        size_t i = 0;
+        while ( i < N ) {
+            add_work( std::min(N-i,block_size), &work[i], &priority[i], &ids[i] );  // Recurse on one block of at most block_size items
+            i += block_size;
         }
         return;
     }
+    PROFILE_THREADPOOL_START( "add_work" );
+#if MONITOR_THREADPOOL_PERFORMANCE
+    auto t1 = std::chrono::high_resolution_clock::now();
+#endif
     // Create the thread ids (can be done without blocking)
-    for (size_t i=0; i<N; i++) {
-        ids[i].reset(priority[i],advance_id(),work[i]);
-        work[i]->d_tpool_index = -2;
-    }
+    for ( size_t i = 0; i < N; i++ )
+        ids[i].reset( priority[i], AtomicOperations::atomic_decrement(&d_id_assign), work[i] );  // Ids are handed out by atomically decrementing d_id_assign
+#if MONITOR_THREADPOOL_PERFORMANCE
+    auto t2 = std::chrono::high_resolution_clock::now();
+    accumulate( total_add_work_time[0], t1, t2 );
+#endif
     // If there are no threads, perform the work immediately
     if ( d_N_threads < 1 ) {
-        for (size_t i=0; i<N; i++) {
+        for ( size_t i = 0; i < N; i++ ) {
+            work[i]->d_state  = 2;  // State 2 is set immediately before run()
             work[i]->run();
+            work[i]->d_state  = 3;  // State 3 is set once run() has returned
         }
+        #if MONITOR_THREADPOOL_PERFORMANCE
+            auto t5 = std::chrono::high_resolution_clock::now();
+            accumulate( total_add_work_time[4], t2, t5 );
+        #endif
+        PROFILE_THREADPOOL_STOP2( "add_work" );  // NOTE(review): early return uses STOP2 while the normal exit uses STOP; confirm this is intended
         return;
     }
     // Wait for enough room in the queue (doesn't need blocking since it isn't that precise)
-    if ( N > static_cast<size_t>(MAX_QUEUED-d_queue_size) ) {
-        int N_wait = static_cast<int>( N - (MAX_QUEUED-d_queue_size) );
+    if ( N > static_cast<size_t>( MAX_QUEUED - d_queue_list.size() ) ) {
+        int N_wait = static_cast<int>( N - ( MAX_QUEUED - d_queue_list.size() ) );  // Number of queued items that must drain first
         while ( N_wait > 0 ) {
-            d_signal_count = static_cast<unsigned char>(std::min(N_wait,255));
-            #ifdef USE_WINDOWS
-                DWORD ret = WaitForSingleObject( d_wait_finished, INFINITE );
-            #elif defined(USE_LINUX) || defined(USE_MAC)
-                lock_mutex(d_lock_queue);
-                if ( d_signal_count > 0 )
-                    pthread_cond_wait(d_wait_finished,d_lock_queue);
-                unlock_mutex(d_lock_queue);
-            #else
-                #error Not programmed
-            #endif
-            N_wait = static_cast<int>( N - (MAX_QUEUED-d_queue_size) );
+            d_signal_count = static_cast<unsigned char>( std::min( N_wait, 255 ) );  // Clamped to 255 before the cast to unsigned char
+            d_wait_finished.wait_for(1e-4);  // Timed wait (presumably seconds; confirm units), then re-poll the queue size
+            N_wait = static_cast<int>( N - ( MAX_QUEUED - d_queue_list.size() ) );
         }
     }
-    // Get the lock and add the work items
-    lock_mutex(d_lock_queue);
-    #if MONITOR_THREADPOOL_PERFORMANCE
-        TIME_TYPE stop_time_local;
-        get_time(&stop_time_local);
-        total_add_work_time[0] += get_diff(start_time_local,stop_time_local,frequency);
-    #endif
-    // Next create the work items and add them to the queue
-    for (size_t i=0; i<N; i++) {
-        queue_list_struct *work_item = const_cast<queue_list_struct*>(&d_queue_list[d_queue_free]);
-        d_queue_free = work_item->next;
-        work_item->next = -1;
-        work_item->prev = -1;
-        d_queue_ids[work_item->position] = ids[i];
-        reinterpret_cast<WorkItem*>(ids[i].d_work)->d_tpool_index = work_item->position;
-        if ( d_queue_head==-1 ) {
-            d_queue_head = work_item->position;
-        } else if ( ids[i] > d_queue_ids[d_queue_list[d_queue_head].position] ) {
-            work_item->next = d_queue_head;
-            d_queue_list[d_queue_head].prev = work_item->position;
-            d_queue_head = work_item->position;
-        } else {
-            short int prev = d_queue_head;
-            short int cur = d_queue_list[prev].next;
-            while ( cur!=-1 ) {
-                if ( d_queue_ids[cur] < ids[i] )
-                    break;
-                prev = cur;
-                cur = d_queue_list[prev].next;
-            }
-            work_item->prev = prev;
-            work_item->next = cur;
-            if ( cur != -1 )
-                d_queue_list[cur].prev = work_item->position;
-            d_queue_list[prev].next = work_item->position;
-        }
-        ++d_queue_size;
-    }
-    int num_active2 = d_num_active;       // Copy the number of active threads to a local variable
-    unlock_mutex(d_lock_queue);
-    #if MONITOR_THREADPOOL_PERFORMANCE
-        get_time(&stop_time_local);
-        total_add_work_time[1] += get_diff(start_time_local,stop_time_local,frequency);
-    #endif
+#if MONITOR_THREADPOOL_PERFORMANCE
+    auto t3 = std::chrono::high_resolution_clock::now();
+    accumulate( total_add_work_time[1], t2, t3 );
+#endif
+    // Add the work items to the queue
+    for ( size_t i = 0; i < N; i++ )
+        add_work( ids[i] );  // The single-id overload adjusts dependency priorities and inserts into d_queue_list
+#if MONITOR_THREADPOOL_PERFORMANCE
+    auto t4 = std::chrono::high_resolution_clock::now();
+    accumulate( total_add_work_time[2], t3, t4 );
+#endif
     // Activate sleeping threads
-    #ifdef USE_WINDOWS
-        for (int i=0; i<d_N_threads; i++) {
-            if ( num_active2 == d_N_threads ) {
-                // All threads are active, no need to activate
-                break;
-            } else if ( d_queue_size == 0 ) {
-                // Queue is empty, no need to activate
-                break;
-            } else if ( !get_bit(d_active,i) ) {
-                // Thread is inactive, wake it
-                ResumeThread(d_hThread[i]);
-            }
-        }
-    #elif defined(USE_LINUX) || defined(USE_MAC)
-        if ( num_active2 == d_N_threads ) {
-            // All threads are active, no need to wake anybody
-        } else if ( d_queue_size == 0 ) {
-            // Queue is empty, no need to activate
-        } else if ( N == 1 ) {
-            // Added 1 item to the queue, wake 1 worker
-            int error = pthread_cond_signal(d_queue_not_empty);
-            if ( error != 0 )
-                perr << "Error in signaling thread";
-        } else {
-            // Added multple items in the queue, wake all workers
-            int error = pthread_cond_broadcast(d_queue_not_empty);
-            if ( error != 0 )
-                perr << "Error in signaling thread";
-        }
-    #endif
-    #if MONITOR_THREADPOOL_PERFORMANCE
-        get_time(&stop_time_local);
-        total_add_work_time[2] += get_diff(start_time_local,stop_time_local,frequency);
-    #endif
-}
-
-
-
-
-/******************************************************************
-* This function checks if the work item has finished              *
-******************************************************************/
-bool ThreadPool::isFinished(ThreadPool::thread_id_t id) const
-{
-    if ( !isValid(id) ) {
-        // The thread id is not valid
-        return false;
+    if ( d_num_active == d_N_threads ) {
+        // All threads are active, no need to wake anybody
+    } else if ( d_queue_list.size() == 0 ) {
+        // Queue is empty, no need to activate
+    } else if ( N == 1 ) {
+        // Added 1 item to the queue, wake 1 worker
+        d_wait_work.notify_one();
+    } else {
+        // Added multiple items in the queue, wake all workers
+        d_wait_work.notify_all();
     }
-    return reinterpret_cast<WorkItem*>(id.d_work)->d_state==2;
+#if MONITOR_THREADPOOL_PERFORMANCE
+    auto t5 = std::chrono::high_resolution_clock::now();
+    accumulate( total_add_work_time[3], t4, t5 );
+#endif
+    PROFILE_THREADPOOL_STOP( "add_work" );
 }
 
 
-
 /******************************************************************
 * This function removes a finished work item                      *
 ******************************************************************/
-ThreadPool::WorkItem* ThreadPool::getFinishedWorkItem(ThreadPool::thread_id_t id) const
+ThreadPool::WorkItem *ThreadPool::getFinishedWorkItem( ThreadPool::thread_id_t id ) const
 {
-    if ( !isValid(id) ) 
-        return NULL;
-    if ( reinterpret_cast<WorkItem*>(id.d_work)->d_state!=2 )
-        return NULL;
-    // Return the result
-    WorkItem* work = reinterpret_cast<WorkItem*>(id.d_work);
-    return work;
+    if ( id.finished() )  // The work item is only handed back once the id reports finished
+        return id.work();
+    return nullptr;  // Not finished: no work item is returned
 }
 
 
-
 /******************************************************************
 * This function waits for a some of the work items to finish      *
 ******************************************************************/
-static inline void check_finished( size_t N_work, const ThreadPool::thread_id_t *ids, size_t& N_finished, bool* finished)
+static inline void check_finished(
+    size_t N_work, const ThreadPool::thread_id_t *ids, size_t &N_finished, bool *finished )  // N_finished and finished[] are updated in place
 {
-    for (size_t k=0; k<N_work; k++) {
+    for ( size_t k = 0; k < N_work; k++ ) {  // Count each id once: skip entries already flagged in finished[]
         if ( !finished[k] && ids[k].finished() ) {
             N_finished++;
             finished[k] = true;
         }
     }
 }
-int ThreadPool::wait_some(size_t N_work, const ThreadPool::thread_id_t *ids, size_t N_wait, bool* finished) const
+int ThreadPool::wait_some(
+    size_t N_work, const ThreadPool::thread_id_t *ids, size_t N_wait, bool *finished ) const
 {
     // Check the inputs
-    if ( N_wait<=0 || N_wait>N_work ) {
-        printp("Invalid arguments in thread pool wait (%i,%i)\n",(int)N_work,(int)N_wait);
-        return -1;
-    }
+    if ( N_wait > N_work )  // Cannot wait for more ids than were supplied
+        throw std::logic_error( "Invalid arguments in thread pool wait" );
     size_t N_finished = 0;
-    memset(finished,0,N_work*sizeof(bool));
+    memset( finished, 0, N_work * sizeof( bool ) );  // Start with no id flagged as finished
     // Check that all the ids are valid
-    size_t next_id = d_id_assign-1;
-    for (size_t k=0; k<N_work; k++) {
+    size_t next_id = d_id_assign - 1;  // Ids are assigned by decrementing d_id_assign, so assigned ids lie above this value
+    for ( size_t k = 0; k < N_work; k++ ) {
         if ( !ids[k].initialized() ) {
             finished[k] = true;
             N_finished++;
         }
         size_t local_id = ids[k].getLocalID();
-        bool test = local_id==0 || local_id>MAXID64 || local_id<=next_id;
-        test = test && !finished[k];
-        if ( test ) {
-            printp("Invalid ids for wait\n");
-            return -1;
-        }
+        bool test       = local_id == 0 || local_id > thread_id_t::maxThreadID || local_id <= next_id;
+        test            = test && !finished[k];  // Uninitialized ids were flagged finished above and are exempt
+        if ( test )
+            throw std::logic_error( "Invalid ids for wait" );
     }
     // Check which ids have finished
-    check_finished(N_work,ids,N_finished,finished);
+    check_finished( N_work, ids, N_finished, finished );
     // If enough ids have finished return
-    if ( N_finished >= N_wait ) {
-        return 0;
-    }
-    // Acquire the lock and update the finished list
-    // It is possible that in the time required to acquire the lock, the work items may finish
-    lock_mutex(d_lock_queue);
-    check_finished(N_work,ids,N_finished,finished);
-    if ( N_finished >= N_wait ) {
-        unlock_mutex(d_lock_queue);
-        return 0;
-    }
+    if ( N_finished >= N_wait )
+        return N_finished;  // Returns the finished count (size_t converted to int)
     // Create the wait event struct
-    wait_event_struct* tmp = new wait_event_struct(&wait_pool);
-    wait_type event = tmp->wait_event;
-    tmp->count = static_cast<int>(N_wait-N_finished);
-    tmp->ids.reserve(N_wait-N_finished);
-    for (size_t k=0; k<N_work; k++) {
-        if ( !finished[k] )
-            tmp->ids.push_back(ids[k]);
+    auto tmp = new wait_ids_struct( N_work, ids, N_wait, d_cond_pool, MAX_WAIT, d_wait );  // The constructor registers itself in a free slot of d_wait
+    // Wait for the ids
+    auto t1 = std::chrono::high_resolution_clock::now();
+    while ( !tmp->wait_for(0.01) ) {  // Wait in 0.01 s slices so a long wait can be reported
+        check_wait_time( t1 );
     }
-    quicksort(tmp->ids);
-    d_wait[d_N_wait] = tmp;
-    d_N_wait++;
-    // Wait for a signal indicating that a thread has finished
-    #ifdef USE_WINDOWS
-        unlock_mutex(d_lock_queue);
-        DWORD ret = WaitForSingleObject( event, INFINITE );
-        lock_mutex(d_lock_queue);
-    #elif defined(USE_LINUX) || defined(USE_MAC)
-        pthread_cond_wait(event,d_lock_queue);
-    #endif
-    // Check for remaining references to the wait struct and delete the structure
-    for (int k=0; k<d_N_wait; k++) {
-        if ( d_wait[k] == tmp ) {
-            for (int m=k+1; m<d_N_wait; m++)
-                d_wait[m-1] = d_wait[m];
-            d_wait[d_N_wait-1] = NULL;
-            break;
-        }
-    }
-    d_N_wait--;
-    delete tmp;
-    unlock_mutex(d_lock_queue);
     // Update the ids that have finished
-    check_finished(N_work,ids,N_finished,finished);
-    if ( N_finished<N_wait && N_work!=0 ) {
-        throw std::logic_error("Internal error: failed to wait");
-    }
-    return 0;
+    check_finished( N_work, ids, N_finished, finished );
+    if ( N_finished < N_wait && N_work != 0 )
+        throw std::logic_error( "Internal error: failed to wait" );  // NOTE(review): tmp is not deleted on this path
+    // Delete the wait event struct
+    // Note: we want to maintain the reference in case a thread is still using it
+    // Note: technically this should be atomic
+    std::swap(d_wait_last,tmp);  // Park the current struct in d_wait_last; tmp now holds the previously parked one
+    delete tmp;  // Frees the struct parked by an earlier call, not the one just used
+    return N_finished;
 }
 
 
-
 /******************************************************************
 * This function waits for all of the threads to finish their work *
 ******************************************************************/
-void ThreadPool::wait_pool_finished() const 
+void ThreadPool::check_wait_time( std::chrono::time_point<std::chrono::high_resolution_clock>& t1 ) const
+{
+    auto t2 = std::chrono::high_resolution_clock::now();  // Current time; t1 is the caller's start of the wait interval
+    if ( std::chrono::duration_cast<std::chrono::seconds>(t2-t1).count() > MAX_WAIT_TIME_DEBUG ) {  // Whole seconds elapsed since t1
+        std::cout << "Warning: Maximum wait time in ThreadPool exceeded, threads may be hung\n";
+        std::cout << "N_active: " << d_num_active << std::endl;
+        std::cout << "N_queued: " << d_queue_list.size() << std::endl;
+        std::cout << "N_added: " << d_N_added << std::endl;
+        std::cout << "N_started: " << d_N_started << std::endl;
+        std::cout << "N_finished: " << d_N_finished << std::endl;
+        std::cout << "queue.insert(): " << d_queue_list.N_insert() << std::endl;
+        std::cout << "queue.remove(): " << d_queue_list.N_remove() << std::endl;
+        std::cout << "Stack Trace:\n";
+        auto call_stack = StackTrace::getAllCallStacks( );  // Collect call stacks to print below
+        auto text = call_stack.print( "  " );
+        for ( auto& line : text )
+            std::cout << line << std::endl;
+        t1 = std::chrono::high_resolution_clock::now();  // Reset the caller's start time so the report repeats once per interval
+    }
+}
+void ThreadPool::wait_pool_finished() const
 {
     // First check that we are not one of the threads
     if ( isMemberThread() ) {
-        throw std::logic_error("Member thread attempted to call wait_pool_finished");
+        throw std::logic_error( "Member thread attempted to call wait_pool_finished" );
     }
-    lock_mutex(d_lock_queue);
     // Wait for all threads to finish their work
-    while ( d_num_active>0 || d_queue_size>0 ) {
+    auto t1 = std::chrono::high_resolution_clock::now();  // Start of the wait, used for the hang diagnostic
+    while ( d_num_active > 0 || d_queue_list.size() > 0 ) {  // Loop until no thread is active and the queue is empty
+        check_wait_time( t1 );  // Prints diagnostics if the wait exceeds MAX_WAIT_TIME_DEBUG seconds
         d_signal_empty = true;
-        #ifdef USE_WINDOWS
-            unlock_mutex(d_lock_queue);
-            DWORD ret = WaitForSingleObject( d_wait_finished, INFINITE );
-            lock_mutex(d_lock_queue);
-        #elif defined(USE_LINUX) || defined(USE_MAC)
-            pthread_cond_wait(d_wait_finished,d_lock_queue);
-        #else
-            #error Not programmed
-        #endif
+        d_wait_finished.wait_for(10e-6);  // Timed wait of 10e-6 (presumably seconds; confirm units), then re-test the loop condition
     }
     d_signal_empty = false;
-    unlock_mutex(d_lock_queue);
-}
-
-
-
-/******************************************************************
-* These functions create the unique id to assign each work item   *
-* If id is a 32-bit number we have 4e9 possible work items        *
-* If id is a 64-bit number we have 9e19 possible work items and   *
-*    we have some checking that will catch some invalid ids       *
-******************************************************************/
-inline void ThreadPool::initialize_id() 
-{
-    // Note that the best option is to use a 64-bit integer
-    if ( sizeof(size_t)==8 ) {
-        // Set the starting value to 2^56-3
-        d_id_assign = MAXID64;
-    } else if ( sizeof(size_t)==4 ) {
-        // Set the starting value to 2^32-3
-        d_id_assign = MAXID32;
-    } else {
-        throw std::logic_error("Internal error: failed to initialize ids");
-    }
-}
-inline size_t ThreadPool::advance_id() 
-{
-    size_t id = AtomicOperations::atomic_decrement( &d_id_assign );
-    if ( id==0 )
-        throw std::logic_error("Ran out of valid ids");
-    return id;
 }
 
 
 /******************************************************************
-* Function to check if the current thread is a member thread      *
+* Member functions of wait_ids_struct                             *
 ******************************************************************/
-inline bool ThreadPool::isMemberThread() const
+ThreadPool::wait_ids_struct::wait_ids_struct( size_t N, const ThreadPool::thread_id_t *ids, size_t N_wait,
+    AtomicOperations::pool<condition_variable,128>& cv_pool, int N_wait_list, volatile wait_ids_struct **list ):
+    d_wait( N_wait ),  // Number of ids that must finish before the wait is satisfied
+    d_N(0),  // Number of unfinished ids stored in d_ids (filled in below)
+    d_cv_pool( cv_pool ),
+    d_wait_event( cv_pool.get() )  // Obtain a condition variable from the pool
 {
-    size_t id = getThreadId();
-    for (int i=0; i<d_N_threads; i++) {
-        if ( id==d_ThreadId[i] )
-            return true;
+    d_ids = new ThreadPool::thread_id_t[N];  // Sized for all N ids; only the first d_N entries are used
+    for ( size_t i = 0; i < N; i++ ) {
+        if ( ids[i].finished() )
+            d_wait = std::max(d_wait-1,0);  // Already finished: one fewer to wait for, never below 0
+        else
+            d_ids[d_N++] = ids[i];  // Keep only ids that have not finished yet
     }
-    return false;
+    quicksort( d_N, d_ids );  // Sorted so find_id() can search the array
+    d_finished = new bool[d_N];
+    memset((void*)d_finished,0,d_N);  // NOTE(review): byte count d_N assumes sizeof(bool)==1
+    int i = 0;
+    while ( !AtomicOperations::atomic_compare_and_swap( (void *volatile *) &list[i], nullptr, this ) ) { i = (i+1)%N_wait_list; }  // Cycle through the slots until an empty one is claimed
+    d_ptr = &list[i];  // Remember the claimed slot so it can be cleared when the wait completes
 }
-
-
-/******************************************************************
-* Member functions of wait_event_struct                           *
-******************************************************************/
-ThreadPool::wait_event_struct::wait_event_struct( wait_pool_struct* wait_pool )
+void ThreadPool::wait_ids_struct::id_finished( const ThreadPool::thread_id_t& id ) const
 {
-    count = 0;
-    ThreadId = getThreadId();
-    d_wait_pool = wait_pool;
-    wait_event = d_wait_pool->pop();
-}
-ThreadPool::wait_event_struct::~wait_event_struct( )
-{
-    d_wait_pool->push(wait_event);
-}
-
-
-/******************************************************************
-* Member functions of wait_pool_struct                            *
-******************************************************************/
-ThreadPool::wait_pool_struct::wait_pool_struct( )
-{
-    d_size = 16;
-    d_count = 0;
-    d_pool = new wait_type[d_size];
-    memset(const_cast<wait_type*>(d_pool),0,d_size*sizeof(wait_type));
-    d_lock = create_mutex( );
-}
-ThreadPool::wait_pool_struct::~wait_pool_struct( )
-{
-    for (size_t i=0; i<d_count; i++) {
-        #ifdef USE_WINDOWS
-            CloseHandle(d_pool[i]);
-        #elif defined(USE_LINUX) || defined(USE_MAC)
-            pthread_cond_destroy(d_pool[i]);
-            delete d_pool[i];
-        #else
-            #error Not programmed
-        #endif
+    int index = find_id( d_N, d_ids, id );  // Index of id in the sorted d_ids, or -1 if this struct is not tracking it
+    if ( index >= 0 ) {
+        d_finished[index] = true;
+        int N_finished = 0;
+        for (int i=0; i<d_N; i++)  // Recount the finished flags
+            N_finished += d_finished[i] ? 1:0;
+        if ( N_finished >= d_wait ) {
+            *d_ptr = nullptr;  // Clear this struct's slot in the wait list
+            d_wait = 0;
+            d_N = 0;
+            d_wait_event->notify_all();  // Wake any thread waiting on the condition variable
+        }
     }
-    delete [] d_pool;
-    destroy_mutex( d_lock );
-    d_size = 0;
-    d_count = 0;
-    d_pool = 0;
-    d_lock = 0;
 }
-void ThreadPool::wait_pool_struct::push( ThreadPool::wait_type event )
+bool ThreadPool::wait_ids_struct::wait_for( double seconds )  // Returns true once enough ids have finished, false if the timeout elapses first
 {
-    lock_mutex(d_lock);
-    if ( d_count >= d_size ) {
-        volatile wait_type* tmp = d_pool;
-        d_pool = new wait_type[2*d_size];
-        memset((void*)d_pool,0,2*d_size*sizeof(wait_type));
-        memcpy((void*)d_pool,(void*)tmp,d_size*sizeof(wait_type));
-        delete [] d_pool;
-        d_size = 2*d_size;
+    for (int i=0; i<d_N; i++) {  // Refresh the flags directly from the ids before waiting
+        if ( d_ids[i].finished() )
+            d_finished[i] = true;
     }
-    d_pool[d_count] = event;
-    ++d_count;
-    unlock_mutex(d_lock);
-}
-ThreadPool::wait_type ThreadPool::wait_pool_struct::pop( )
-{
-    lock_mutex(d_lock);
-    wait_type event = 0;
-    if ( d_count == 0 ) {
-        #ifdef USE_WINDOWS
-            event = CreateEvent(NULL,FALSE,FALSE,NULL);
-        #elif defined(USE_LINUX) || defined(USE_MAC)
-            event = new pthread_cond_t;
-            int error = pthread_cond_init(event,NULL);
-            if ( error == -1 )
-                std::logic_error("Error creating wait_event");
-        #else
-            #error Not programmed
-        #endif
-    } else {
-        event = d_pool[d_count-1];
-        --d_count;
+    auto t1 = std::chrono::high_resolution_clock::now();  // Start time for the timeout
+    while ( true ) {
+        int N_finished = 0;
+        for (int i=0; i<d_N; i++)
+            N_finished += d_finished[i] ? 1:0;
+        if ( N_finished>=d_wait || d_N==0 ) {  // Done: enough ids finished, or nothing is being tracked
+            *d_ptr = nullptr;  // Clear this struct's slot in the wait list
+            d_wait = 0;
+            d_N = 0;
+            break;
+        }
+        auto t2 = std::chrono::high_resolution_clock::now();
+        if ( 1e-6*std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count() > seconds )  // Elapsed microseconds converted to seconds
+            return false;  // Timed out; the slot stays registered
+        d_wait_event->wait_for(1e-5);  // Short timed wait before re-counting the flags
     }
-    unlock_mutex(d_lock);
-    return event;
+    return true;
 }
 
 
@@ -1750,88 +1143,86 @@ ThreadPool::wait_type ThreadPool::wait_pool_struct::pop( )
 * templated quicksort routine                                     *
 ******************************************************************/
 template <class T>
-void quicksort(std::vector<T> &x)
+void quicksort( int n, T *arr )
 {
-    int n = (int) x.size();
     if ( n <= 1 )
         return;
-    T *arr = &x[0];
     bool test;
     int i, ir, j, jstack, k, l, istack[100];
     T a, tmp_a;
     jstack = 0;
-    l = 0;
-    ir = n-1;
-    while (1) {
-        if ( ir-l < 7 ) {             // Insertion sort when subarray small enough.
-            for ( j=l+1; j<=ir; j++ ) {
-                a = arr[j];
+    l      = 0;
+    ir     = n - 1;
+    while ( 1 ) {
+        if ( ir - l < 7 ) { // Insertion sort when subarray small enough.
+            for ( j = l + 1; j <= ir; j++ ) {
+                a    = arr[j];
                 test = true;
-                for (i=j-1; i>=0; i--) {
+                for ( i = j - 1; i >= 0; i-- ) {
                     if ( arr[i] < a ) {
-                        arr[i+1] = a;
-                        test = false;
+                        arr[i + 1] = a;
+                        test       = false;
                         break;
                     }
-                    arr[i+1] = arr[i];
+                    arr[i + 1] = arr[i];
                 }
                 if ( test ) {
-                    i = l-1;
-                    arr[i+1] = a;
+                    i          = l - 1;
+                    arr[i + 1] = a;
                 }
             }
-            if ( jstack==0 )
+            if ( jstack == 0 )
                 return;
-            ir = istack[jstack];    // Pop stack and begin a new round of partitioning.
-            l = istack[jstack-1];
+            ir = istack[jstack]; // Pop stack and begin a new round of partitioning.
+            l  = istack[jstack - 1];
             jstack -= 2;
         } else {
-            k = (l+ir)/2;           // Choose median of left, center and right elements as partitioning
-                                    // element a. Also rearrange so that a(l) < a(l+1) < a(ir).
-            tmp_a = arr[k];
-            arr[k] = arr[l+1];
-            arr[l+1] = tmp_a;
-            if ( arr[l]>arr[ir] ) {
-                tmp_a = arr[l];
-                arr[l] = arr[ir];
+            k = ( l + ir ) / 2; // Choose median of left, center and right elements as partitioning
+                                // element a. Also rearrange so that a(l) < a(l+1) < a(ir).
+            tmp_a      = arr[k];
+            arr[k]     = arr[l + 1];
+            arr[l + 1] = tmp_a;
+            if ( arr[l] > arr[ir] ) {
+                tmp_a   = arr[l];
+                arr[l]  = arr[ir];
                 arr[ir] = tmp_a;
             }
-            if ( arr[l+1] > arr[ir] ) {
-                tmp_a = arr[l+1];
-                arr[l+1] = arr[ir];
-                arr[ir] = tmp_a;
+            if ( arr[l + 1] > arr[ir] ) {
+                tmp_a      = arr[l + 1];
+                arr[l + 1] = arr[ir];
+                arr[ir]    = tmp_a;
             }
-            if ( arr[l] > arr[l+1] ) {
-                tmp_a = arr[l];
-                arr[l] = arr[l+1];
-                arr[l+1] = tmp_a;
+            if ( arr[l] > arr[l + 1] ) {
+                tmp_a      = arr[l];
+                arr[l]     = arr[l + 1];
+                arr[l + 1] = tmp_a;
             }
             // Scan up to find element > a
             j = ir;
-            a = arr[l+1];           // Partitioning element.
-            for (i=l+2; i<=ir; i++) { 
-                if ( arr[i]<a ) 
+            a = arr[l + 1]; // Partitioning element.
+            for ( i = l + 2; i <= ir; i++ ) {
+                if ( arr[i] < a )
                     continue;
-                while ( arr[j]>a )  // Scan down to find element < a.
+                while ( arr[j] > a ) // Scan down to find element < a.
                     j--;
                 if ( j < i )
-                    break;          // Pointers crossed. Exit with partitioning complete.
-                tmp_a = arr[i];     // Exchange elements of both arrays.
+                    break;       // Pointers crossed. Exit with partitioning complete.
+                tmp_a  = arr[i]; // Exchange elements of both arrays.
                 arr[i] = arr[j];
                 arr[j] = tmp_a;
             }
-            arr[l+1] = arr[j];      // Insert partitioning element in both arrays.
-            arr[j] = a;
+            arr[l + 1] = arr[j]; // Insert partitioning element in both arrays.
+            arr[j]     = a;
             jstack += 2;
             // Push pointers to larger subarray on stack, process smaller subarray immediately.
-            if ( ir-i+1 >= j-l ) {
-                istack[jstack] = ir;
-                istack[jstack-1] = i;
-                ir = j-1;
+            if ( ir - i + 1 >= j - l ) {
+                istack[jstack]     = ir;
+                istack[jstack - 1] = i;
+                ir                 = j - 1;
             } else {
-                istack[jstack] = j-1;
-                istack[jstack-1] = l;
-                l = i;
+                istack[jstack]     = j - 1;
+                istack[jstack - 1] = l;
+                l                  = i;
             }
         }
     }
@@ -1841,75 +1232,76 @@ void quicksort(std::vector<T> &x)
 /************************************************************************
 * Function to find the id in a sorted vector                            *
 ************************************************************************/
-inline bool find_id(const std::vector<ThreadPool::thread_id_t> &x_in, const ThreadPool::thread_id_t &id ) 
+inline int find_id( int n, const ThreadPool::thread_id_t *x, const ThreadPool::thread_id_t &id )
 {
-    if ( x_in.empty() )
-        return false;
-    size_t n = x_in.size();
-    const ThreadPool::thread_id_t *x = &x_in[0];   // Use the pointer for speed
-    if ( n<4 ) {
-        for (size_t i=0; i<n; i++) {
-            if ( x[i] == id ) 
-                return true;
-        }
-    }
+    if ( n == 0 ) // Empty list: nothing can match
+        return -1; // -1 is the "not found" result throughout
     // Check if value is within the range of x
     if ( id == x[0] )
-        return true;
+        return 0; // Match on the first entry
     if ( id < x[0] )
-        return false;
-    if ( id == x[n-1] )
-        return true;
-    if ( id > x[n-1] )
-        return false;
+        return -1; // Smaller than x[0], so not in the list
+    if ( id == x[n - 1] )
+        return n-1; // Match on the last entry
+    if ( id > x[n - 1] )
+        return -1; // Larger than x[n-1], so not in the list
     // Perform the search
     size_t lower = 0;
-    size_t upper = n-1;
+    size_t upper = n - 1; // Bisect between the first and last entries (x must be sorted)
     size_t index;
-    while ( (upper-lower) != 1 ) {
-        index = (upper+lower)/2;
+    while ( ( upper - lower ) != 1 ) { // Stop once the bracket is two adjacent entries
+        index = ( upper + lower ) / 2; // Midpoint of the current bracket
         if ( x[index] == id )
-            return true;
+            return index; // Found: position of the match within x
         if ( x[index] >= id )
             upper = index;
         else
             lower = index;
     }
-    return false;
+    return -1; // Bracket collapsed without finding id
 }
 
 
 /************************************************************************
 * Function to add dependencies to the work item                         *
+* Note: when expanding the size of d_ids, we need to allocate space for *
+* one extra entry for a spinlock.                                       *
 ************************************************************************/
-void ThreadPool::WorkItem::add_dependencies( size_t N, const ThreadPool::thread_id_t* ids)
+void ThreadPool::WorkItem::add_dependencies( size_t N, const ThreadPool::thread_id_t *ids )
 {
-    if ( d_tpool_index != -1 ) {
-        // The item has already been added to the threadpool, 
+    if ( d_state!=0 ) {
+        // The item has already been added to the threadpool,
         // we are not allowed to add dependencies
-        throw std::logic_error("Cannot add dependency to work item once it has been added the the threadpool");
+        throw std::logic_error(
+            "Cannot add dependency to work item once it has been added the the threadpool" );
     }
-    if ( static_cast<size_t>(d_N_ids)+N > 0xFFFF ) {
-        throw std::logic_error("Cannot add more than 65000 dependencies");
+    if ( static_cast<size_t>( d_N_ids ) + N + 1 > 0xFFFF ) { // +1: the lock slot must also fit in d_size
+        throw std::logic_error( "Cannot add more than 65000 dependencies" );
     }
-    for (size_t i=0; i<N; i++) {
-        if ( !ids[i].finished() ) {
-            if ( d_N_ids >= d_size ) {
-                thread_id_t* tmp = d_ids;
-                unsigned int N2 = d_size;
-                if ( N2 == 0 ) { N2 = 8; }
-                while ( N2 <= d_N_ids )
-                    N2 *= 2;
-                d_ids = new thread_id_t[N2];
-                for (size_t i=0; i<d_N_ids; i++)
-                    std::swap(d_ids[i],tmp[i]);
-                delete [] tmp;
-                d_size = N2;
+    if ( d_N_ids + N + 1 > d_size ) { // Grow: room for the new ids plus the trailing lock entry
+        thread_id_t *tmp = d_ids;
+        unsigned int N2  = d_size;
+        if ( N2 == 0 ) {
+            N2 = 8;
+        }
+        while ( N2 < d_N_ids + N + 1 )
+            N2 = ( N2 >= 0x8000 ) ? 0xFFFF : 2 * N2; // Double, but never exceed what unsigned short d_size holds
+        d_ids = new thread_id_t[N2];
+        for ( size_t i = 0; i < d_N_ids; i++ )
+            d_ids[i].swap( tmp[i] ); // Move the existing ids into the new storage (caller's ids untouched)
+        delete[] tmp;
+        d_size = N2;
+        int* lock = reinterpret_cast<int*>(&d_ids[d_size-1]); // Last entry is reserved for the spinlock
+        *lock = 0;
+    }
+    const ThreadPool::thread_id_t id0; // Default-constructed id; inputs equal to it are skipped
+    for ( size_t i = 0; i < N; i++ ) {
+        if ( ids[i] != id0 ) {
+            if ( !ids[i].finished() ) { // Only unfinished work is recorded as a dependency
+                d_ids[d_N_ids] = ids[i];
+                d_N_ids++;
             }
-            d_ids[d_N_ids] = ids[i];
-            d_N_ids++;
         }
     }
 }
 
-
diff --git a/threadpool/thread_pool.h b/threadpool/thread_pool.h
old mode 100755
new mode 100644
index f8cf46c4..db3eec9d
--- a/threadpool/thread_pool.h
+++ b/threadpool/thread_pool.h
@@ -1,280 +1,262 @@
 // Copyright © 2004 Mark Berrill. All Rights Reserved. This work is distributed with permission,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#ifndef included_ThreadPool
-#define included_ThreadPool
-#include <stdio.h>
-#include <typeinfo>
+// but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+// PARTICULAR PURPOSE.
+#ifndef included_AtomicModelThreadPool
+#define included_AtomicModelThreadPool
 #include <iostream>
-#include <stdarg.h>
-#include <string.h>
-#include <vector>
-#include <stdexcept>
 #include <map>
+#include <stdarg.h>
+#include <stdexcept>
+#include <stdio.h>
+#include <string.h>
+#include <typeinfo>
+#include <vector>
+#include <mutex>
+#include <thread>
+#include <condition_variable>
+
 
 #include "threadpool/atomic_helpers.h"
+#include "threadpool/atomic_list.h"
 
 
-// Choose the OS 
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// Choose the OS
+#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
     // Using windows
     #define USE_WINDOWS
-    #include <stdlib.h>
-    #include <windows.h>
-    #include <process.h>
-    #define NOMINMAX
-    // Disable warning: the inline specifier cannot be used when a friend 
-    // declaration refers to a specialization of a function template
-    #pragma warning(disable:4396)
-#elif defined(__APPLE__)
+#elif defined( __APPLE__ )
     // Using MAC
-    //   https://developer.apple.com/library/mac/#releasenotes/Performance/RN-AffinityAPI
-    //   http://plugins.svn.wordpress.org/wp-xhprof-profiler/trunk/facebook-xhprof/extension/xhprof..c
     #define USE_MAC
-    #include <unistd.h>
-    #include <mach/mach_init.h>
-    #include <mach/thread_policy.h>
-    #define cpu_set_t thread_affinity_policy_data_t
-    #define CPU_SET(cpu_id, new_mask) \
-        *new_mask.affinity_tag = (cpu_id + 1)
-    #define CPU_ZERO(new_mask)                 \
-        (*(new_mask)).affinity_tag = THREAD_AFFINITY_TAG_NULL
-    #define sched_setaffinity(pid, size, mask)       \
-        thread_policy_set(mach_thread_self(), THREAD_AFFINITY_POLICY, mask, \
-                          THREAD_AFFINITY_POLICY_COUNT)
-    #define sched_getaffinity(pid, size, mask) \
-        thread_policy_get(mach_thread_self(), THREAD_AFFINITY_POLICY, mask, \
-                          THREAD_AFFINITY_POLICY_COUNT)
-    /*
-    #define CPU_ZERO(new_mask) \
-        *new_mask.affinity_tag == THREAD_AFFINITY_TAG_NULL
-    #define SET_AFFINITY(pid, size, mask) \
-        thread_policy_set(mach_thread_self(), THREAD_AFFINITY_POLICY, mask, THREAD_AFFINITY_POLICY_COUNT)
-    #define GET_AFFINITY(pid, size, mask) \
-        thread_policy_get(mach_thread_self(), THREAD_AFFINITY_POLICY, mask, THREAD_AFFINITY_POLICY_COUNT)
-    */
-#elif defined(__linux) || defined(__unix) || defined(__posix)
-    // Using Linux
+#elif defined( __linux ) || defined( __unix ) || defined( __posix )
+    // Using linux
     #define USE_LINUX
-    #include <pthread.h>
-    #include <unistd.h>
 #else
     #error Unknown OS
 #endif
 
 
 // Set some definitions
-#define MAX_NUM_THREADS 128         // The maximum number of threads (must be a multiple of 64)
-#define MAX_QUEUED 1024             // The maximum number of items in the work queue at any moment
-#define MAX_WAIT  128               // The maximum number of active waits at any given time
+#define MAX_NUM_THREADS 128     // The maximum number of threads (must be a multiple of 64)
+#define MAX_QUEUED 1024         // The maximum number of items in the work queue at any moment
+#define MAX_WAIT 16             // The maximum number of active waits at any given time
+#define MAX_WAIT_TIME_DEBUG 600 // The maximum time in a wait command before printing a warning message
+
+#define PROFILE_THREADPOOL_PERFORMANCE 0    // Add profile timers to the threadpool (default is 0)
+#define MONITOR_THREADPOOL_PERFORMANCE 0    // Add detailed performance counters (default is 0)
 
 
+// Check the c++ std
+#if CXX_STD==98
+#error Thread pool class requires c++11 or newer
+#endif
 
 
-/** \class Mutex
- * \brief Functions for locking/unlocking a mutex
- * \details This class provides basic routines for creating, 
- *    locking, and unlocking a mutex <BR>
- *    The lock may be recursive, meaning that the same thread
- *    may lock and unlock the lock multiple times before releasing it.
- *    In this case unlock must be called the same number of times before
- *    another thread may lock the mutex.
- */
-class Mutex {
-public:
-    //! Empty constructor (equivilent to Mutex(false) )
-    Mutex();
-    /** Default constructor
-     * \param recursive     If set to true a thread may repeated lock a mutex.
-     *                      If set to false an attept to repeatedly lock will throw an error.*/
-    Mutex(bool recursive);
-    //! Destructor
-    ~Mutex();
-    //! Copy constructor
-    Mutex(const Mutex &);
-    //! Assignment operator
-    Mutex& operator=(const Mutex&);
-    //! Lock the mutex
-    void lock() const;
-    //! Unlock the mutex
-    void unlock() const;
-    //! Try to lock the mutex and return true if successful
-    bool tryLock() const;
-    //! Return true if we already own the lock
-    bool ownLock() const;
-private:
-    bool d_recursive;               // Is the lock recursive (this attribute cannot be changed)
-    volatile int* d_count;          // Number of copies of the mutex
-    volatile int* d_lock_count;     // Number of times a thread has locked the mutex
-    volatile size_t* d_thread;      // Pointer to the thread id that owns the lock
-    #ifdef USE_WINDOWS
-        CRITICAL_SECTION *d_lock;
-    #elif defined(USE_LINUX) || defined(USE_MAC)
-        pthread_mutex_t *d_lock;
-    #else
-        #error Unknown OS
-    #endif
-friend class ThreadPool;
-};
-
 
 /** \class ThreadPool
  *
- * \brief This is a concrete class that provides for a basic thread pool. 
- * \details This class implements a basic thread pool that can be used for a wide variety of applications.
- * An example call usage is provided below.  The ability to return a value is provided.  Note that there
+ * \brief This is a concrete class that provides for a basic thread pool.
+ * \details This class implements a basic thread pool that can be used for a wide variety of
+ * applications.
+ * An example call usage is provided below.  The ability to return a value is provided.  Note that
+ * there
  * is a small overhead to using this functionality. <BR>
  * <pre>Example: <BR>
  *    Existing function call:
  *       double x = myfun_1(a,b);
  *       double y = myfun_2(c,d); <BR>
- *    Threaded call (processing in parallel): 
- *       thread_id_t ids[2]; 
- *       ids[0] = TPOOL_ADD_WORK( tpool, myfun_1, (a,b) ); 
- *       ids[1] = TPOOL_ADD_WORK( tpool, myfun_2, (c,d) ); 
+ *    Threaded call (processing in parallel):
+ *       thread_id_t ids[2];
+ *       ids[0] = TPOOL_ADD_WORK( tpool, myfun_1, (a,b) );
+ *       ids[1] = TPOOL_ADD_WORK( tpool, myfun_2, (c,d) );
  *       int error = wait_all(2,ids);
  *       double x = getFunctionRet(ids[0]);
  *       double y = getFunctionRet(ids[1]); <BR>
  *   </pre>
  */
-class ThreadPool {
-
-public:
-
-    //! Convience typedef
-    typedef unsigned long long int uint64;
-
-    //! Function to get a unique id for the current thread
-    static inline size_t getThreadId();
-
+class ThreadPool
+{
 
 public:
     ///// Member classes
+    class WorkItem;
 
     /** \class thread_id_t
      *
      * \brief This a class to hold the work item id
      * \details This class hold the id of the work item that is being processed by the thread pool.
-     *      It is created when a work item is added to the thread pool and is used by various routines within the thread pool.
+     *      It is created when a work item is added to the thread pool and is used by various
+     * routines within the thread pool.
      */
-    class thread_id_t {
-        public:
-            //! Empty constructor
-            inline thread_id_t( );
-            //! Destructor
-            inline ~thread_id_t( );
-            //! Copy constructors
-            inline thread_id_t( const thread_id_t& rhs );
-            inline thread_id_t& operator=( const thread_id_t& rhs ) volatile;
-            #ifndef USE_WINDOWS
-                inline thread_id_t( const volatile thread_id_t& rhs );
-                inline thread_id_t& operator=( const thread_id_t& rhs );
-                inline thread_id_t& operator=( const volatile thread_id_t& rhs );
-                inline thread_id_t& operator=( const volatile thread_id_t& rhs ) volatile;
-            #endif
-            // Overload key operators
-            inline bool operator==(const thread_id_t& rhs ) const { return d_id==rhs.d_id; }
-            inline bool operator!=(const thread_id_t& rhs ) const { return d_id!=rhs.d_id; }
-            inline bool operator>=(const thread_id_t& rhs ) const { return d_id>=rhs.d_id; }
-            inline bool operator<=(const thread_id_t& rhs ) const { return d_id<=rhs.d_id; }
-            inline bool operator> (const thread_id_t& rhs ) const { return d_id>rhs.d_id;  }
-            inline bool operator< (const thread_id_t& rhs ) const { return d_id<rhs.d_id;  }
-            inline bool operator==(const volatile thread_id_t& rhs ) const volatile { return d_id==rhs.d_id; }
-            inline bool operator!=(const volatile thread_id_t& rhs ) const volatile { return d_id!=rhs.d_id; }
-            inline bool operator>=(const volatile thread_id_t& rhs ) const volatile { return d_id>=rhs.d_id; }
-            inline bool operator<=(const volatile thread_id_t& rhs ) const volatile { return d_id<=rhs.d_id; }
-            inline bool operator> (const volatile thread_id_t& rhs ) const volatile { return d_id>rhs.d_id;  }
-            inline bool operator< (const volatile thread_id_t& rhs ) const volatile { return d_id<rhs.d_id;  }
-            //! Reset the id back to a NULL id
-            inline void reset() volatile;
-            inline void reset();
-            //! Check if the work has finished
-            inline bool finished( ) const;
-        private:
-            // Default constructor
-            inline void reset( int priority, size_t local_id, void* work );
-            // Get the local id
-            inline size_t getLocalID() const;
-            // Get the priority
-            inline int getPriority() const; 
-            // Check if the id is initialized
-            inline bool initialized() const volatile { return d_id!=0x0FFFFFFFFFFFFFFF; }
-            // Friends
-            friend class ThreadPool;
-            template<typename T> friend void std::swap(T&, T&);
-            // Data
-            uint64 d_id;                                        // 64-bit data to store id
-            AtomicOperations::int32_atomic* volatile d_count;   // Reference count
-            void* d_work;                                       // Pointer to the work item
+    class thread_id_t
+    {
+    public:
+        // Null and maximum thread id values
+        static constexpr uint64_t nullThreadID = 0x0FFFFFFFFFFFFFFF;
+        static constexpr uint64_t maxThreadID  = 0x00FFFFFFFFFFFFFD;
+        //! Empty constructor
+        inline thread_id_t();
+        //! Destructor
+        inline ~thread_id_t();
+        //! Copy/move constructors and assignment operators
+        inline thread_id_t( const volatile thread_id_t &rhs );
+        inline thread_id_t( volatile thread_id_t &&rhs );
+        inline thread_id_t &operator=( const thread_id_t &rhs ) volatile;
+        inline thread_id_t &operator=( volatile thread_id_t &&rhs ) volatile;
+#ifndef USE_WINDOWS
+        inline thread_id_t( const thread_id_t &rhs );
+        inline thread_id_t &operator=( thread_id_t &&rhs );
+        inline thread_id_t &operator=( const thread_id_t &rhs );
+        inline thread_id_t &operator=( const volatile thread_id_t &rhs );
+        inline thread_id_t &operator=( const volatile thread_id_t &rhs ) volatile;
+#endif
+        // Overload key operators (== and != compare only the bits selected by nullThreadID; ordering uses all of d_id)
+        inline bool operator==( const thread_id_t &rhs ) const { return !((d_id^rhs.d_id)&nullThreadID); }
+        inline bool operator!=( const thread_id_t &rhs ) const { return (d_id^rhs.d_id)&nullThreadID; }
+        inline bool operator>=( const thread_id_t &rhs ) const { return d_id >= rhs.d_id; }
+        inline bool operator<=( const thread_id_t &rhs ) const { return d_id <= rhs.d_id; }
+        inline bool operator>(  const thread_id_t &rhs ) const { return d_id  > rhs.d_id; }
+        inline bool operator<(  const thread_id_t &rhs ) const { return d_id  < rhs.d_id; }
+        inline bool operator==( const volatile thread_id_t &rhs ) const volatile { return !((d_id^rhs.d_id)&nullThreadID); }
+        inline bool operator!=( const volatile thread_id_t &rhs ) const volatile { return (d_id^rhs.d_id)&nullThreadID; }
+        inline bool operator>=( const volatile thread_id_t &rhs ) const volatile { return d_id >= rhs.d_id; }
+        inline bool operator<=( const volatile thread_id_t &rhs ) const volatile { return d_id <= rhs.d_id; }
+        inline bool operator>(  const volatile thread_id_t &rhs ) const volatile { return d_id  > rhs.d_id; }
+        inline bool operator<(  const volatile thread_id_t &rhs ) const volatile { return d_id  < rhs.d_id; }
+        //! Reset the id back to a NULL id
+        inline void reset() volatile;
+        inline void reset();
+        //! Check if the work has started (will return true if it has started or finished)
+        inline bool started() const;
+        //! Check if the work has finished
+        inline bool finished() const;
+        //! Swap the id, reference-count pointer and work pointer with rhs
+        inline void swap( thread_id_t &rhs )
+        {
+            std::swap( this->d_id, rhs.d_id );
+            std::swap( this->d_count, rhs.d_count );
+            std::swap( this->d_work, rhs.d_work );
+        }
+        //! Check if thread id is null (exact comparison of d_id with nullThreadID)
+        inline bool isNull( ) const { return d_id==nullThreadID; }
+
+    private:
+        // Reset the internal data to the given values
+        inline void reset( int priority, uint64_t local_id, void *work );
+        static inline uint64_t createId( int priority, uint64_t local_id );
+        // Get the local id
+        inline uint64_t getLocalID() const;
+        // Get the priority
+        inline int getPriority() const;
+        // Increase the priority
+        inline void setPriority( int priority );
+        // Check if the id is initialized (the literal has the same value as nullThreadID)
+        inline bool initialized() const volatile { return d_id != 0x0FFFFFFFFFFFFFFF; }
+        // Get a pointer to the work structure (d_work reinterpreted as a WorkItem)
+        inline WorkItem* work() const { return reinterpret_cast<WorkItem *>( d_work ); }
+        // Is the id ready to process
+        inline bool ready() const;
+        // Friends
+        friend class ThreadPool;
+        // Data
+        uint64_t d_id;                                    // 64-bit data to store id
+        volatile AtomicOperations::int32_atomic *d_count; // Reference count
+        void *d_work;                                     // Pointer to the work item
     };
 
 
     //! Base class for the work item (users should derive from WorkItemRet)
-    class WorkItem {
-        public:
-            //! Function to run the routine
-            virtual void run()=0;
-            //! Will the routine return a result
-            bool has_result() const { return d_has_result; }
-            //! Empty deconstructor
-            virtual ~WorkItem() { delete [] d_ids; d_ids=NULL; d_N_ids=0; d_size=0; }
-            //! Get the number of work ids that this work item depends on
-            inline size_t get_N_dependencies() const { return d_N_ids; }
-            //! Return the list of work ids that we depend on
-            std::vector<ThreadPool::thread_id_t> get_dependencies() const;
-            /*!
-             * \brief Add a work item to the list of dependencies
-             * \param id    Id of the work item to add
-             */
-            void add_dependency( const ThreadPool::thread_id_t& id ) { add_dependencies(1,&id); }
-            /*!
-             * \brief Add a list of work item to the list of dependencies
-             * \param ids   Ids of the work item to add
-             */
-            inline void add_dependencies( const std::vector<ThreadPool::thread_id_t>& ids ) { 
-                if ( !ids.empty() ) { add_dependencies(ids.size(),&ids[0]); }
+    class WorkItem
+    {
+    public:
+        //! Function to run the routine
+        virtual void run() = 0;
+        //! Will the routine return a result
+        virtual bool has_result() const = 0;
+        //! Destructor: frees the dependency list and clears its bookkeeping
+        virtual ~WorkItem()
+        {
+            delete[] d_ids;
+            d_ids   = nullptr;
+            d_N_ids = 0;
+            d_size  = 0;
+        }
+        //! Get the number of work ids that this work item depends on
+        inline size_t get_N_dependencies() const { return d_N_ids; }
+        //! Return the list of work ids that we depend on
+        std::vector<ThreadPool::thread_id_t> get_dependencies() const;
+        /*!
+         * \brief Add a work item to the list of dependencies
+         * \param id    Id of the work item to add
+         */
+        void add_dependency( const ThreadPool::thread_id_t &id ) { add_dependencies( 1, &id ); }
+        /*!
+         * \brief Add a list of work items to the list of dependencies (no-op for an empty list)
+         * \param ids   Ids of the work items to add
+         */
+        inline void add_dependencies( const std::vector<ThreadPool::thread_id_t> &ids )
+        {
+            if ( !ids.empty() ) {
+                add_dependencies( ids.size(), &ids[0] );
             }
-            /*!
-             * \brief Add a list of work item to the list of dependencies
-             * \param N     Number of items to add
-             * \param ids   Ids of the work item to add
-             */
-            void add_dependencies( size_t N, const ThreadPool::thread_id_t* ids);
-        protected:
-            friend class ThreadPool;
-            inline WorkItem(): d_has_result(false), d_state(0), d_tpool_index(-1), d_N_ids(0), d_size(0), d_ids(NULL) {}
-            bool d_has_result;          // Derived classes must set the result flag (true: has a result)
-            volatile char d_state;      // Derived classes must set the state (0: not scheduled, -1: scheduled, 1: started, 2: finished)
-            short int d_tpool_index;    // Index of the item in the thread pool (-1: not added)
-        private:
-            WorkItem(const WorkItem&);          // Private copy constructor
-            WorkItem& operator=(const WorkItem&); // Private assignment operator
-            short unsigned int d_N_ids;         // Number of dependencies
-            short unsigned int d_size;          // Size of d_ids
-            thread_id_t* d_ids;                 // Pointer to id list
+        }
+        /*!
+         * \brief Add a list of work items to the list of dependencies
+         *    Note: this function is thread-safe for the threadpool and does not need blocking.
+         * \param N     Number of items to add
+         * \param ids   Ids of the work items to add
+         */
+        void add_dependencies( size_t N, const ThreadPool::thread_id_t *ids );
+
+    protected:
+        friend class ThreadPool;
+        inline WorkItem():
+              d_state( 0 ),
+              d_N_ids( 0 ),
+              d_size( 0 ),
+              d_count( 0 ),
+              d_ids( nullptr )
+        {
+        }
+
+    private:
+        WorkItem( const WorkItem & );            // Private copy constructor (declared only)
+        WorkItem &operator=( const WorkItem & ); // Private assignment operator (declared only)
+        volatile char d_state;                   // Current state (0: not added to threadpool, 1: queued, 2: started, 3: finished)
+        short unsigned int d_N_ids;              // Number of dependencies stored in d_ids
+        short unsigned int d_size;               // Allocated length of d_ids
+        AtomicOperations::int32_atomic d_count;  // Count used by a thread_id
+        thread_id_t *d_ids;                      // Pointer to id list (released in the destructor)
+        // Friends
+        friend class ThreadPool::thread_id_t;
     };
 
 
     /*!
      * \brief   Class to define a work item returning a variable
-     * \details This is the class that defines a work item to be processed.  Users may derive their own 
+     * \details This is the class that defines a work item to be processed.  Users may derive their
+     * own
      * class and add work using the add_work routine, or can use the TPOOL_ADD_WORK macro.
      * Note: this class is templated on the return argument type and may be a void type.
      */
-    template <typename return_type> 
-    class WorkItemRet: public ThreadPool::WorkItem {
-        public:
-            //! Run the work item
-            virtual void run()=0;
-            //! Return the results
-            return_type get_results() const { return d_result; }
-            //! Virtual destructor
-            virtual ~WorkItemRet() {}
-        protected:
-            return_type d_result;
-            inline WorkItemRet(): WorkItem() { d_has_result = true; }
-        private:
-            WorkItemRet(const WorkItemRet&);            // Private copy constructor
-            WorkItemRet& operator=(const WorkItemRet&); // Private assignment operator
+    template <typename return_type>
+    class WorkItemRet : public ThreadPool::WorkItem
+    {
+    public:
+        //! Run the work item (still pure virtual: concrete work items implement it)
+        virtual void run() override = 0;
+        //! Will the routine return a result (still pure virtual at this level)
+        virtual bool has_result() const override = 0;
+        //! Return a copy of the stored result
+        return_type get_results() const { return d_result; }
+        //! Virtual destructor
+        virtual ~WorkItemRet() {}
+    protected:
+        return_type d_result; // Value handed back by get_results()
+    protected:
+        inline WorkItemRet() { } // Protected: only derived classes can construct
+    private:
+        WorkItemRet( const WorkItemRet & );            // Private copy constructor (declared only)
+        WorkItemRet &operator=( const WorkItemRet & ); // Private assignment operator (declared only)
     };
 
 
@@ -282,14 +264,14 @@ public:
     ///// Member functions
 
     //! Empty constructor
-    ThreadPool() 
+    ThreadPool()
     {
         // Note: we need the constructor in the header to ensure that check_startup
         //       is able to check for changes in the byte alignment
-        check_startup(sizeof(ThreadPool));
-        initialize(0,"none",0,NULL);
-        if ( !is_valid(this) )
-            throw std::logic_error("Thread pool is not valid");
+        check_startup( sizeof( ThreadPool ) ); // Pass the class size as seen by this translation unit
+        initialize( 0, "none", 0, nullptr ); // N = 0, affinity "none", empty processor list
+        if ( !is_valid( this ) )
+            throw std::logic_error( "Thread pool is not valid" ); // Refuse to hand out a pool that fails is_valid
     }
 
 
@@ -301,15 +283,16 @@ public:
      *                          independent - Give each thread an independent set of processors
      * @param procs             The processors to use (defaults to the process affinitiy list)
      */
-    ThreadPool( const int N, const std::string& affinity="none", const std::vector<int>& procs=std::vector<int>() )
+    ThreadPool( const int N, const std::string &affinity = "none",
+        const std::vector<int> &procs = std::vector<int>() )
     {
         // Note: we need the constructor in the header to ensure that check_startup
         //       is able to check for changes in the byte alignment
-        check_startup(sizeof(ThreadPool));
-        const int* procs2 = procs.empty() ? NULL:(&procs[0]);
-        initialize(N,affinity.c_str(),procs.size(),procs2);
-        if ( !is_valid(this) )
-            throw std::logic_error("Thread pool is not valid");
+        check_startup( sizeof( ThreadPool ) ); // Pass the class size as seen by this translation unit
+        const int *procs2 = procs.empty() ? nullptr : ( &procs[0] ); // Avoid indexing an empty vector
+        initialize( N, affinity.c_str(), (int) procs.size(), procs2 ); // Forward as C-style (count, pointer) arguments
+        if ( !is_valid( this ) )
+            throw std::logic_error( "Thread pool is not valid" ); // Refuse to hand out a pool that fails is_valid
     }
 
 
@@ -363,15 +346,11 @@ public:
     int getNumThreads() const { return d_N_threads; }
 
 
-    //! Function to return the number of items in the queue (including processing items)
-    int getQueueSize() const { return d_queue_size+d_num_active; }
-
-
     /*!
      * \brief   Function to set the number of threads in the thread pool
      * \details  This function will change the number of worker threads in the ThreadPool
      *   to the number specified.  This function will immediately change the number of threads
-     *   in the ThreadPool without checking the existing work unless the desired number of 
+     *   in the ThreadPool without checking the existing work unless the desired number of
      *   threads is 0.  In this case, the function will wait for all work items to finish
      *   before deleting the existing work threads.
      *   Member threads may not call this function.
@@ -381,11 +360,11 @@ public:
      *                          independent - Give each thread an independent set of processors
      * @param procs             The processors to use (defaults to the process affinitiy list)
      */
-    inline void setNumThreads( const int N, const std::string& affinity="none", 
-        const std::vector<int>& procs=std::vector<int>() )
+    inline void setNumThreads( const int N, const std::string &affinity = "none",
+        const std::vector<int> &procs = std::vector<int>() )
     {
-        const int* procs2 = procs.empty() ? NULL:(&procs[0]);
-        setNumThreads(N,affinity.c_str(),procs.size(),procs2);
+        const int *procs2 = procs.empty() ? nullptr : ( &procs[0] ); // Avoid indexing an empty vector
+        setNumThreads( N, affinity.c_str(), (int) procs.size(), procs2 ); // Delegate to the (count, pointer) overload
     }
 
 
@@ -399,40 +378,53 @@ public:
 
     //! Function to check if the work item is valid
     /*!
-     * This function checks if the work item has a valid id.  
-     *   Note: this function does not require blocking and will return immediately.  
+     * This function checks if the work item has a valid id.
+     *   Note: this function does not require blocking and will return immediately.
      * @param id                The id of the work item
      */
-    inline bool isValid(const thread_id_t& id) const;
+    inline bool isValid( const thread_id_t &id ) const;
 
 
     /*!
      * \brief    Function to check if the work item has finished processing
-     * \details  This function checks if the work item has finished processing. 
+     * \details  This function checks if the work item has finished processing.
      * @param id                The id of the work item
      */
-    bool isFinished(thread_id_t id) const;
+    inline bool isFinished( thread_id_t& id ) const { return id.finished(); }
 
 
     /*!
      * \brief   Function to get the returned function value
      * \details This is the function returns the value that was returned from the working function.
-     *   If the work item has not finished or was not found it will return 0.  
+     *   If the work item has not finished or was not found it will return 0.
      * @param id                The id of the work item
      */
-    template <class return_type> 
-    inline return_type getFunctionRet(const thread_id_t& id) const;
+    template <class return_type>
+    inline return_type getFunctionRet( const thread_id_t &id ) const;
+
+
+    /*!
+     * \brief   Function to create a work item
+     * \details This function creates a work item that can be added to the queue
+     * @param routine           Pointer to the function for the work item
+     * @param args              The arguments to pass to the function
+     * @return                  Pointer to the new work item.  Note that the threadpool will
+     *                          automatically destroy the item when finished
+     */
+    template <class Ret, class... Args>
+    static inline WorkItem* createWork( Ret( *routine )( Args... ), Args... args );
+
 
-    
     /*!
      * \brief   Function to add a work item
      * \details This function adds a work item to the queue
      *   Note: any thread may call this routine.
      * @param work              Pointer to the work item to add
-     *                          Note that the threadpool will automatically destroy the item when finished
+     *                          Note that the threadpool will automatically destroy the item when
+     * finished
      * @param priority          A value indicating the priority of the work item (0-default)
      */
-    inline thread_id_t add_work( ThreadPool::WorkItem* work, int priority=0);
+    inline thread_id_t add_work( ThreadPool::WorkItem *work, int priority = 0 );
 
 
     /*!
@@ -440,26 +432,28 @@ public:
      * \details This function adds multiple work item to the queue
      *   Note: any thread may call this routine.
      * @param work              Vector of pointers to the work items to add
-     *                          Note that the threadpool will automatically destroy the item when finished
+     *                          Note that the threadpool will automatically destroy the item when
+     * finished
      * @param priority          Vector of values indicating the priority of the work items
      */
-    inline std::vector<thread_id_t> add_work( const std::vector<ThreadPool::WorkItem*>& work, 
-        const std::vector<int>& priority=std::vector<int>() );
+    inline std::vector<thread_id_t> add_work( const std::vector<ThreadPool::WorkItem *> &work,
+        const std::vector<int> &priority = std::vector<int>() );
 
 
     /*!
      * \brief   Function to wait until a specific work item has finished
-     * \details This is the function waits for a specific work item to finished.  It returns 0 if successful.
+     * \details This function waits for a specific work item to finish.  It returns 0 if
+     * successful.
      *   Note: any thread may call this routine, but they will block until finished.
      *   For worker threads this may eventually lead to a deadlock.
      * @param id                The work item to wait for
      */
-    inline int wait(thread_id_t id) const;
+    inline int wait( thread_id_t id ) const;
 
 
     /*!
      * \brief   Function to wait until any of the given work items have finished their work
-     * \details This is the function waits for any of the given work items to finish. 
+     * \details This is the function waits for any of the given work items to finish.
      *   If successful it returns the index of a finished work item (the index in the array ids).
      *   If unseccessful it will return -1.
      *   Note: any thread may call this routine, but they will block until finished.
@@ -467,46 +461,59 @@ public:
      * @param N_work            The number of work items
      * @param ids               Array of work items to wait for
      */
-    inline int wait_any(size_t N_work, const thread_id_t *ids);
+    inline int wait_any( size_t N_work, const thread_id_t *ids );
 
 
     /*!
      * \brief   Function to wait until any of the given work items have finished their work
-     * \details This is the function waits for any of the given work items to finish. 
+     * \details This is the function waits for any of the given work items to finish.
      *   If successful it returns the index of a finished work item (the index in the array ids).
      *   If unseccessful it will return -1.
      *   Note: any thread may call this routine, but they will block until finished.
      *   For worker threads this may eventually lead to a deadlock.
      * @param ids               Vector of work items to wait for
      */
-    inline int wait_any(const std::vector<thread_id_t>& ids) const;
+    inline int wait_any( const std::vector<thread_id_t> &ids ) const;
 
 
     /*!
      * \brief   Function to wait until all of the given work items have finished their work
-     * \details This is the function waits for all given of the work items to finish.  It returns 0 if successful.
+     * \details This function waits for all of the given work items to finish.  It returns 0
+     * if successful.
      *   Note: any thread may call this routine, but they will block until finished.
      *   For worker threads this may eventually lead to a deadlock.
      * @param N_work            The number of work items
      * @param ids               Array of work items to wait for
      */
-    inline int wait_all(size_t N_work, const thread_id_t *ids) const;
+    inline int wait_all( size_t N_work, const thread_id_t *ids ) const;
 
 
     /*!
      * \brief   Function to wait until all of the given work items have finished their work
-     * \details This is the function waits for all given of the work items to finish.  It returns 0 if successful.
+     * \details This function waits for all of the given work items to finish.  It returns 0
+     * if successful.
      *   Note: any thread may call this routine, but they will block until finished.
      *   For worker threads this may eventually lead to a deadlock.
      * @param ids               Vector of work items to wait for
      */
-    inline int wait_all(const std::vector<thread_id_t>& ids) const;
+    inline int wait_all( const std::vector<thread_id_t> &ids ) const;
+
+
+    /*!
+     * \brief   Function to wait until some of the given work items have finished their work
+     * \details This function waits for some of the given work items to finish.
+     *   If successful it returns the indices of the finished work items (indices into the array ids).
+     *   Note: any thread may call this routine, but they will block until finished.
+     *   For worker threads this may eventually lead to a deadlock.
+     * @param ids               Vector of work items to wait for
+     */
+    inline std::vector<int> wait_some( int N_wait, const std::vector<thread_id_t> &ids ) const;
 
 
     /*!
      * \brief   Function to wait until all work items in the thread pool have finished their work
-     * \details This function will wait until all work has finished.  
-     *   Note: member threads may not call this function.  
+     * \details This function will wait until all work has finished.
+     *   Note: member threads may not call this function.
      *   Only one non-member thread should call this routine at a time.
      */
     void wait_pool_finished() const;
@@ -517,11 +524,11 @@ public:
      * \details Sometimes it is necessary to work with raw pointers for the thread pool.
      *    If the thread pool is invalid and used, the program will likely fail catastrophically.
      *    This function checks if the thread pool is valid is a relatively safe manner.
-     *    If the thread pool is pointing to an invalid memory address, because it has been 
+     *    If the thread pool is pointing to an invalid memory address, because it has been
      *    freed, never allocated, or otherwise corrupted, this function will return false.
      * @param tpool         Pointer to the ThreadPool to check
      */
-    static bool is_valid( const ThreadPool* tpool );
+    static bool is_valid( const ThreadPool *tpool );
 
 
     /*!
@@ -529,80 +536,77 @@ public:
      * \details Some of the functions such as setting/getting the thread affinities
      *      are not supported on all platforms.  This function controls the behavior
      *      of these functions on systems where they are not supported.  The default
-     *      behavior is to print a warning message.  Other options include ignoring 
+
+     *      behavior is to print a warning message.  Other options include ignoring
      *      the messages (the functions will return empty sets), or throwing an exception.
      *      Note: this is a global property and will affect all thread pools in an application.
      * @param behavior      The behavior of OS specific messages/errors
      *                      0: Print a warning message
+
      *                      1: Ignore the messages
      *                      2: Throw an error
      */
-    static void set_OS_warnings( int behavior=0 );
+    static void set_OS_warnings( int behavior = 0 );
 
 
+    //! Return the number of items queued
+    int N_queued( ) const { return d_queue_list.size(); }
+
 private:
-
-    friend class ThreadPoolData;
-
-    // Convience typedefs
-    #ifdef USE_WINDOWS
-        typedef HANDLE wait_type;
-    #elif defined(USE_LINUX) || defined(USE_MAC)
-        typedef pthread_cond_t* wait_type;
-    #else
-        #error Unknown OS
-    #endif    
-
+    typedef AtomicOperations::int32_atomic int32_atomic;
 
 private:
     ///// Member data structures
 
-    // Structure to store properties for each work item (linked list)
-    struct queue_list_struct {
-        short int position;             // Position of the work item in the list
-        short int prev;                 // Next item in the list
-        short int next;                 // Next item in the list
-        queue_list_struct(): position(-1), prev(-1), next(-1) {}
-        inline void reset() volatile { prev=-1; next=-1; }
-        inline void reset() { prev=-1; next=-1; }
-        private:
-            queue_list_struct( const queue_list_struct& );
-            queue_list_struct& operator=( const queue_list_struct& );
+   
+    // Implementation of condition_variable which does not require the caller to hold a lock
+    class condition_variable
+    {
+      public:
+        condition_variable() { }
+        ~condition_variable() { }
+        inline void wait() const { std::unique_lock<std::mutex> lock(d_mutex); d_cv.wait(lock); }
+        inline void wait_for( double seconds ) const
+        {
+            std::unique_lock<std::mutex> lock(d_mutex);
+            if ( seconds < 4e-6 )
+                d_cv.wait_for(lock,std::chrono::nanoseconds(static_cast<int>(1e9*seconds)));
+            else if ( seconds < 4e-3 )
+                d_cv.wait_for(lock,std::chrono::microseconds(static_cast<int>(1e6*seconds)));
+            else if ( seconds < 4 )
+                d_cv.wait_for(lock,std::chrono::milliseconds(static_cast<int>(1e3*seconds)));
+            else
+                d_cv.wait_for(lock,std::chrono::seconds(static_cast<int>(seconds)));
+        }
+        inline void notify_one() const { d_cv.notify_one(); }
+        inline void notify_all() const { d_cv.notify_all(); }
+      private:
+        mutable std::condition_variable d_cv;
+        mutable std::mutex d_mutex;
     };
 
-    // Structure to store a pool of wait events (thread safe)
-    struct wait_pool_struct {
-        wait_pool_struct( );
-        ~wait_pool_struct( );
-        void push( wait_type event );
-        wait_type pop();
-        private:
-            volatile unsigned int d_count;
-            volatile unsigned int d_size;
-            volatile wait_type *d_pool;
-            #ifdef USE_WINDOWS
-                CRITICAL_SECTION *d_lock;
-            #elif defined(USE_LINUX) || defined(USE_MAC)
-                pthread_mutex_t *d_lock;
-            #else
-                #error Unknown OS
-            #endif
-            wait_pool_struct& operator=( const wait_pool_struct& );
-            wait_pool_struct( const wait_pool_struct& );
-    };
 
-    // Structure to store wait events (note: both the constructor and destructor are NOT thread safe and must be blocked)
-    struct wait_event_struct {
-        int count;                          // The number of work items that must finish before we alert the thread
-        size_t ThreadId;                    // Id of the waiting thread
-        std::vector<thread_id_t> ids;       // The ids we are waiting on
-        wait_type wait_event;               // Handle to a wait event
-        wait_event_struct( wait_pool_struct* wait_pool );
-        ~wait_event_struct( );
-        private:
-            wait_pool_struct* d_wait_pool;
-            wait_event_struct( );
-            wait_event_struct( const wait_event_struct& );
+    // Structure to wait on multiple ids
+    // Note: this is thread safe without blocking as long as it is added to the wait list
+    //    before calling wait
+    class wait_ids_struct {
+      public:
+        wait_ids_struct( size_t N, const ThreadPool::thread_id_t *ids, size_t N_wait,
+            AtomicOperations::pool<condition_variable,128>& cv_pool, int N_wait_list, volatile wait_ids_struct **list );
+        ~wait_ids_struct( ) { d_cv_pool.put( d_wait_event ); delete [] d_finished; delete [] d_ids; }
+        void id_finished( const ThreadPool::thread_id_t& id ) const;
+        bool wait_for( double seconds );
+      private:
+        mutable int d_wait;                     // The number of work items that must finish before we alert the thread
+        mutable int d_N;                        // The number of ids we are waiting on
+        mutable thread_id_t *d_ids;             // The ids we are waiting on
+        AtomicOperations::pool<condition_variable,128>& d_cv_pool;
+        condition_variable *d_wait_event;       // Handle to a wait event
+        volatile mutable bool *d_finished;      // Has each id finished
+        volatile mutable wait_ids_struct **d_ptr;
+        wait_ids_struct();
+        wait_ids_struct( const wait_ids_struct& );
+        wait_ids_struct& operator=( const wait_ids_struct & );
     };
 
 
@@ -610,26 +614,26 @@ private:
     ///// Member functions
 
     // Copy constructors ( we do not want the user to be able to copy the thread pool)
-    ThreadPool(const ThreadPool&);
-    ThreadPool& operator=(const ThreadPool&);
+    ThreadPool( const ThreadPool & );
+    ThreadPool &operator=( const ThreadPool & );
 
     // Function to initialize the thread pool
-    void setNumThreads( int N, const char* affinity, int N_procs, const int* procs );
-    void initialize(int N, const char* affinity, int N_procs, const int* procs);
-    void check_startup(size_t size0);
+    void setNumThreads( int N, const char *affinity, int N_procs, const int *procs );
+    void initialize( int N, const char *affinity, int N_procs, const int *procs );
+    void check_startup( size_t size0 );
 
     // Function to add an array of work items
-    void add_work(size_t N, ThreadPool::WorkItem* work[], const int* priority, ThreadPool::thread_id_t* id);
-        
+    void add_work(
+        size_t N, ThreadPool::WorkItem *work[], const int *priority, ThreadPool::thread_id_t *id );
+    inline void add_work( const ThreadPool::thread_id_t& id );
+
     // Function to get a work item that has finished
-    WorkItem* getFinishedWorkItem(ThreadPool::thread_id_t id) const;
-        
+    WorkItem *getFinishedWorkItem( ThreadPool::thread_id_t id ) const;
+
     // This function provides a wrapper (needed for the threads)
-    static void create_new_thread(void *arglist) {
-        void **tmp = (void **) arglist;
-        ThreadPool *call = reinterpret_cast<ThreadPool*>(tmp[0]);
-        int id = static_cast<int>(reinterpret_cast<size_t>(tmp[1]));
-        call->tpool_thread(id);
+    static inline void create_new_thread( ThreadPool *tpool, int id )
+    {
+        tpool->tpool_thread( id );
     }
 
     /* This is the function that controls the individual thread and allows it to do work.
@@ -637,73 +641,45 @@ private:
      * param thread_init - Structure address contining the startup information for the thread */
     void tpool_thread( int id );
 
-    // Some functions/variables used to get/test the unique work ids 
-    inline void initialize_id();                    // A simple function to initialize the id (should only be called once)
-    inline size_t advance_id();                     // A simple function to advance the return the id and advance (thread-safe)
-
     // Function to check if the current thread is a member of the thread pool
-    inline bool isMemberThread() const;
+    inline bool isMemberThread() const { return getThreadNumber()>=0; }
 
     // Function to wait for some work items to finish
-    int wait_some(size_t N_work, const thread_id_t *ids, size_t N_wait, bool *finished) const;
-
-    // Helper functions to get the next availible item in the work queue
-    inline short int get_work_item( );
-    static inline short int check_dependecies( const ThreadPool::queue_list_struct *list,
-        const thread_id_t *ids, short int index );
-
+    int wait_some( size_t N_work, const thread_id_t *ids, size_t N_wait, bool *finished ) const;
+    
+    // Check if we are waiting too long and print debug info
+    void check_wait_time( std::chrono::time_point<std::chrono::high_resolution_clock>& t1 ) const;
 
 private:
     ///// Member data
-    // Note: We want to store the variables in a certain order to optimize storage 
+    typedef AtomicOperations::int64_atomic atomic_64;
+    typedef AtomicList<thread_id_t,MAX_QUEUED,std::greater<thread_id_t>> queue_type;
+    // Note: We want to store the variables in a certain order to optimize storage
     //   and ensure consistent packing / object size
-    size_t d_NULL_HEAD;                                 // Null data buffer to check memory bounds
-    volatile AtomicOperations::int64_atomic d_id_assign; // An internal variable used to store the current id to assign
-    volatile mutable bool d_signal_empty;               // Do we want to send a signal when the queue is empty
-    volatile mutable unsigned char d_signal_count;      // Do we want to send a signal when the count drops to zero
-    short int d_N_threads;                              // Number of threads
-    volatile short int d_num_active;                    // Number of threads that are currently active
-    volatile short int d_queue_head;                    // Index to work queue head
-    volatile short int d_queue_free;                    // Index to free queue item
-    volatile int d_queue_size;                          // Number of items in the work queue
-    volatile mutable int d_N_wait;                      // The number of threads waiting
-    size_t d_ThreadId[MAX_NUM_THREADS];                 // Unique id for each thread
-    volatile uint64 d_active[MAX_NUM_THREADS/64];       // Which threads are currently active
-    volatile uint64 d_cancel[MAX_NUM_THREADS/64];       // Which threads should be deleted
-    thread_id_t volatile d_queue_ids[MAX_QUEUED];       // List of ids in the work queue
-    queue_list_struct volatile d_queue_list[MAX_QUEUED]; // Work queue list
-    volatile mutable wait_event_struct* d_wait[MAX_WAIT]; // The wait events to check
-    wait_type d_wait_finished;                          // Handle to a wait event that indicates all threads have finished work
-    mutable wait_pool_struct wait_pool;                 // Pool of wait events that we can use
-    #ifdef USE_WINDOWS
-        CRITICAL_SECTION *d_lock_queue;                 // Mutex lock for changing the queue
-        HANDLE d_hThread[MAX_NUM_THREADS];              // Handles to the threads
-    #elif defined(USE_LINUX) || defined(USE_MAC)
-        pthread_mutex_t *d_lock_queue;                  // Mutex lock for changing the queue
-        pthread_t d_hThread[MAX_NUM_THREADS];           // Handles to the threads 
-        wait_type d_queue_not_empty;                    // Event condition 
-    #else
-        #error Unknown OS
-    #endif
-    size_t d_NULL_TAIL;                                 // Null data buffer to check memory bounds
+    size_t d_NULL_HEAD;                     // Null data buffer to check memory bounds
+    volatile atomic_64 d_id_assign;         // An internal variable used to store the current id to assign
+    volatile mutable bool d_signal_empty;   // Do we want to send a signal when the queue is empty
+    volatile mutable int32_atomic d_signal_count; // Signal count
+    short int d_N_threads;                  // Number of threads
+    volatile int32_atomic d_num_active;     // Number of threads that are currently active
+    volatile atomic_64 d_active[MAX_NUM_THREADS/64]; // Which threads are currently active
+    volatile atomic_64 d_cancel[MAX_NUM_THREADS/64]; // Which threads should be deleted
+    volatile atomic_64 d_N_added;           // Number of items added to the work queue
+    volatile atomic_64 d_N_started;         // Number of items started
+    volatile atomic_64 d_N_finished;        // Number of items finished
+    volatile mutable wait_ids_struct *d_wait[MAX_WAIT]; // The wait events to check
+    mutable wait_ids_struct *d_wait_last;   // A cached copy of the last completed wait event (in case a thread still has a reference)
+    condition_variable d_wait_finished;     // Condition variable to signal when all work is finished
+    condition_variable d_wait_work;         // Condition variable to signal when there is new work
+    mutable AtomicOperations::pool<condition_variable,128> d_cond_pool;
+    std::thread d_thread[MAX_NUM_THREADS];  // Handles to the threads
+    std::thread::id d_threadId[MAX_NUM_THREADS]; // Unique id for each thread
+    queue_type d_queue_list;                // The work queue
+    size_t d_NULL_TAIL;                     // Null data buffer to check memory bounds
 };
 
 
-
-// Swap the contents of the two ids
-namespace std {
-    template<> inline void swap<ThreadPool::thread_id_t>( 
-        ThreadPool::thread_id_t& a, ThreadPool::thread_id_t& b )
-    { 
-        std::swap(a.d_id,b.d_id);
-        std::swap(a.d_count,b.d_count);
-        std::swap(a.d_work,b.d_work);
-    }
-}
-
-#include "thread_pool.hpp"
+#include "threadpool/thread_pool.hpp"
 
 
 #endif
-
-
diff --git a/threadpool/thread_pool.hpp b/threadpool/thread_pool.hpp
index 986257ea..60840ec1 100644
--- a/threadpool/thread_pool.hpp
+++ b/threadpool/thread_pool.hpp
@@ -1,9 +1,10 @@
 // This file contains the template functions for the thread pool
 #ifndef included_ThreadPoolTmpl
 #define included_ThreadPoolTmpl
-#include "thread_pool.h"
+#include "threadpool/thread_pool.h"
+#include <functional>
 #include <stdexcept>
-
+#include <tuple>
 
 
 
@@ -14,23 +15,24 @@
 
 /*! \def id = TPOOL_ADD_WORK(tpool,function,args,priority)
  *  \brief Add an item to the thread pool
- *  \details This a macro to automatically create and add a work item to 
- *      the thread pool.  
+ *  \details This is a macro to automatically create and add a work item to the thread pool.
  *  \param tpool        Pointer to the thread pool to use
  *  \param function     Pointer to the function to use
  *  \param args         The arguments to pass to the function in the form (arg1,arg2,...)
  *  \param priority     Optional argument specifying the priority of the work item
  */
-#define TPOOL_TUPLE_TO_SEQ(t) TPOOL_TUPLE_TO_SEQ_ ## II t
-#define TPOOL_TUPLE_TO_SEQ_II(a,...) a,##__VA_ARGS__
+#define TPOOL_TUPLE_TO_SEQ( t ) TPOOL_TUPLE_TO_SEQ_##II t
+#define TPOOL_TUPLE_TO_SEQ_II( a, ... ) a, ##__VA_ARGS__
 #ifdef USE_WINDOWS
-    #define TPOOL_GET_PRIORITY(a,N,c,...) N 
-    #define TPOOL_ADD_WORK(TPOOL,FUNCTION,ARGS,...) \
-        ThreadPool_add_work(TPOOL,FUNCTION,TPOOL_TUPLE_TO_SEQ(ARGS),TPOOL_GET_PRIORITY(0,__VA_ARGS__,0,0)+0)
+#define TPOOL_GET_PRIORITY( a, N, c, ... ) N
+#define TPOOL_ADD_WORK( TPOOL, FUNCTION, ARGS, ... )                                      \
+    ThreadPool_add_work( TPOOL, TPOOL_GET_PRIORITY( 0, __VA_ARGS__, 0, 0 ) + 0, FUNCTION, \
+        TPOOL_TUPLE_TO_SEQ( ARGS ) )
 #else
-    #define TPOOL_GET_PRIORITY(_0,N,...) N 
-    #define TPOOL_ADD_WORK(TPOOL,FUNCTION,ARGS,...) \
-        ThreadPool_add_work(TPOOL,FUNCTION,TPOOL_TUPLE_TO_SEQ(ARGS),TPOOL_GET_PRIORITY(_0,##__VA_ARGS__,0))
+#define TPOOL_GET_PRIORITY( _0, N, ... ) N
+#define TPOOL_ADD_WORK( TPOOL, FUNCTION, ARGS, ... ) \
+    ThreadPool_add_work(                             \
+        TPOOL, TPOOL_GET_PRIORITY( _0, ##__VA_ARGS__, 0 ), FUNCTION, TPOOL_TUPLE_TO_SEQ( ARGS ) )
 #endif
 
 /*! @} */
@@ -38,849 +40,140 @@
 // \cond HIDDEN_SYMBOLS
 
 
+
+// Unpack a tuple and call a function (C++11 stand-ins for std::index_sequence and std::apply)
+template <int...>  // the template arguments are the index list; the struct holds no data
+struct index_tuple {
+};
+template <int I, typename IndexTuple, typename... Types>  // I: next index to append
+struct make_indexes_impl;                                 // primary template, declaration only
+template <int I, int... Indexes, typename T, typename... Types>  // step: drop T, append I
+struct make_indexes_impl<I, index_tuple<Indexes...>, T, Types...> {
+    typedef typename make_indexes_impl<I + 1, index_tuple<Indexes..., I>, Types...>::type type;
+};
+template <int I, int... Indexes>  // end of the recursion: no types left
+struct make_indexes_impl<I, index_tuple<Indexes...>> {
+    typedef index_tuple<Indexes...> type;  // the accumulated index list
+};
+template <typename... Types>  // make_indexes<T0,...,Tn-1>::type is index_tuple<0,...,n-1>
+struct make_indexes : make_indexes_impl<0, index_tuple<>, Types...> {
+};
+template <class Ret, class... Args, int... Indexes>  // Indexes: positions of the tuple elements
+inline Ret apply_helper( Ret ( *pf )( Args... ), index_tuple<Indexes...>, std::tuple<Args...> &&tup )
+{
+    return pf( std::forward<Args>( std::get<Indexes>( tup ) )... );  // one std::get per argument
+}
+template <class Ret, class... Args>  // const-reference overload: works on a copy of tup
+inline Ret apply( Ret ( *pf )( Args... ), const std::tuple<Args...> &tup )
+{
+    return apply_helper( pf, typename make_indexes<Args...>::type(), std::tuple<Args...>( tup ) );
+}
+template <class Ret, class... Args>  // rvalue overload: tup is passed on without a copy
+inline Ret apply( Ret ( *pf )( Args... ), std::tuple<Args...> &&tup )
+{
+    return apply_helper(
+        pf, typename make_indexes<Args...>::type(), std::forward<std::tuple<Args...>>( tup ) );
+}
+
+
 // Specialization for no return argument
-template <> 
-class ThreadPool::WorkItemRet<void>: public ThreadPool::WorkItem {
+template <>
+class ThreadPool::WorkItemRet<void> : public ThreadPool::WorkItem
+{
 public:
-    virtual void run()=0;
-    void get_results() { }
+    virtual void run() override = 0;  // still pure virtual here; WorkItemFull provides the body
+    virtual bool has_result() const override { return false; }  // the void case reports no result
+    void get_results() {}  // nothing to return for void
     virtual ~WorkItemRet() {}
 };
 
 
 // Final class for the work item
-struct NULL_data {};
-template < typename return_type, 
-    typename  arg1=NULL_data,  typename  arg2=NULL_data,  typename  arg3=NULL_data,  typename  arg4=NULL_data, 
-    typename  arg5=NULL_data,  typename  arg6=NULL_data,  typename  arg7=NULL_data,  typename  arg8=NULL_data,
-    typename  arg9=NULL_data,  typename arg10=NULL_data,  typename arg11=NULL_data,  typename arg12=NULL_data, 
-    typename arg13=NULL_data,  typename arg14=NULL_data,  typename arg15=NULL_data,  typename arg16=NULL_data,
-    typename arg17=NULL_data,  typename arg18=NULL_data,  typename arg19=NULL_data,  typename arg20=NULL_data,
-    typename arg21=NULL_data,  typename arg22=NULL_data,  typename arg23=NULL_data,  typename arg24=NULL_data > 
-class WorkItemFull: public ThreadPool::WorkItemRet<return_type> 
+template <class Ret, class... Args>  // Ret: return type of the routine, Args: its parameter types
+class WorkItemFull;                  // declared first; the void specialization follows
+template <class... Args>
+class WorkItemFull<void, Args...> : public ThreadPool::WorkItemRet<void>
 {
 private:
-    int N;
-    return_type (*routine)();
-    arg1 x1;
-    arg2 x2;
-    arg3 x3;
-    arg4 x4;
-    arg5 x5;
-    arg6 x6;
-    arg7 x7;
-    arg8 x8;
-    arg9 x9;
-    arg10 x10;
-    arg11 x11;
-    arg12 x12;
-    arg13 x13;
-    arg14 x14;
-    arg15 x15;
-    arg16 x16;
-    arg17 x17;
-    arg18 x18;
-    arg19 x19;
-    arg20 x20;
-    arg21 x21;
-    arg22 x22;
-    arg23 x23;
-    arg24 x24;
+    void ( *routine )( Args... );  // function invoked by run()
+    std::tuple<Args...> args;      // arguments captured by the constructor
     WorkItemFull();
+
 public:
-    WorkItemFull( return_type (*routine2)() ):
-        ThreadPool::WorkItemRet<return_type>(), N(0),
-        routine(reinterpret_cast<return_type(*)()>(routine2)) { }
-    WorkItemFull( return_type (*routine2)(arg1), arg1 y1):
-        ThreadPool::WorkItemRet<return_type>(), N(1),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2), arg1 y1, arg2 y2 ):
-        ThreadPool::WorkItemRet<return_type>(), N(2),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3), arg1 y1, arg2 y2, arg3 y3 ):
-        ThreadPool::WorkItemRet<return_type>(), N(3),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4), arg1 y1, arg2 y2, arg3 y3, arg4 y4 ):
-        ThreadPool::WorkItemRet<return_type>(), N(4),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5 ):
-        ThreadPool::WorkItemRet<return_type>(), N(5),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6 ):
-        ThreadPool::WorkItemRet<return_type>(), N(6),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7 ):
-        ThreadPool::WorkItemRet<return_type>(), N(7),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8 ):
-        ThreadPool::WorkItemRet<return_type>(), N(8),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9 ):
-        ThreadPool::WorkItemRet<return_type>(), N(9),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10 ):
-        ThreadPool::WorkItemRet<return_type>(), N(10),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11 ):
-        ThreadPool::WorkItemRet<return_type>(), N(11),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12 ):
-        ThreadPool::WorkItemRet<return_type>(), N(12),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13 ):
-        ThreadPool::WorkItemRet<return_type>(), N(13),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14 ):
-        ThreadPool::WorkItemRet<return_type>(), N(14),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15 ):
-        ThreadPool::WorkItemRet<return_type>(), N(15),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16 ):
-        ThreadPool::WorkItemRet<return_type>(), N(16),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17 ):
-        ThreadPool::WorkItemRet<return_type>(), N(17),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18 ):
-        ThreadPool::WorkItemRet<return_type>(), N(18),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19 ):
-        ThreadPool::WorkItemRet<return_type>(), N(19),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19, arg20 y20 ):
-        ThreadPool::WorkItemRet<return_type>(), N(20),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19), x20(y20) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19, arg20 y20, arg21 y21 ):
-        ThreadPool::WorkItemRet<return_type>(), N(21),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19), x20(y20), x21(y21) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19, arg20 y20, arg21 y21, arg22 y22 ):
-        ThreadPool::WorkItemRet<return_type>(), N(22),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19), x20(y20), x21(y21), x22(y22) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19, arg20 y20, arg21 y21, arg22 y22, arg23 y23 ):
-        ThreadPool::WorkItemRet<return_type>(), N(23),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19), x20(y20), x21(y21), x22(y22), x23(y23) { }
-    WorkItemFull( return_type (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23,arg24), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19, arg20 y20, arg21 y21, arg22 y22, arg23 y23, arg24 y24 ):
-        ThreadPool::WorkItemRet<return_type>(), N(24),
-        routine(reinterpret_cast<return_type(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19), x20(y20), x21(y21), x22(y22), x23(y23), x24(y24) { }
-    void run() {
-        ThreadPool::WorkItem::d_state = 1;
-        if ( N==0 )
-            this->d_result = reinterpret_cast<return_type(*)()>(routine)();
-        else if ( N==1 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1)>(routine)(x1);
-        else if ( N==2 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2)>(routine)(x1,x2);
-        else if ( N==3 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3)>(routine)(x1,x2,x3);
-        else if ( N==4 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4)>(routine)(x1,x2,x3,x4);
-        else if ( N==5 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5)>(routine)(x1,x2,x3,x4,x5);
-        else if ( N==6 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6)>(routine)(x1,x2,x3,x4,x5,x6);
-        else if ( N==7 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7)>(routine)(x1,x2,x3,x4,x5,x6,x7);
-        else if ( N==8 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8);
-        else if ( N==9 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9);
-        else if ( N==10 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10);
-        else if ( N==11 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11);
-        else if ( N==12 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12);
-        else if ( N==13 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13);
-        else if ( N==14 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14);
-        else if ( N==15 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15);
-        else if ( N==16 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16);
-        else if ( N==17 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17);
-        else if ( N==18 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18);
-        else if ( N==19 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19);
-        else if ( N==20 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20);
-        else if ( N==21 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21);
-        else if ( N==22 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22);
-        else if ( N==23 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23);
-        else if ( N==24 )
-            this->d_result = reinterpret_cast<return_type(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23,arg24)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24);
-        else
-            throw std::exception(); // Internal error
-        ThreadPool::WorkItem::d_state = 2;
+    WorkItemFull( void ( *routine2 )( Args... ), Args... ts )  // stores the routine and its arguments
+        : ThreadPool::WorkItemRet<void>(), routine( routine2 ), args( ts... )
+    {
     }
+    virtual void run() override  // executes the work item: calls routine with the stored args
+    {
+        ( apply )( routine, args );  // parenthesized name disables ADL, so C++17 std::apply is not chosen
+    }
+    virtual bool has_result() const override { return false; }  // a void routine has no result
     virtual ~WorkItemFull() {}
 };
-template < 
-typename  arg1,  typename  arg2,  typename  arg3,  typename  arg4,  typename  arg5,  
-typename  arg6,  typename  arg7,  typename  arg8,  typename  arg9,  typename arg10,  
-typename arg11,  typename arg12,  typename arg13,  typename arg14,  typename arg15,  
-typename arg16,  typename arg17,  typename arg18,  typename arg19,  typename arg20,
-typename arg21,  typename arg22,  typename arg23,  typename arg24 > 
-class WorkItemFull<void,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23,arg24>: 
-    public ThreadPool::WorkItemRet<void> 
+template <class Ret, class... Args>
+class WorkItemFull : public ThreadPool::WorkItemRet<Ret>
 {
 private:
-    int N;
-    void (*routine)();
-    arg1 x1;
-    arg2 x2;
-    arg3 x3;
-    arg4 x4;
-    arg5 x5;
-    arg6 x6;
-    arg7 x7;
-    arg8 x8;
-    arg9 x9;
-    arg10 x10;
-    arg11 x11;
-    arg12 x12;
-    arg13 x13;
-    arg14 x14;
-    arg15 x15;
-    arg16 x16;
-    arg17 x17;
-    arg18 x18;
-    arg19 x19;
-    arg20 x20;
-    arg21 x21;
-    arg22 x22;
-    arg23 x23;
-    arg24 x24;
+    Ret ( *routine )( Args... );
+    std::tuple<Args...> args;
+    WorkItemFull();
+
 public:
-    WorkItemFull( void (*routine2)() ):
-        ThreadPool::WorkItemRet<void>(), N(0),
-        routine(reinterpret_cast<void(*)()>(routine2)) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1), arg1 y1):
-        ThreadPool::WorkItemRet<void>(), N(1),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2), arg1 y1, arg2 y2 ):
-        ThreadPool::WorkItemRet<void>(), N(2),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3), arg1 y1, arg2 y2, arg3 y3 ):
-        ThreadPool::WorkItemRet<void>(), N(3),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3){ ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4), arg1 y1, arg2 y2, arg3 y3, arg4 y4 ):
-        ThreadPool::WorkItemRet<void>(), N(4),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5 ):
-        ThreadPool::WorkItemRet<void>(), N(5),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6 ):
-        ThreadPool::WorkItemRet<void>(), N(6),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7 ):
-        ThreadPool::WorkItemRet<void>(), N(7),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8 ):
-        ThreadPool::WorkItemRet<void>(), N(8),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9 ):
-        ThreadPool::WorkItemRet<void>(), N(9),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10 ):
-        ThreadPool::WorkItemRet<void>(), N(10),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11 ):
-        ThreadPool::WorkItemRet<void>(), N(11),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12 ):
-        ThreadPool::WorkItemRet<void>(), N(12),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13 ):
-        ThreadPool::WorkItemRet<void>(), N(13),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14 ):
-        ThreadPool::WorkItemRet<void>(), N(14),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15 ):
-        ThreadPool::WorkItemRet<void>(), N(15),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16 ):
-        ThreadPool::WorkItemRet<void>(), N(16),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17 ):
-        ThreadPool::WorkItemRet<void>(), N(17),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18 ):
-        ThreadPool::WorkItemRet<void>(), N(18),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19 ):
-        ThreadPool::WorkItemRet<void>(), N(19),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19, arg20 y20 ):
-        ThreadPool::WorkItemRet<void>(), N(20),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19), x20(y20) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19, arg20 y20, arg21 y21 ):
-        ThreadPool::WorkItemRet<void>(), N(21),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19), x20(y20), x21(y21) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19, arg20 y20, arg21 y21, arg22 y22 ):
-        ThreadPool::WorkItemRet<void>(), N(22),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19), x20(y20), x21(y21), x22(y22) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19, arg20 y20, arg21 y21, arg22 y22, arg23 y23 ):
-        ThreadPool::WorkItemRet<void>(), N(23),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19), x20(y20), x21(y21), x22(y22), x23(y23) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    WorkItemFull( void (*routine2)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23,arg24), arg1 y1, arg2 y2, arg3 y3, arg4 y4, arg5 y5, arg6 y6, arg7 y7, arg8 y8, arg9 y9, arg10 y10, arg11 y11, arg12 y12, arg13 y13, arg14 y14, arg15 y15, arg16 y16, arg17 y17, arg18 y18, arg19 y19, arg20 y20, arg21 y21, arg22 y22, arg23 y23, arg24 y24 ):
-        ThreadPool::WorkItemRet<void>(), N(24),
-        routine(reinterpret_cast<void(*)()>(routine2)),  x1(y1),  x2(y2),  x3(y3),  x4(y4),  x5(y5),  x6(y6),  x7(y7),  x8(y8),  x9(y9), x10(y10), x11(y11), x12(y12), x13(y13), x14(y14), x15(y15), x16(y16), x17(y17), x18(y18), x19(y19), x20(y20), x21(y21), x22(y22), x23(y23), x24(y24) {
-        ThreadPool::WorkItem::d_state=0; ThreadPool::WorkItem::d_has_result=true; }
-    void run() {
-        ThreadPool::WorkItem::d_state = 1;
-        if ( N==0 )
-            reinterpret_cast<void(*)()>(routine)();
-        else if ( N==1 )
-            reinterpret_cast<void(*)(arg1)>(routine)(x1);
-        else if ( N==2 )
-            reinterpret_cast<void(*)(arg1,arg2)>(routine)(x1,x2);
-        else if ( N==3 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3)>(routine)(x1,x2,x3);
-        else if ( N==4 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4)>(routine)(x1,x2,x3,x4);
-        else if ( N==5 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5)>(routine)(x1,x2,x3,x4,x5);
-        else if ( N==6 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6)>(routine)(x1,x2,x3,x4,x5,x6);
-        else if ( N==7 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7)>(routine)(x1,x2,x3,x4,x5,x6,x7);
-        else if ( N==8 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8);
-        else if ( N==9 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9);
-        else if ( N==10 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10);
-        else if ( N==11 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11);
-        else if ( N==12 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12);
-        else if ( N==13 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13);
-        else if ( N==14 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14);
-        else if ( N==15 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15);
-        else if ( N==16 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16);
-        else if ( N==17 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17);
-        else if ( N==18 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18);
-        else if ( N==19 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19);
-        else if ( N==20 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20);
-        else if ( N==21 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21);
-        else if ( N==22 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22);
-        else if ( N==23 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23);
-        else if ( N==24 )
-            reinterpret_cast<void(*)(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23,arg24)>(routine)(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24);
-        else
-            throw std::exception(); // Internal error
-        ThreadPool::WorkItem::d_state = 2;
+    WorkItemFull( Ret ( *routine2 )( Args... ), Args... ts )
+        : ThreadPool::WorkItemRet<Ret>(), routine( routine2 ), args( ts... )
+    {
     }
+    virtual void run() override
+    {
+        this->d_result = apply( routine, args );
+    }
+    virtual bool has_result() const override { return true; }
     virtual ~WorkItemFull() {}
 };
 
 
-
-// Function to get the returned function value
-template <> 
-inline bool ThreadPool::getFunctionRet<bool>( const ThreadPool::thread_id_t& id )  const
-{    
-    WorkItemRet<bool> *work = dynamic_cast<WorkItemRet<bool>*>(getFinishedWorkItem(id));
-    bool rtn = false;
-    if ( work != NULL )
-        rtn = work->get_results();
-    return rtn;
-}
-template <> 
-inline char ThreadPool::getFunctionRet<char>( const ThreadPool::thread_id_t& id )  const
-{    
-    WorkItemRet<char> *work = dynamic_cast<WorkItemRet<char>*>(getFinishedWorkItem(id));
-    char rtn = 0;
-    if ( work != NULL )
-        rtn = work->get_results();
-    return rtn;
-}
-template <> 
-inline int ThreadPool::getFunctionRet<int>( const ThreadPool::thread_id_t& id )  const
-{    
-    WorkItemRet<int> *work = dynamic_cast<WorkItemRet<int>*>(getFinishedWorkItem(id));
-    int rtn = 0;
-    if ( work != NULL )
-        rtn = work->get_results();
-    return rtn;
-}
-template <> 
-inline float ThreadPool::getFunctionRet<float>( const ThreadPool::thread_id_t& id )  const
-{    
-    WorkItemRet<float> *work = dynamic_cast<WorkItemRet<float>*>(getFinishedWorkItem(id));
-    float rtn = 0;
-    if ( work != NULL )
-        rtn = work->get_results();
-    return rtn;
-}
-template <> 
-inline double ThreadPool::getFunctionRet<double>( const ThreadPool::thread_id_t& id )  const
-{    
-    WorkItemRet<double> *work = dynamic_cast<WorkItemRet<double>*>(getFinishedWorkItem(id));
-    double rtn = 0;
-    if ( work != NULL )
-        rtn = work->get_results();
-    return rtn;
-}
-template <class return_type> 
-inline return_type ThreadPool::getFunctionRet( const ThreadPool::thread_id_t& id )  const
-{
-    WorkItemRet<return_type> *work = dynamic_cast<WorkItemRet<return_type>*>(getFinishedWorkItem(id));
-    return_type rtn;
-    if ( work != NULL )
-        rtn = work->get_results();
-    return rtn;
-}
-
-
-
-// Functions create work items
-template <class return_type> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)()) 
-{
-    WorkItemFull<return_type> *work;
-    work = new WorkItemFull<return_type>(routine);
-    return work;
-}
-template <class return_type> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(), void*) 
-{
-    WorkItemFull<return_type> *work;
-    work = new WorkItemFull<return_type>(routine);
-    return work;
-}
-template <class return_type, class type1> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1), type1 arg1) 
-{
-    WorkItemFull<return_type,type1> *work;
-    work = new WorkItemFull<return_type,type1>(routine,arg1);
-    return work;
-}
-template <class return_type, class type1, class type2> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2), type1 arg1, type2 arg2) 
-{
-    WorkItemFull<return_type,type1,type2> *work;
-    work = new WorkItemFull<return_type,type1,type2>(routine,arg1,arg2);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3), 
-    type1 arg1, type2 arg2, type3 arg3) 
-{
-    WorkItemFull<return_type,type1,type2,type3> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3>(routine,arg1,arg2,arg3);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4>(routine,arg1,arg2,arg3,arg4);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5>(routine,arg1,arg2,arg3,arg4,arg5);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6>(routine,arg1,arg2,arg3,arg4,arg5,arg6);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19, class type20> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, type20 arg20) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19, class type20, class type21> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, type20 arg20, type21 arg21) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19, class type20, class type21, class type22> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, type20 arg20, type21 arg21, type22 arg22) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19, class type20, class type21, class type22, class type23> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22,type23), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, type20 arg20, type21 arg21, type22 arg22, type23 arg23) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22,type23> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22,type23>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23);
-    return work;
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19, class type20, class type21, class type22, class type23, class type24> 
-inline ThreadPool::WorkItem* ThreadPool_create_work(return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22,type23,type24), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, type20 arg20, type21 arg21, type22 arg22, type23 arg23, type24 arg24) 
-{
-    WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22,type23,type24> *work;
-    work = new WorkItemFull<return_type,type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22,type23,type24>(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23,arg24);
-    return work;
-}
-
-
 // Functions to add work to the thread pool
-template <class return_type> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(), int priority) 
+template <class Ret, class... Ts> // Ret: return type of routine; Ts: its parameter types
+inline ThreadPool::thread_id_t ThreadPool_add_work(
+    ThreadPool *tpool, int priority, Ret ( *routine )( Ts... ), Ts... ts )
 {
-    ThreadPool::WorkItem* work = ThreadPool_create_work<return_type>(routine);
+    ThreadPool::WorkItem *work = new WorkItemFull<Ret, Ts...>( routine, ts... ); // NOTE(review): raw new; presumably tpool takes ownership - confirm
     return tpool->add_work( work, priority );
 }
-template <class return_type> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(), void*, int priority) 
+template <class Ret>
+inline ThreadPool::thread_id_t ThreadPool_add_work(
+    ThreadPool *tpool, int priority, Ret ( *routine )(), void * ) // the trailing void* is unnamed and never read
 {
-    ThreadPool::WorkItem* work = ThreadPool_create_work<return_type>(routine);
+    ThreadPool::WorkItem *work = new WorkItemFull<Ret>( routine ); // work item for a routine that takes no arguments
     return tpool->add_work( work, priority );
 }
-template <class return_type, class type1> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1), type1 arg1, int priority) 
+template <class Ret, class... Args> // Ret / Args: return and parameter types of routine
+inline ThreadPool::WorkItem* ThreadPool::createWork( Ret( *routine )( Args... ), Args... args )
 {
-    ThreadPool::WorkItem* work = ThreadPool_create_work<return_type,type1>(routine,arg1);
-    return tpool->add_work( work, priority );
+    return new WorkItemFull<Ret, Args...>( routine, args... ); // allocated with new; the caller receives the raw pointer
 }
-template <class return_type, class type1, class type2> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2), type1 arg1, type2 arg2, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3), 
-    type1 arg1, type2 arg2, type3 arg3, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19, class type20> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, type20 arg20, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19, class type20, class type21> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, type20 arg20, type21 arg21, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19, class type20, class type21, class type22> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, type20 arg20, type21 arg21, type22 arg22, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19, class type20, class type21, class type22, class type23> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22,type23), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, type20 arg20, type21 arg21, type22 arg22, type23 arg23, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23);
-    return tpool->add_work( work, priority );
-}
-template <class return_type, class type1, class type2, class type3, class type4, class type5, class type6, class type7, class type8, class type9, class type10, class type11, class type12, class type13, class type14, class type15, class type16, class type17, class type18, class type19, class type20, class type21, class type22, class type23, class type24> 
-inline ThreadPool::thread_id_t ThreadPool_add_work(ThreadPool* tpool, return_type (*routine)(type1,type2,type3,type4,type5,type6,type7,type8,type9,type10,type11,type12,type13,type14,type15,type16,type17,type18,type19,type20,type21,type22,type23,type24), 
-    type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6, type7 arg7, type8 arg8, type9 arg9, type10 arg10, type11 arg11, type12 arg12, type13 arg13, type14 arg14, type15 arg15, type16 arg16, type17 arg17, type18 arg18, type19 arg19, type20 arg20, type21 arg21, type22 arg22, type23 arg23, type24 arg24, int priority) 
-{
-    ThreadPool::WorkItem* work = ThreadPool_create_work(routine,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17,arg18,arg19,arg20,arg21,arg22,arg23,arg24);
-    return tpool->add_work( work, priority );
-}
-
-
 
 
 /******************************************************************
-* Inline function to return a unique id of the current thread     *
+* Function to get the returned function value                     *
 ******************************************************************/
-inline size_t ThreadPool::getThreadId()
+template <class T> inline constexpr T zeroConstructor(); // primary template, declared ahead of its explicit specializations
+template<> inline constexpr bool zeroConstructor<bool>( ) { return false; }
+template<> inline constexpr char zeroConstructor<char>( ) { return 0; }
+template<> inline constexpr unsigned char zeroConstructor<unsigned char>( ) { return 0; }
+template<> inline constexpr int zeroConstructor<int>( ) { return 0; }
+template<> inline constexpr unsigned int zeroConstructor<unsigned int>( ) { return 0; }
+template<> inline constexpr long zeroConstructor<long>( ) { return 0; }
+template<> inline constexpr unsigned long zeroConstructor<unsigned long>( ) { return 0; }
+template<> inline constexpr float zeroConstructor<float>( ) { return 0; }
+template<> inline constexpr double zeroConstructor<double>( ) { return 0; }
+template <class T> inline constexpr T zeroConstructor() { return T(); } // every other type: a value-initialized T
+template <class Ret> // Ret: the result type the caller expects from the work item
+inline Ret ThreadPool::getFunctionRet( const ThreadPool::thread_id_t &id ) const
 {
-    #ifdef USE_WINDOWS
-        DWORD tmp_thread_id = GetCurrentThreadId();
-        size_t thread_id = (size_t) tmp_thread_id;
-    #elif defined(USE_LINUX) || defined(USE_MAC)
-        pthread_t tmp_thread_id = pthread_self();
-        size_t thread_id = (size_t) tmp_thread_id;
-    #else 
-        #error Not defined for this OS
-    #endif
-    return thread_id;
+    WorkItemRet<Ret> *work = dynamic_cast<WorkItemRet<Ret>*>( getFinishedWorkItem( id ) ); // null if the lookup yields null or the item is not a WorkItemRet<Ret>
+    return work == nullptr ? zeroConstructor<Ret>() : work->get_results(); // no typed result available: return zeroConstructor<Ret>() instead
 }
 
 
@@ -890,92 +183,101 @@ inline size_t ThreadPool::getThreadId()
 inline int ThreadPool::wait( ThreadPool::thread_id_t id ) const
 {
     bool finished;
-    return wait_some( 1, &id, 1, &finished );
+    wait_some( 1, &id, 1, &finished ); // one id, N_wait = 1; the value returned by wait_some is discarded
+    return 0; // this wrapper always reports 0
 }
-inline int ThreadPool::wait_any(size_t N_work, const ThreadPool::thread_id_t *ids) 
+inline int ThreadPool::wait_any( size_t N_work, const ThreadPool::thread_id_t *ids )
 {
-    bool* finished = new bool[N_work];
-    int error = wait_some( N_work, ids, 1, finished );
-    if ( error!=0 ) {
-        delete [] finished;
-        return error;
-    }
+    auto finished = new bool[N_work]; // one flag per id; presumably filled in by wait_some - confirm
+    wait_some( N_work, ids, 1, finished ); // N_wait = 1; the returned value is not inspected
     int index = -1;
-    for (size_t i=0; i<N_work; i++) {
+    for ( size_t i = 0; i < N_work; i++ ) {
         if ( finished[i] ) {
-            index = static_cast<int>(i);
+            index = static_cast<int>( i );
             break;
         }
     }
+    delete[] finished;
+    return index; // position of the first set flag, or -1 when no flag is set
+}
+// Returns the index of the first entry of ids flagged as finished after wait_some is
+// asked for one completion, -1 if no flag is set, and 0 when ids is empty.
+inline int ThreadPool::wait_any( const std::vector<thread_id_t> &ids ) const
+{
+    const size_t N = ids.size();
+    if ( N == 0 )
+        return 0; // NOTE(review): indistinguishable from "entry 0 finished"
+    bool *done = new bool[N];
+    wait_some( N, ids.data(), 1, done );
+    // Scan for the first set flag; pos == N means none was found.
+    size_t pos = 0;
+    while ( pos < N && !done[pos] )
+        ++pos;
+    delete[] done;
+    return pos < N ? static_cast<int>( pos ) : -1;
+}
+inline int ThreadPool::wait_all( size_t N_work, const ThreadPool::thread_id_t *ids ) const
+{
+    if ( N_work > 0 ) { // an empty list needs no call to wait_some
+        bool *done = new bool[N_work]; // scratch flags required by wait_some; not inspected here
+        wait_some( N_work, ids, N_work, done ); // N_wait equals the number of ids
+        delete[] done;
+    }
+    return 0; // this overload always reports 0
+}
+inline int ThreadPool::wait_all( const std::vector<thread_id_t> &ids ) const
+{
+    if ( !ids.empty() ) { // an empty list needs no call to wait_some
+        bool *done = new bool[ids.size()]; // scratch flags required by wait_some; not inspected here
+        wait_some( ids.size(), ids.data(), ids.size(), done ); // N_wait equals the number of ids
+        delete[] done;
+    }
+    return 0; // this overload always reports 0
+}
+inline std::vector<int> ThreadPool::wait_some( int N_wait, const std::vector<thread_id_t> &ids ) const
+{
+    // Returns the positions in ids whose flag was set by the pointer overload of wait_some.
+    auto finished = new bool[ids.size()];
+    int N_finished = wait_some( ids.size(), ids.data(), N_wait, finished );
+    std::vector<int> index( N_finished, -1 );
+    // Stop once index is full so a flag count above N_finished cannot write out of bounds.
+    for ( size_t i = 0, j = 0; i < ids.size() && j < index.size(); i++ ) {
+        if ( finished[i] )
+            index[j++] = static_cast<int>( i ); // explicit cast instead of implicit size_t -> int narrowing
+    }
     delete [] finished;
     return index;
 }
-inline int ThreadPool::wait_any(const std::vector<thread_id_t>& ids) const
-{
-    if ( ids.empty() )
-        return 0;
-    bool* finished = new bool[ids.size()];
-    int error = wait_some( ids.size(), &ids[0], 1, finished );
-    if ( error!=0 ) {
-        delete [] finished;
-        return error;
-    }
-    int index = -1;
-    for (size_t i=0; i<ids.size(); i++) {
-        if ( finished[i] ) {
-            index = static_cast<int>(i);
-            break;
-        }
-    }
-    delete [] finished;
-    return index;
-}
-inline int ThreadPool::wait_all(size_t N_work, const ThreadPool::thread_id_t *ids) const
-{
-    bool* finished = new bool[N_work];
-    int error = wait_some( N_work, ids, N_work, finished );
-    delete [] finished;
-    return error;
-}
-inline int ThreadPool::wait_all(const std::vector<thread_id_t>& ids) const
-{
-    if ( ids.empty() )
-        return 0;
-    bool* finished = new bool[ids.size()];
-    int error = wait_some( ids.size(), &ids[0], ids.size(), finished );
-    delete [] finished;
-    return error;
-}
 
 
 /******************************************************************
 * Functions to add work items.                                    *
 ******************************************************************/
-inline ThreadPool::thread_id_t ThreadPool::add_work( WorkItem *work, int priority ) 
+inline ThreadPool::thread_id_t ThreadPool::add_work( WorkItem *work, int priority ) // single-item wrapper: forwards to the array overload with a count of 1
 {
     ThreadPool::thread_id_t id;
     add_work( 1, &work, &priority, &id );
     return id;
 }
-inline std::vector<ThreadPool::thread_id_t> ThreadPool::add_work( 
-    const std::vector<ThreadPool::WorkItem*>& work, const std::vector<int>& priority ) 
+inline std::vector<ThreadPool::thread_id_t> ThreadPool::add_work( // vector wrapper over the array overload
+    const std::vector<ThreadPool::WorkItem *> &work, const std::vector<int> &priority )
 {
     size_t N = work.size();
-    if ( N==0 )
+    if ( N == 0 ) // no work items: return an empty id list
         return std::vector<ThreadPool::thread_id_t>();
-    if ( priority.size()!=N && !priority.empty() )
-        throw std::logic_error("size of work and priority do not match");
-    const int* priority2 = NULL;
+    if ( priority.size() != N && !priority.empty() ) // priority must be empty or hold one entry per work item
+        throw std::logic_error( "size of work and priority do not match" );
+    const int *priority2 = nullptr; // points at the caller's priorities or at a temporary all-zero array
     if ( priority.empty() ) {
         priority2 = new int[N];
-        memset( const_cast<int*>(priority2), 0, N*sizeof(int) );
+        memset( const_cast<int *>( priority2 ), 0, N * sizeof( int ) ); // empty priority list: every item gets priority 0
     } else {
         priority2 = &priority[0];
     }
-    std::vector<ThreadPool::thread_id_t> ids(N);
-    add_work( N, const_cast<ThreadPool::WorkItem**>(&work[0]), priority2, &ids[0] );
+    std::vector<ThreadPool::thread_id_t> ids( N ); // receives one id per submitted work item
+    add_work( N, const_cast<ThreadPool::WorkItem **>( &work[0] ), priority2, &ids[0] ); // NOTE(review): the temporary array is not freed if this call throws
     if ( priority.empty() )
-        delete [] priority2;
+        delete[] priority2; // release the temporary array allocated above
     return ids;
 }
 
@@ -983,181 +285,225 @@ inline std::vector<ThreadPool::thread_id_t> ThreadPool::add_work(
 /******************************************************************
-* Class functions to for the thread id                            *
+* Class functions for the thread id                               *
 ******************************************************************/
-inline ThreadPool::thread_id_t::thread_id_t( ):
-    d_id(0x0FFFFFFFFFFFFFFF), d_count(NULL), d_work(NULL)
-{ }
-inline ThreadPool::thread_id_t::~thread_id_t( )
-{ 
-    reset();
+inline ThreadPool::thread_id_t::thread_id_t()
+    : d_id( nullThreadID ), d_count( nullptr ), d_work( nullptr ) // null id, no work attached
+{
 }
-inline ThreadPool::thread_id_t::thread_id_t( const thread_id_t& rhs ):
-    d_id(rhs.d_id), d_count(rhs.d_count), d_work(rhs.d_work)
-{ 
-    if ( d_count != NULL )
-        AtomicOperations::atomic_increment(d_count);
+inline ThreadPool::thread_id_t::~thread_id_t() { reset(); } // drop this handle's reference
+inline ThreadPool::thread_id_t::thread_id_t( volatile ThreadPool::thread_id_t &&rhs )
+    : d_id( rhs.d_id ), d_count( rhs.d_count ), d_work( rhs.d_work )
+{
+    // The reference held by rhs is taken over as-is (the counter is not
+    // touched); rhs is then put back into the null state
+    rhs.d_id    = nullThreadID;
+    rhs.d_work  = nullptr;
+    rhs.d_count = nullptr;
 }
-inline ThreadPool::thread_id_t& ThreadPool::thread_id_t::operator=( const ThreadPool::thread_id_t& rhs ) volatile
-{ 
-    if (this == &rhs) // protect against invalid self-assignment
-        return const_cast<ThreadPool::thread_id_t&>(*this);
+inline ThreadPool::thread_id_t &ThreadPool::thread_id_t::operator=(
+    const ThreadPool::thread_id_t &rhs ) volatile
+{
+    if ( this == &rhs ) // protect against invalid self-assignment
+        return const_cast<ThreadPool::thread_id_t &>( *this );
     this->reset();
-    d_id = rhs.d_id;
+    d_id    = rhs.d_id;
     d_count = rhs.d_count;
-    d_work = rhs.d_work;
+    d_work  = rhs.d_work;
+    if ( d_count != nullptr )
+        AtomicOperations::atomic_increment( d_count ); // this handle now shares rhs's work
+    return const_cast<ThreadPool::thread_id_t &>( *this ); // returned reference is non-volatile
+}
+inline ThreadPool::thread_id_t &ThreadPool::thread_id_t::operator=(
+    volatile ThreadPool::thread_id_t &&rhs ) volatile
+{
+    std::swap( d_id, rhs.d_id ); // move by exchange: rhs ends up with our previous contents
+    std::swap( d_work, rhs.d_work );
+    std::swap( d_count, rhs.d_count ); // the counter itself is not modified
+    return const_cast<ThreadPool::thread_id_t &>( *this );
+}
+inline ThreadPool::thread_id_t::thread_id_t( const volatile ThreadPool::thread_id_t &rhs )
+    : d_id( rhs.d_id ), d_count( rhs.d_count ), d_work( rhs.d_work )
+{
     if ( d_count != NULL )
-        AtomicOperations::atomic_increment(d_count);
-    return const_cast<ThreadPool::thread_id_t&>(*this);
+        AtomicOperations::atomic_increment( d_count ); // the copy adds one reference
 }
 #ifndef USE_WINDOWS
-inline ThreadPool::thread_id_t::thread_id_t( const volatile ThreadPool::thread_id_t& rhs ):
-    d_id(rhs.d_id), d_count(rhs.d_count), d_work(rhs.d_work)
-{ 
-    if ( d_count != NULL )
-        AtomicOperations::atomic_increment(d_count);
+inline ThreadPool::thread_id_t::thread_id_t( const thread_id_t &rhs )
+    : d_id( rhs.d_id ), d_count( rhs.d_count ), d_work( rhs.d_work )
+{
+    if ( d_count != nullptr ) // a null counter means rhs holds no work
+        AtomicOperations::atomic_increment( d_count ); // the copy adds one reference
 }
-inline ThreadPool::thread_id_t& ThreadPool::thread_id_t::operator=( const ThreadPool::thread_id_t& rhs )
-{ 
-    if (this == &rhs) // protect against invalid self-assignment
-        return const_cast<ThreadPool::thread_id_t&>(*this);
-    this->reset();
-    d_id = rhs.d_id;
-    d_count = rhs.d_count;
-    d_work = rhs.d_work;
-    if ( d_count != NULL )
-        AtomicOperations::atomic_increment(d_count);
-    return const_cast<ThreadPool::thread_id_t&>(*this);
+inline ThreadPool::thread_id_t &ThreadPool::thread_id_t::operator=( ThreadPool::thread_id_t &&rhs )
+{
+    std::swap( d_count, rhs.d_count ); // move by exchange: rhs receives our previous contents
+    std::swap( d_work, rhs.d_work );
+    std::swap( d_id, rhs.d_id );
+    return *this;
 }
-inline ThreadPool::thread_id_t& ThreadPool::thread_id_t::operator=( const volatile ThreadPool::thread_id_t& rhs )
-{ 
-    if (this == &rhs) // protect against invalid self-assignment
-        return const_cast<ThreadPool::thread_id_t&>(*this);
+inline ThreadPool::thread_id_t &ThreadPool::thread_id_t::operator=(
+    const ThreadPool::thread_id_t &rhs )
+{
+    if ( this == &rhs ) // protect against invalid self-assignment
+        return const_cast<ThreadPool::thread_id_t &>( *this );
     this->reset();
-    d_id = rhs.d_id;
+    d_id    = rhs.d_id;
     d_count = rhs.d_count;
-    d_work = rhs.d_work;
-    if ( d_count != NULL )
-        AtomicOperations::atomic_increment(d_count);
-    return const_cast<ThreadPool::thread_id_t&>(*this);
+    d_work  = rhs.d_work;
+    if ( d_count != nullptr )
+        AtomicOperations::atomic_increment( d_count ); // this handle now shares rhs's work
+    return const_cast<ThreadPool::thread_id_t &>( *this );
 }
-inline ThreadPool::thread_id_t& ThreadPool::thread_id_t::operator=( const volatile ThreadPool::thread_id_t& rhs ) volatile
-{ 
-    if (this == &rhs) // protect against invalid self-assignment
-        return const_cast<ThreadPool::thread_id_t&>(*this);
+inline ThreadPool::thread_id_t &ThreadPool::thread_id_t::operator=(
+    const volatile ThreadPool::thread_id_t &rhs )
+{
+    if ( this == &rhs ) // protect against invalid self-assignment
+        return const_cast<ThreadPool::thread_id_t &>( *this );
     this->reset();
-    d_id = rhs.d_id;
+    d_id    = rhs.d_id;
     d_count = rhs.d_count;
-    d_work = rhs.d_work;
-    if ( d_count != NULL )
-        AtomicOperations::atomic_increment(d_count);
-    return const_cast<ThreadPool::thread_id_t&>(*this);
+    d_work  = rhs.d_work;
+    if ( d_count != nullptr )
+        AtomicOperations::atomic_increment( d_count ); // this handle now shares rhs's work
+    return const_cast<ThreadPool::thread_id_t &>( *this );
+}
+inline ThreadPool::thread_id_t &ThreadPool::thread_id_t::operator=(
+    const volatile ThreadPool::thread_id_t &rhs ) volatile
+{
+    if ( this == &rhs ) // protect against invalid self-assignment
+        return const_cast<ThreadPool::thread_id_t &>( *this );
+    this->reset(); // release whatever this handle referenced before
+    d_id    = rhs.d_id;
+    d_count = rhs.d_count;
+    d_work  = rhs.d_work;
+    if ( d_count != nullptr )
+        AtomicOperations::atomic_increment( d_count ); // this handle now shares rhs's work
+    return const_cast<ThreadPool::thread_id_t &>( *this ); // returned reference is non-volatile
 }
 #endif
-inline void ThreadPool::thread_id_t::reset() volatile 
-{ 
-    if ( d_count != NULL ) {
-        int count = AtomicOperations::atomic_decrement(d_count);
+inline void ThreadPool::thread_id_t::reset() volatile
+{
+    if ( d_count != nullptr ) { // only handles with a counter hold a reference
+        int count = AtomicOperations::atomic_decrement( d_count ); // release our reference
         if ( count == 0 ) {
-            delete d_count;
-            WorkItem* tmp = reinterpret_cast<ThreadPool::WorkItem*>(d_work);
+            WorkItem *tmp = reinterpret_cast<ThreadPool::WorkItem *>( d_work );
             delete tmp;
         }
     }
-    d_id = 0x0FFFFFFFFFFFFFFF;
-    d_count = NULL;
-    d_work = NULL;
+    d_id    = nullThreadID; // back to the null state
+    d_count = nullptr;
+    d_work  = nullptr;
 }
 inline void ThreadPool::thread_id_t::reset()
-{ 
-    if ( d_count != NULL ) {
-        int count = AtomicOperations::atomic_decrement(d_count);
+{
+    if ( d_count != nullptr ) { // only handles with a counter hold a reference
+        int count = AtomicOperations::atomic_decrement( d_count ); // release our reference
         if ( count == 0 ) {
-            delete d_count;
-            WorkItem* tmp = reinterpret_cast<ThreadPool::WorkItem*>(d_work);
+            WorkItem *tmp = reinterpret_cast<ThreadPool::WorkItem *>( d_work );
             delete tmp;
         }
     }
-    d_id = 0x0FFFFFFFFFFFFFFF;
-    d_count = NULL;
-    d_work = NULL;
+    d_id    = nullThreadID; // back to the null state
+    d_count = nullptr;
+    d_work  = nullptr;
 }
-inline void ThreadPool::thread_id_t::reset( int priority, size_t local_id, void* work )
+inline uint64_t ThreadPool::thread_id_t::createId( int priority, uint64_t local_id )
 {
-    if ( d_count != NULL ) {
-        int count = AtomicOperations::atomic_decrement(d_count);
+    // Id layout: the top 8 bits hold priority + 128, the bits below hold the local id
+    if ( priority < -127 || priority > 127 )
+        throw std::logic_error( "priority limited to +- 127" );
+    if ( local_id > maxThreadID )
+        throw std::logic_error( "local id >= 2^56" );
+    // priority + 128 is in [1,255] here, so convert straight to unsigned char
+    // (well defined).  Going through plain char is implementation-defined for
+    // values above 127 when char is signed, and no separate "|= 0x80" is
+    // needed for non-negative priorities since that bit is already set
+    const auto biased = static_cast<unsigned char>( priority + 128 );
+    return ( static_cast<uint64_t>( biased ) << 56 ) + local_id;
+}
+inline void ThreadPool::thread_id_t::reset( int priority, uint64_t local_id, void *work )
+{
+    // Build (and validate) the new id before releasing anything: createId may
+    // throw, and the handle must not be left pointing at released work
+    const uint64_t id = createId( priority, local_id );
+    if ( d_count != nullptr ) { // drop our reference to the current work item
+        int count = AtomicOperations::atomic_decrement( d_count );
         if ( count == 0 ) {
-            delete d_count;
-            WorkItem* tmp = reinterpret_cast<ThreadPool::WorkItem*>(d_work);
+            WorkItem *tmp = reinterpret_cast<ThreadPool::WorkItem *>( d_work );
             delete tmp;
         }
     }
     // Create the id
-    if ( sizeof(uint64)!=8 ) 
-        throw std::logic_error("unsigned long long int must be 64 bits");        
-    if ( priority<-127||priority>127 )
-        throw std::logic_error("priority limited to +- 127");        
-    if ( local_id > 0x00FFFFFFFFFFFFFF )
-        throw std::logic_error("local id >= 2^56");
-    char tmp1 = static_cast<char>(priority+128);
-    unsigned char tmp2 = static_cast<unsigned char>(tmp1);
-    if ( priority >= 0 )
-        tmp2 |= 0x80;
-    d_id = tmp2;
-    d_id = (d_id<<56) + local_id;
-    // Create the counter
-    d_count = new AtomicOperations::int32_atomic;
-    *d_count = 1;
-    // Initialize the remaining data
-    d_work = work;
+    d_id = id;
+    // Attach the new work (may be null); its reference counter is a member of the WorkItem
+    d_work  = work;
+    d_count = work != nullptr ? &( reinterpret_cast<WorkItem *>( work )->d_count ) : nullptr;
+    if ( d_count != nullptr )
+        *d_count = 1;
 }
-inline size_t ThreadPool::thread_id_t::getLocalID(  ) const
+inline uint64_t ThreadPool::thread_id_t::getLocalID() const
 {
-    if ( d_id == 0x0FFFFFFFFFFFFFFF )
-        return ~((size_t)0);
-    unsigned long long int tmp = d_id&0x00FFFFFFFFFFFFFF;
-    return static_cast<size_t>(tmp);
+    if ( d_id == nullThreadID )
+        return ~( (uint64_t) 0 ); // null id: all bits set
+    // Keep the full 64 bits: a cast to size_t would truncate on 32-bit platforms
+    return d_id & 0x00FFFFFFFFFFFFFF; // low 56 bits are the local id
 }
-inline int ThreadPool::thread_id_t::getPriority(  ) const
+inline int ThreadPool::thread_id_t::getPriority() const
 {
-    if ( d_id == 0x0FFFFFFFFFFFFFFF )
+    if ( d_id == nullThreadID ) // null id: report the sentinel below
         return -128;
-    unsigned long long int tmp = d_id>>56;
-    return static_cast<int>(tmp)-128;
+    uint64_t tmp = d_id >> 56; // top 8 bits store priority + 128 (see createId)
+    return static_cast<int>( tmp ) - 128;
 }
-inline bool ThreadPool::thread_id_t::finished( ) const
+inline void ThreadPool::thread_id_t::setPriority( int priority )
 {
-    return d_id==0x0FFFFFFFFFFFFFFF ? true:reinterpret_cast<WorkItem*>(d_work)->d_state==2;
+    if ( d_id == nullThreadID ) // a null id is left untouched
+        return;
+    d_id = createId( priority, getLocalID() ); // same local id, new priority; may throw
+}
+inline bool ThreadPool::thread_id_t::started() const
+{ // a null id counts as started; otherwise the work item's state must be at least 2
+    return d_id == nullThreadID || reinterpret_cast<WorkItem *>( d_work )->d_state >= 2;
+}
+inline bool ThreadPool::thread_id_t::finished() const
+{ // a null id counts as finished; otherwise the work item's state must equal 3
+    return d_id == nullThreadID || reinterpret_cast<WorkItem *>( d_work )->d_state == 3;
+}
+inline bool ThreadPool::thread_id_t::ready() const
+{
+    if ( isNull() ) // a null id has no ids to wait for
+        return true;
+    const auto item = work();
+    for ( size_t k = 0; k < item->d_N_ids; ++k )
+        if ( !item->d_ids[k].finished() )
+            return false; // an id listed in the work item has not finished yet
+    return true;
 }
 
 
 /******************************************************************
 * This function checks if the id is valid                         *
 ******************************************************************/
-#define MAXID32 0xFFFFFFFD
-#define MAXID64 0x00FFFFFFFFFFFFFD
-inline bool ThreadPool::isValid(const ThreadPool::thread_id_t& id) const
+inline bool ThreadPool::isValid( const ThreadPool::thread_id_t &id ) const
 {
-    size_t local_id = id.getLocalID();
-    size_t next_id = d_id_assign-1;
-    bool is_valid = true;
-    if ( local_id==0 || !id.initialized() ) {
-        // Invalid id
-        is_valid = false;
-    } else if ( d_id_assign==0 ) {
-        // We ran out of thread ids
-        throw std::logic_error("id space exhausted");
-    } else if ( sizeof(size_t)==4 ) {
-        // If we are using a 32-bit id, the work id is valid if it is <= 2^32-3 and > the next id
-        if ( local_id>MAXID32 || local_id<=next_id )
-            is_valid = false;
-    } else if ( sizeof(size_t)==8 ) {
-        // If we are using a 64-bit id, the work id is valid if it is <= 2^56-3, > the next id, and has an odd number of bits
-        if ( local_id>MAXID64 || local_id<=next_id )
-            is_valid = false;
-    } else {
-        // This is not a valid size
-        throw std::logic_error("Error checking ids");
+    static_assert( sizeof( atomic_64 ) == 8, "atomic_64 must be a 64-bit integer" );
+    const uint64_t local = id.getLocalID();
+    const uint64_t next  = d_id_assign - 1; // ids must be strictly greater than this value
+    return !( local == 0 || !id.initialized() || local > thread_id_t::maxThreadID || local <= next );
+}
+
+
+/******************************************************************
+* Function to get the thread number                               *
+* (-1 if it is not a member thread)                               *
+******************************************************************/
+inline int ThreadPool::getThreadNumber() const
+{
+    const auto self = std::this_thread::get_id();
+    for ( int k = 0; k < d_N_threads; ++k ) {
+        if ( self == d_threadId[k] )
+            return k; // index of the calling thread among the pool's threads
     }
-    return is_valid;
+    return -1; // the calling thread is not a member of this pool
 }